diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d0faeafca76..5231cc7dd04 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -186,10 +186,8 @@ add_library(cugraph src/community/legacy/egonet.cu src/sampling/neighborhood.cu src/sampling/random_walks.cu - src/sampling/detail/gather_utils_impl.cu src/sampling/detail/sampling_utils_mg.cu src/sampling/detail/sampling_utils_sg.cu - src/sampling/nbr_sampling_mg.cu src/sampling/uniform_neighbor_sampling_mg.cpp src/sampling/uniform_neighbor_sampling_sg.cpp src/cores/legacy/core_number.cu diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 6664978a3a1..bbfffe09466 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1468,42 +1468,6 @@ void core_number(raft::handle_t const& handle, size_t k_last = std::numeric_limits::max(), bool do_expensive_check = false); -/** - * @brief Multi-GPU Uniform Neighborhood Sampling. - * @deprecated will be removed later in this release (22.06) - * - * @tparam graph_view_t Type of graph view. - * @tparam gpu_t Type of rank (GPU) indices; - * @tparam index_t Type used for indexing; typically edge_t - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Graph View object to generate NBR Sampling on. - * @param ptr_d_starting_vertices Device array of starting vertex IDs for the NBR Sampling. - * @param ptr_d_ranks Device array of: rank IDs (GPU IDs) for the NBR Sampling. - * @param num_starting_vertices size of starting vertex set - * @param h_fan_out vector of branching out (fan-out) degree per source vertex for each level - * parameter used for obtaining local out-degree information - * @param with_replacement boolean flag specifying if random sampling is done with replacement - * (true); or, without replacement (false); default = true; - * @return tuple of tuple of device vectors and counts: - * ((vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index), rx_counts) - */ -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& graph_view, - typename graph_view_t::vertex_type const* ptr_d_starting_vertices, - gpu_t const* ptr_d_ranks, - size_t num_starting_vertices, - std::vector const& h_fan_out, - bool with_replacement = true); - /** * @brief Uniform Neighborhood Sampling. * diff --git a/cpp/include/cugraph/detail/decompress_edge_partition.cuh b/cpp/include/cugraph/detail/decompress_edge_partition.cuh index afe841e31cb..520c4272bce 100644 --- a/cpp/include/cugraph/detail/decompress_edge_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_edge_partition.cuh @@ -255,7 +255,7 @@ __global__ void partially_decompress_to_edgelist_mid_degree( edge_partition.local_edges(major_partition_offset); auto major_offset = input_major_start_offsets[idx]; - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { output_majors[major_offset + i] = major; output_minors[major_offset + i] = indices[i]; @@ -290,7 +290,10 @@ void partially_decompress_edge_partition_to_fill_edgelist( vertex_t* minors, thrust::optional weights, thrust::optional> property, - thrust::optional> global_edge_index) + thrust::optional> global_edge_index, + // FIXME: Once PR 2356 is merged, this parameter could go away because + // major_hypersparse_first will be part of edge_partition + std::optional> local_edge_partition_segment_offsets) { auto execution_policy = handle.get_thrust_policy(); static_assert(detail::num_sparse_segments_per_vertex_partition == 3); @@ -408,6 +411,10 @@ void partially_decompress_edge_partition_to_fill_edgelist( ? thrust::make_optional(thrust::make_tuple( thrust::get<0>(*property) + segment_offsets[3], thrust::get<1>(*property))) : thrust::nullopt, + // FIXME: Once PR 2356 is merged, this parameter could go away because + // major_hypersparse_first will be part of edge_partition + segment_offsets_last = + (*local_edge_partition_segment_offsets)[detail::num_sparse_segments_per_vertex_partition], global_edge_index] __device__(auto idx) { auto major = input_majors[idx]; auto major_offset = input_major_start_offsets[idx]; @@ -416,7 +423,10 @@ void partially_decompress_edge_partition_to_fill_edgelist( vertex_t const* indices{nullptr}; thrust::optional weights{thrust::nullopt}; edge_t local_degree{}; - thrust::tie(indices, weights, local_degree) = edge_partition.local_edges(*major_idx); + // FIXME: Once PR 2356 is merged, this computation should be changed to use + // major_hypersparse_first which will be part of edge_partition + thrust::tie(indices, weights, local_degree) = + edge_partition.local_edges(segment_offsets_last + *major_idx); thrust::fill( thrust::seq, majors + major_offset, majors + major_offset + local_degree, major); thrust::copy(thrust::seq, indices, indices + local_degree, minors + major_offset); diff --git a/cpp/include/cugraph/detail/graph_functions.cuh b/cpp/include/cugraph/detail/graph_functions.cuh deleted file mode 100644 index 049c7ba2a05..00000000000 --- a/cpp/include/cugraph/detail/graph_functions.cuh +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace cugraph { -namespace detail { -namespace original { - -/** - * @brief Compute local out degrees of the majors belonging to the adjacency matrices - * stored on each gpu - * - * Iterate through partitions and store their local degrees - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @return A single vector containing the local out degrees of the majors belong to the adjacency - * matrices - */ -template -rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, GraphViewType const& graph_view); - -/** - * @brief Calculate global degree information for all vertices represented by current gpu - * - * Calculate local degree and perform row wise exclusive scan over all gpus in column - * communicator. - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @return Tuple of two device vectors. The first one contains per source edge-count encountered - * by gpus in the column communicator before current gpu. The second device vector contains the - * global out degree for every source represented by current gpu - */ -template -std::tuple, - rmm::device_uvector> -get_global_degree_information(raft::handle_t const& handle, GraphViewType const& graph_view); - -/** - * @brief Calculate global adjacency offset for all majors represented by current gpu - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param[in] global_degree_offsets Global degree offset to local adjacency list for every major - * represented by current gpu - * @param global_out_degrees Global out degrees for every source represented by current gpu - * @return Device vector containing the number of edges that are prior to the adjacency list of - * every major that can be represented by the current gpu - */ -template -rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_out_degrees); - -/** - * @brief Gather active majors and associated client gpu ids across gpus in a - * column communicator - * - * Collect all the vertex ids and client gpu ids to be processed by every gpu in - * the column communicator and call sort on the list. - * - * @tparam vertex_t Type of vertex indices. - * @tparam VertexIterator Type of the iterator for vertex identifiers. - * @tparam GPUIdIterator Type of the iterator for gpu id identifiers. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param vertex_input_first Iterator pointing to the first vertex id to be processed - * @param vertex_input_last Iterator pointing to the last (exclusive) vertex id to be processed - * @param gpu_id_first Iterator pointing to the first gpu id to be processed - * @return Device vector containing all the vertices that are to be processed by every gpu - * in the column communicator - */ -template -std::tuple, - rmm::device_uvector::value_type>> -gather_active_majors(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_input_first, - VertexIterator vertex_input_last, - GPUIdIterator gpu_id_first); - -/** - * @brief Return global out degrees of active majors - * - * Get partition information of all graph partitions on the gpu and select - * global degrees of all active majors - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param active_majors Device vector containing all the vertex id that are processed by - * gpus in the column communicator - * @param global_out_degrees Global out degrees for every source represented by current gpu - * @return Global out degrees of all majors in active_majors - */ -template -rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -/** - * @brief Return partition information of all vertex ids of all the partitions belonging to a gpu - * - * Iterate through partitions and store the starting vertex ids, exclusive scan of vertex counts, - * offsets and indices of the partitions csr structure - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @return Tuple of device vectors. The first vector contains all the partitions related to the - * gpu. The second and third vectors contain starting and ending vertex ids of all the partitions - * belonging to the gpu. The fourth vector contains the starting vertex id of the hypersparse - * region in each partition. The fifth vector denotes the vertex count offset (how many vertices - * are dealt with by the previous partitions. - */ -template -std::tuple>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, GraphViewType const& graph_view); - -/** - * @brief Gather valid edges present on the current gpu - * - * Collect all the edges that are present in the adjacency lists on the current gpu - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam gpu_t Type of gpu id identifiers. - * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, - * and handles to various CUDA libraries) to run graph algorithms. - * @param[in] graph_view Non-owning graph object. - * @param[in] active_majors Device vector containing all the vertex id that are processed by - * gpus in the column communicator - * @param[in] active_major_gpu_ids Device vector containing the gpu id associated by every vertex - * present in active_majors - * @param[in] minor_map Device vector of minor indices (modifiable in-place) corresponding to - * vertex IDs being returned - * @param[in] indices_per_major Number of indices supplied for every major in the range - * [vertex_input_first, vertex_input_last) - * @param[in] global_degree_offsets Global degree offset to local adjacency list for every major - * represented by current gpu - * @return A tuple of device vector containing the majors, minors, gpu_ids and indices gathered - * locally - */ -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - typename GraphViewType::edge_type indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -/** - * @brief Gather edge list for specified vertices - * - * Collect all the edges that are present in the adjacency lists on the current gpu - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam prop_t Type of the property associated with the majors. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param active_majors Device vector containing all the vertex id that are processed by - * gpus in the column communicator - * @param active_major_property Device vector containing the property values associated by every - * vertex present in active_majors - * @return A tuple of device vector containing the majors, minors and properties gathered locally - */ -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_property, - const rmm::device_uvector& global_adjacency_list_offsets); - -} // namespace original -} // namespace detail -} // namespace cugraph diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 16c1a9011d4..dbefac81742 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -122,9 +122,6 @@ typedef struct { * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] start Device array of start vertices for the sampling - * @param [in] start_label Device array of start labels. These labels will propagate to the - * results so that the result can be properly organized when the input needs to be sent back to - * different callers (different processes or different gpus). * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm * @param [in] with_replacement * Boolean value. If true selection of edges is done with @@ -136,37 +133,7 @@ typedef struct { * be populated if error code is not CUGRAPH_SUCCESS * @return error code */ -// FIXME: This older API will be phased out this release in favor of the experimental one below cugraph_error_code_t cugraph_uniform_neighbor_sample( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start, - const cugraph_type_erased_device_array_view_t* start_label, - const cugraph_type_erased_host_array_view_t* fan_out, - bool_t with_replacement, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error); - -/** - * @brief Uniform Neighborhood Sampling - * - * @param [in] handle Handle for accessing resources - * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage - * needs to be transposed - * @param [in] start Device array of start vertices for the sampling - * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm - * @param [in] with_replacement - * Boolean value. If true selection of edges is done with - * replacement. If false selection is done without replacement. - * @param [in] do_expensive_check - * A flag to run expensive checks for input arguments (if set to true) - * @param [in] result Output from the uniform_neighbor_sample call - * @param [out] error Pointer to an error object storing details of any error. Will - * be populated if error code is not CUGRAPH_SUCCESS - * @return error code - */ -cugraph_error_code_t cugraph_experimental_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start, diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 612284c93c8..ed458eaf1cd 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -32,10 +32,11 @@ namespace cugraph { namespace c_api { struct cugraph_sample_result_t { - bool experimental_{true}; cugraph_type_erased_device_array_t* src_{nullptr}; cugraph_type_erased_device_array_t* dst_{nullptr}; // FIXME: Will be deleted once experimental replaces current + // NOTE: Leaving in place while we discuss some future changes, although + // not currently used. cugraph_type_erased_device_array_t* label_{nullptr}; cugraph_type_erased_device_array_t* index_{nullptr}; // FIXME: Will be deleted once experimental replaces current @@ -53,7 +54,6 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct raft::handle_t const& handle_; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_{nullptr}; - cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_label_{nullptr}; cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; bool with_replacement_{false}; bool do_expensive_check_{false}; @@ -62,125 +62,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct uniform_neighbor_sampling_functor(cugraph_resource_handle_t const* handle, cugraph_graph_t* graph, cugraph_type_erased_device_array_view_t const* start, - cugraph_type_erased_device_array_view_t const* start_label, cugraph_type_erased_host_array_view_t const* fan_out, bool with_replacement, bool do_expensive_check) - : abstract_functor(), - handle_(*reinterpret_cast(handle)->handle_), - graph_(reinterpret_cast(graph)), - start_( - reinterpret_cast(start)), - start_label_(reinterpret_cast( - start_label)), - fan_out_( - reinterpret_cast(fan_out)), - with_replacement_(with_replacement), - do_expensive_check_(do_expensive_check) - { - } - - template - void operator()() - { - // FIXME: Think about how to handle SG vice MG - if constexpr (!cugraph::is_candidate::value) { - unsupported(); - } else if constexpr (!multi_gpu) { - unsupported(); - } else { - // uniform_nbr_sample expects store_transposed == false - if constexpr (store_transposed) { - error_code_ = cugraph::c_api:: - transpose_storage( - handle_, graph_, error_.get()); - if (error_code_ != CUGRAPH_SUCCESS) return; - } - - auto graph = - reinterpret_cast*>( - graph_->graph_); - - auto graph_view = graph->view(); - - auto number_map = reinterpret_cast*>(graph_->number_map_); - - rmm::device_uvector start(start_->size_, handle_.get_stream()); - raft::copy(start.data(), start_->as_type(), start.size(), handle_.get_stream()); - - // - // Need to renumber sources - // - cugraph::renumber_ext_vertices( - handle_, - start.data(), - start.size(), - number_map->data(), - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_last(), - false); - - // C++ API wants an std::vector - std::vector fan_out(fan_out_->size_); - std::copy_n(fan_out_->as_type(), fan_out_->size_, fan_out.data()); - - auto&& [tmp_tuple, counts] = cugraph::uniform_nbr_sample(handle_, - graph_view, - start.data(), - start_label_->as_type(), - start.size(), - fan_out, - with_replacement_); - - auto&& [srcs, dsts, labels, indices] = tmp_tuple; - - std::vector vertex_partition_lasts = graph_view.vertex_partition_range_lasts(); - - cugraph::unrenumber_int_vertices(handle_, - srcs.data(), - srcs.size(), - number_map->data(), - vertex_partition_lasts, - do_expensive_check_); - - cugraph::unrenumber_int_vertices(handle_, - dsts.data(), - dsts.size(), - number_map->data(), - vertex_partition_lasts, - do_expensive_check_); - - result_ = new cugraph::c_api::cugraph_sample_result_t{ - false, - new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_), - new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_), - new cugraph::c_api::cugraph_type_erased_device_array_t(labels, start_label_->type_), - new cugraph::c_api::cugraph_type_erased_device_array_t(indices, graph_->edge_type_), - new cugraph::c_api::cugraph_type_erased_host_array_t(counts, graph_->vertex_type_)}; - } - } -}; - -struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor { - raft::handle_t const& handle_; - cugraph::c_api::cugraph_graph_t* graph_{nullptr}; - cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_{nullptr}; - cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; - bool with_replacement_{false}; - bool do_expensive_check_{false}; - cugraph::c_api::cugraph_sample_result_t* result_{nullptr}; - - experimental_uniform_neighbor_sampling_functor( - cugraph_resource_handle_t const* handle, - cugraph_graph_t* graph, - cugraph_type_erased_device_array_view_t const* start, - cugraph_type_erased_host_array_view_t const* fan_out, - bool with_replacement, - bool do_expensive_check) : abstract_functor(), handle_(*reinterpret_cast(handle)->handle_), graph_(reinterpret_cast(graph)), @@ -259,7 +143,6 @@ struct experimental_uniform_neighbor_sampling_functor : public cugraph::c_api::a do_expensive_check_); result_ = new cugraph::c_api::cugraph_sample_result_t{ - true, new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_), new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_), nullptr, @@ -276,7 +159,6 @@ extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start, - const cugraph_type_erased_device_array_view_t* start_labels, const cugraph_type_erased_host_array_view_t* fan_out, bool_t with_replacement, bool_t do_expensive_check, @@ -284,21 +166,6 @@ extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample( cugraph_error_t** error) { uniform_neighbor_sampling_functor functor{ - handle, graph, start, start_labels, fan_out, with_replacement, do_expensive_check}; - return cugraph::c_api::run_algorithm(graph, functor, result, error); -} - -extern "C" cugraph_error_code_t cugraph_experimental_uniform_neighbor_sample( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start, - const cugraph_type_erased_host_array_view_t* fan_out, - bool_t with_replacement, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error) -{ - experimental_uniform_neighbor_sampling_functor functor{ handle, graph, start, fan_out, with_replacement, do_expensive_check}; return cugraph::c_api::run_algorithm(graph, functor, result, error); } diff --git a/cpp/src/sampling/detail/gather_utils_impl.cu b/cpp/src/sampling/detail/gather_utils_impl.cu deleted file mode 100644 index 3c8a7e2d16b..00000000000 --- a/cpp/src/sampling/detail/gather_utils_impl.cu +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -namespace cugraph { -namespace detail { -namespace original { - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, rmm::device_uvector> -get_global_degree_information( - raft::handle_t const& handle, - graph_view_t const& graph_view); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int32_t const* vertex_input_first, - int32_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int32_t const* vertex_input_first, - int32_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int32_t const* vertex_input_first, - int32_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int32_t const* vertex_input_first, - int32_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int64_t const* vertex_input_first, - int64_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template std::tuple, rmm::device_uvector> -gather_active_majors(raft::handle_t const& handle, - graph_view_t const& graph_view, - int64_t const* vertex_input_first, - int64_t const* vertex_input_last, - int32_t const* gpu_id_first); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple< - rmm::device_uvector>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, - graph_view_t const& graph_view); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist(raft::handle_t const& handle, - graph_view_t const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - const rmm::device_uvector& global_adjacency_list_offsets); - -} // namespace original -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/sampling/detail/gather_utils_impl.cuh b/cpp/src/sampling/detail/gather_utils_impl.cuh deleted file mode 100644 index fe8f04adcbb..00000000000 --- a/cpp/src/sampling/detail/gather_utils_impl.cuh +++ /dev/null @@ -1,775 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace cugraph { -namespace detail { -namespace original { - -template -rmm::device_uvector compute_local_major_degrees( - raft::handle_t const& handle, GraphViewType const& graph_view) -{ - static_assert(GraphViewType::is_storage_transposed == false); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - rmm::device_uvector local_degrees(GraphViewType::is_storage_transposed - ? graph_view.local_edge_partition_dst_range_size() - : graph_view.local_edge_partition_src_range_size(), - handle.get_stream()); - - // FIXME optimize for communication - // local_edge_partition_src_range_size == summation of major_range_size() of all partitions - // belonging to the gpu - vertex_t partial_offset{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - - // Check if hypersparse segment is present in the partition - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto use_dcs = segment_offsets - ? ((*segment_offsets).size() > (num_sparse_segments_per_vertex_partition + 1)) - : false; - - if (use_dcs) { - auto major_hypersparse_first = edge_partition.major_range_first() + - (*segment_offsets)[num_sparse_segments_per_vertex_partition]; - // Calculate degrees in sparse region - auto sparse_begin = local_degrees.begin() + partial_offset; - auto sparse_end = local_degrees.begin() + partial_offset + - (major_hypersparse_first - edge_partition.major_range_first()); - ; - - thrust::tabulate(handle.get_thrust_policy(), - sparse_begin, - sparse_end, - [offsets = edge_partition.offsets()] __device__(auto i) { - return offsets[i + 1] - offsets[i]; - }); - - // Calculate degrees in hypersparse region - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - // Initialize hypersparse region degrees as 0 - thrust::fill(handle.get_thrust_policy(), - sparse_end, - sparse_begin + edge_partition.major_range_size(), - edge_t{0}); - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(dcs_nzd_vertex_count), - [major_hypersparse_first, - major_range_first = edge_partition.major_range_first(), - vertex_ids = *(edge_partition.dcs_nzd_vertices()), - offsets = edge_partition.offsets(), - local_degrees = thrust::raw_pointer_cast(sparse_begin)] __device__(auto i) { - auto d = offsets[(major_hypersparse_first - major_range_first) + i + 1] - - offsets[(major_hypersparse_first - major_range_first) + i]; - auto v = vertex_ids[i]; - local_degrees[v - major_range_first] = d; - }); - } else { - auto sparse_begin = local_degrees.begin() + partial_offset; - auto sparse_end = local_degrees.begin() + partial_offset + edge_partition.major_range_size(); - thrust::tabulate(handle.get_thrust_policy(), - sparse_begin, - sparse_end, - [offsets = edge_partition.offsets()] __device__(auto i) { - return offsets[i + 1] - offsets[i]; - }); - } - partial_offset += edge_partition.major_range_size(); - } - return local_degrees; -} - -template -rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_out_degrees) -{ - static_assert(GraphViewType::is_multi_gpu == true); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - // auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_size = col_comm.get_size(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_rank = row_comm.get_rank(); - auto const row_size = row_comm.get_size(); - - rmm::device_uvector global_adjacency_list_offsets(global_degree_offsets.size(), - handle.get_stream()); - - edge_t edge_count_in_all_previous_partitions{0}; - vertex_t vertex_offset{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_counts = - cugraph::host_scalar_allgather(comm, edge_partition.number_of_edges(), handle.get_stream()); - edge_t partial_edge_count{0}; - for (int r = 0; r < row_rank; ++r) { - for (int c = 0; c < col_size; ++c) { - partial_edge_count += edge_counts[r + c * row_size]; - } - } - thrust::exclusive_scan( - handle.get_thrust_policy(), - global_out_degrees.cbegin() + vertex_offset, - global_out_degrees.cbegin() + vertex_offset + edge_partition.major_range_size(), - global_adjacency_list_offsets.begin() + vertex_offset); - - thrust::transform( - handle.get_thrust_policy(), - global_adjacency_list_offsets.cbegin() + vertex_offset, - global_adjacency_list_offsets.cbegin() + vertex_offset + edge_partition.major_range_size(), - global_degree_offsets.cbegin() + vertex_offset, - global_adjacency_list_offsets.begin() + vertex_offset, - [offset = edge_count_in_all_previous_partitions + partial_edge_count] __device__( - auto val0, auto val1) { return val0 + val1 + offset; }); - - edge_count_in_all_previous_partitions += - std::accumulate(edge_counts.begin(), edge_counts.end(), edge_t{0}); - vertex_offset += edge_partition.major_range_size(); - } - return global_adjacency_list_offsets; -} - -template -std::tuple, - rmm::device_uvector> -get_global_degree_information(raft::handle_t const& handle, GraphViewType const& graph_view) -{ - static_assert(GraphViewType::is_multi_gpu == true); - using edge_t = typename GraphViewType::edge_type; - auto local_degrees = compute_local_major_degrees(handle, graph_view); - - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_size = col_comm.get_size(); - auto const col_rank = col_comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_size = row_comm.get_size(); - - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - - rmm::device_uvector temp_input(local_degrees.size(), handle.get_stream()); - raft::update_device( - temp_input.data(), local_degrees.data(), local_degrees.size(), handle.get_stream()); - - rmm::device_uvector recv_data(local_degrees.size(), handle.get_stream()); - if (col_rank == 0) { - thrust::fill(handle.get_thrust_policy(), recv_data.begin(), recv_data.end(), edge_t{0}); - } - for (int i = 0; i < col_size - 1; ++i) { - if (col_rank == i) { - comm.device_send( - temp_input.begin(), temp_input.size(), comm_rank + row_size, handle.get_stream()); - } - if (col_rank == i + 1) { - comm.device_recv( - recv_data.begin(), recv_data.size(), comm_rank - row_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), - temp_input.begin(), - temp_input.end(), - recv_data.begin(), - temp_input.begin(), - thrust::plus()); - } - col_comm.barrier(); - } - // Get global degrees - device_bcast(col_comm, - temp_input.begin(), - temp_input.begin(), - temp_input.size(), - col_size - 1, - handle.get_stream()); - - return std::make_tuple(std::move(recv_data), std::move(temp_input)); -} - -template -std::tuple, - rmm::device_uvector::value_type>> -gather_active_majors(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_input_first, - VertexIterator vertex_input_last, - GPUIdIterator gpu_id_first) -{ - static_assert(GraphViewType::is_multi_gpu == true); - static_assert(GraphViewType::is_storage_transposed == false); - using gpu_t = typename std::iterator_traits::value_type; - using vertex_t = typename GraphViewType::vertex_type; - - auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - size_t source_count = thrust::distance(vertex_input_first, vertex_input_last); - auto external_source_counts = - cugraph::host_scalar_allgather(col_comm, source_count, handle.get_stream()); - auto total_external_source_count = - std::accumulate(external_source_counts.begin(), external_source_counts.end(), size_t{0}); - std::vector displacements(external_source_counts.size(), size_t{0}); - std::exclusive_scan( - external_source_counts.begin(), external_source_counts.end(), displacements.begin(), size_t{0}); - - rmm::device_uvector active_majors(total_external_source_count, handle.get_stream()); - rmm::device_uvector active_major_gpu_ids(total_external_source_count, handle.get_stream()); - // Get the sources other gpus on the same row are working on - // FIXME : replace with device_bcast for better scaling - device_allgatherv(col_comm, - vertex_input_first, - active_majors.data(), - external_source_counts, - displacements, - handle.get_stream()); - device_allgatherv(col_comm, - gpu_id_first, - active_major_gpu_ids.data(), - external_source_counts, - displacements, - handle.get_stream()); - thrust::sort_by_key(handle.get_thrust_policy(), - active_majors.begin(), - active_majors.end(), - active_major_gpu_ids.begin()); - return std::make_tuple(std::move(active_majors), std::move(active_major_gpu_ids)); -} - -template -rmm::device_uvector get_active_major_global_degrees( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& global_out_degrees) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using partition_t = edge_partition_device_view_t; - rmm::device_uvector active_major_degrees(active_majors.size(), handle.get_stream()); - - std::vector id_begin; - std::vector id_end; - std::vector count_offsets; - id_begin.reserve(graph_view.number_of_local_edge_partitions()); - id_end.reserve(graph_view.number_of_local_edge_partitions()); - count_offsets.reserve(graph_view.number_of_local_edge_partitions()); - vertex_t counter{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = partition_t(graph_view.local_edge_partition_view(i)); - // Starting vertex ids of each partition - id_begin.push_back(edge_partition.major_range_first()); - id_end.push_back(edge_partition.major_range_last()); - count_offsets.push_back(counter); - counter += edge_partition.major_range_size(); - } - rmm::device_uvector vertex_id_begin(id_begin.size(), handle.get_stream()); - rmm::device_uvector vertex_id_end(id_end.size(), handle.get_stream()); - rmm::device_uvector vertex_count_offsets(count_offsets.size(), handle.get_stream()); - raft::update_device( - vertex_id_begin.data(), id_begin.data(), id_begin.size(), handle.get_stream()); - raft::update_device(vertex_id_end.data(), id_end.data(), id_end.size(), handle.get_stream()); - raft::update_device( - vertex_count_offsets.data(), count_offsets.data(), count_offsets.size(), handle.get_stream()); - - thrust::transform(handle.get_thrust_policy(), - active_majors.begin(), - active_majors.end(), - active_major_degrees.begin(), - [id_begin = vertex_id_begin.data(), - id_end = vertex_id_end.data(), - global_out_degrees = global_out_degrees.data(), - vertex_count_offsets = vertex_count_offsets.data(), - count = vertex_id_end.size()] __device__(auto v) { - // Find which partition id did the vertex belong to - auto partition_id = thrust::distance( - id_end, thrust::upper_bound(thrust::seq, id_end, id_end + count, v)); - // starting position of the segment within global_degree_offset - // where the information for partition (partition_id) starts - // vertex_count_offsets[partition_id] - // The relative location of offset information for vertex id v within - // the segment - // v - id_end[partition_id] - auto location_in_segment = v - id_begin[partition_id]; - // read location of global_degree_offset needs to take into account the - // partition offsets because it is a concatenation of all the offsets - // across all partitions - auto location = location_in_segment + vertex_count_offsets[partition_id]; - return global_out_degrees[location]; - }); - return active_major_degrees; -} - -template -std::tuple>, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -partition_information(raft::handle_t const& handle, GraphViewType const& graph_view) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using partition_t = edge_partition_device_view_t; - - std::vector partitions; - std::vector id_begin; - std::vector id_end; - std::vector hypersparse_begin; - std::vector vertex_count_offsets; - - partitions.reserve(graph_view.number_of_local_edge_partitions()); - id_begin.reserve(graph_view.number_of_local_edge_partitions()); - id_end.reserve(graph_view.number_of_local_edge_partitions()); - hypersparse_begin.reserve(graph_view.number_of_local_edge_partitions()); - vertex_count_offsets.reserve(graph_view.number_of_local_edge_partitions()); - - vertex_t counter{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - partitions.emplace_back(graph_view.local_edge_partition_view(i)); - auto& edge_partition = partitions.back(); - - // Starting vertex ids of each partition - id_begin.push_back(edge_partition.major_range_first()); - id_end.push_back(edge_partition.major_range_last()); - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto use_dcs = segment_offsets - ? ((*segment_offsets).size() > (num_sparse_segments_per_vertex_partition + 1)) - : false; - if (use_dcs) { - auto major_hypersparse_first = edge_partition.major_range_first() + - (*segment_offsets)[num_sparse_segments_per_vertex_partition]; - hypersparse_begin.push_back(major_hypersparse_first); - } else { - hypersparse_begin.push_back(edge_partition.major_range_last()); - } - - // Count of relative position of the vertices - vertex_count_offsets.push_back(counter); - - counter += edge_partition.major_range_size(); - } - - // Allocate device memory for transfer - rmm::device_uvector edge_partitions(graph_view.number_of_local_edge_partitions(), - handle.get_stream()); - - rmm::device_uvector major_begin(id_begin.size(), handle.get_stream()); - rmm::device_uvector minor_end(id_end.size(), handle.get_stream()); - rmm::device_uvector hs_begin(hypersparse_begin.size(), handle.get_stream()); - rmm::device_uvector vc_offsets(vertex_count_offsets.size(), handle.get_stream()); - - // Transfer data - raft::update_device( - edge_partitions.data(), partitions.data(), partitions.size(), handle.get_stream()); - raft::update_device(major_begin.data(), id_begin.data(), id_begin.size(), handle.get_stream()); - raft::update_device(minor_end.data(), id_end.data(), id_end.size(), handle.get_stream()); - raft::update_device(vc_offsets.data(), - vertex_count_offsets.data(), - vertex_count_offsets.size(), - handle.get_stream()); - raft::update_device( - hs_begin.data(), hypersparse_begin.data(), hypersparse_begin.size(), handle.get_stream()); - - return std::make_tuple(std::move(edge_partitions), - std::move(major_begin), - std::move(minor_end), - std::move(hs_begin), - std::move(vc_offsets)); -} - -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_local_edges( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_gpu_ids, - rmm::device_uvector&& minor_map, - typename GraphViewType::edge_type indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets) -{ - static_assert(GraphViewType::is_multi_gpu == true); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - auto edge_count = active_majors.size() * indices_per_major; - rmm::device_uvector majors(edge_count, handle.get_stream()); - rmm::device_uvector minors(edge_count, handle.get_stream()); - rmm::device_uvector minor_gpu_ids(edge_count, handle.get_stream()); - vertex_t invalid_vertex_id = graph_view.number_of_vertices(); - - auto [partitions, id_begin, id_end, hypersparse_begin, vertex_count_offsets] = - partition_information(handle, graph_view); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(edge_count), - [edge_index_first = minor_map.begin(), - active_majors = active_majors.data(), - active_major_gpu_ids = active_major_gpu_ids.data(), - id_begin = id_begin.data(), - id_end = id_end.data(), - id_seg_count = id_begin.size(), - vertex_count_offsets = vertex_count_offsets.data(), - glbl_degree_offsets = global_degree_offsets.data(), - glbl_adj_list_offsets = global_adjacency_list_offsets.data(), - majors = majors.data(), - minors = minors.data(), - dst_gpu_ids = minor_gpu_ids.data(), - partitions = partitions.data(), - hypersparse_begin = hypersparse_begin.data(), - invalid_vertex_id, - indices_per_major] __device__(auto index) { - // major which this edge index refers to - auto loc = index / indices_per_major; - auto major = active_majors[loc]; - majors[index] = major; - dst_gpu_ids[index] = active_major_gpu_ids[loc]; - - // Find which partition id did the major belong to - auto partition_id = thrust::distance( - id_end, thrust::upper_bound(thrust::seq, id_end, id_end + id_seg_count, major)); - // starting position of the segment within global_degree_offset - // where the information for partition (partition_id) starts - // vertex_count_offsets[partition_id] - // The relative location of offset information for vertex id v within - // the segment - // v - seg[partition_id] - vertex_t location_in_segment; - if (major < hypersparse_begin[partition_id]) { - location_in_segment = major - id_begin[partition_id]; - } else { - auto row_hypersparse_idx = - partitions[partition_id].major_hypersparse_idx_from_major_nocheck(major); - if (row_hypersparse_idx) { - location_in_segment = *(row_hypersparse_idx)-id_begin[partition_id]; - } else { - minors[index] = invalid_vertex_id; - return; - } - } - - // csr offset value for vertex v that belongs to partition (partition_id) - auto offset_ptr = partitions[partition_id].offsets(); - auto sparse_offset = offset_ptr[location_in_segment]; - auto local_out_degree = offset_ptr[location_in_segment + 1] - sparse_offset; - vertex_t const* adjacency_list = partitions[partition_id].indices() + sparse_offset; - // read location of global_degree_offset needs to take into account the - // partition offsets because it is a concatenation of all the offsets - // across all partitions - auto location = location_in_segment + vertex_count_offsets[partition_id]; - auto g_degree_offset = glbl_degree_offsets[location]; - auto g_dst_index = edge_index_first[index]; - if ((g_dst_index >= g_degree_offset) && (g_dst_index < g_degree_offset + local_out_degree)) { - minors[index] = adjacency_list[g_dst_index - g_degree_offset]; - edge_index_first[index] = g_dst_index - g_degree_offset + glbl_adj_list_offsets[location]; - } else { - minors[index] = invalid_vertex_id; - } - }); - auto input_iter = thrust::make_zip_iterator( - thrust::make_tuple(majors.begin(), minors.begin(), minor_gpu_ids.begin(), minor_map.begin())); - - auto compacted_length = thrust::distance( - input_iter, - thrust::remove_if( - handle.get_thrust_policy(), - input_iter, - input_iter + minors.size(), - minors.begin(), - [invalid_vertex_id] __device__(auto dst) { return (dst == invalid_vertex_id); })); - majors.resize(compacted_length, handle.get_stream()); - minors.resize(compacted_length, handle.get_stream()); - minor_gpu_ids.resize(compacted_length, handle.get_stream()); - minor_map.resize(compacted_length, handle.get_stream()); - return std::make_tuple( - std::move(majors), std::move(minors), std::move(minor_gpu_ids), std::move(minor_map)); -} - -template -typename GraphViewType::edge_type edgelist_count(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_input_first, - VertexIterator vertex_input_last) -{ - using edge_t = typename GraphViewType::edge_type; - // Expect that vertex input list is sorted - auto [partitions, id_begin, id_end, hypersparse_begin, vertex_count_offsets] = - partition_information(handle, graph_view); - return thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_input_first, - vertex_input_last, - [partitions = partitions.data(), - id_begin = id_begin.data(), - id_end = id_end.data(), - id_seg_count = id_begin.size(), - hypersparse_begin = hypersparse_begin.data(), - vertex_count_offsets = vertex_count_offsets.data()] __device__(auto major) { - // Find which partition id did the vertex belong to - auto partition_id = thrust::distance( - id_end, thrust::upper_bound(thrust::seq, id_end, id_end + id_seg_count, major)); - auto edge_partition = partitions[partition_id]; - auto major_hypersparse_first = hypersparse_begin[partition_id]; - if (major < major_hypersparse_first) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - return edge_partition.local_degree(major_offset); - } else { - auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); - return major_hypersparse_idx - ? edge_partition.local_degree( - edge_partition.major_offset_from_major_nocheck(major_hypersparse_first) + - *major_hypersparse_idx) - : edge_t{0}; - } - }, - edge_t{0}, - thrust::plus()); -} - -template -std::vector get_active_major_segments(raft::handle_t const& handle, - vertex_t major_range_first, - vertex_t major_range_last, - std::vector const& partition_segments, - const rmm::device_uvector& active_majors) -{ - std::vector segments(partition_segments.size()); - std::transform(partition_segments.begin(), - partition_segments.end(), - segments.begin(), - [major_range_first](auto s) { return s + major_range_first; }); - segments.push_back(major_range_last); - - rmm::device_uvector p_segments(segments.size(), handle.get_stream()); - raft::update_device(p_segments.data(), segments.data(), segments.size(), handle.get_stream()); - rmm::device_uvector majors_segments(segments.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - active_majors.cbegin(), - active_majors.cend(), - p_segments.begin(), - p_segments.end(), - majors_segments.begin()); - std::vector active_majors_segments(majors_segments.size()); - raft::update_host(active_majors_segments.data(), - majors_segments.data(), - majors_segments.size(), - handle.get_stream()); - return active_majors_segments; -} - -template -void local_major_degree( - raft::handle_t const& handle, - edge_partition_device_view_t partition, - rmm::device_uvector const& active_majors, - std::vector const& majors_segments, - std::vector const& partition_segments, - edge_t* out_degrees) -{ - auto active_major_count = majors_segments.back() - majors_segments.front(); - // Sparse region - if (majors_segments[3] - majors_segments[0] > 0) { - thrust::transform(handle.get_thrust_policy(), - active_majors.cbegin() + majors_segments[0], - active_majors.cbegin() + majors_segments[3], - out_degrees, - [partition] __device__(auto major) { - auto major_offset = partition.major_offset_from_major_nocheck(major); - return partition.local_degree(major_offset); - }); - } - // Hypersparse region - if (majors_segments[4] - majors_segments[3] > 0) { - auto major_hypersparse_first = - partition.major_range_first() + - partition_segments[detail::num_sparse_segments_per_vertex_partition]; - auto major_offset = - static_cast(major_hypersparse_first - partition.major_range_first()); - thrust::transform(handle.get_thrust_policy(), - active_majors.cbegin() + majors_segments[3], - active_majors.cbegin() + majors_segments[4], - out_degrees + majors_segments[3] - majors_segments[0], - [partition, major_offset] __device__(auto major) { - auto major_idx = partition.major_hypersparse_idx_from_major_nocheck(major); - if (major_idx) { - return partition.local_degree(major_offset + *major_idx); - } else { - return edge_t{0}; - } - }); - } -} - -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector> -gather_one_hop_edgelist( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& active_majors, - const rmm::device_uvector& active_major_property, - const rmm::device_uvector& global_adjacency_list_offsets) -{ - // Assumes active_majors is sorted - - static_assert(GraphViewType::is_multi_gpu == true); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - std::vector> active_majors_segments; - vertex_t max_active_major_count{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - // Identify segments of active_majors - active_majors_segments.emplace_back( - get_active_major_segments(handle, - partition.major_range_first(), - partition.major_range_last(), - *(graph_view.local_edge_partition_segment_offsets(i)), - active_majors)); - auto& majors_segments = active_majors_segments.back(); - // Count of active majors belonging to this partition - max_active_major_count = - std::max(max_active_major_count, majors_segments.back() - majors_segments.front()); - } - - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - rmm::device_uvector active_majors_out_offsets(1 + max_active_major_count, - handle.get_stream()); - auto edge_count = edgelist_count(handle, graph_view, active_majors.begin(), active_majors.end()); - rmm::device_uvector majors(edge_count, handle.get_stream()); - rmm::device_uvector minors(edge_count, handle.get_stream()); - rmm::device_uvector minor_prop_ids(edge_count, handle.get_stream()); - rmm::device_uvector minor_map(edge_count, handle.get_stream()); - - edge_t output_offset = 0; - vertex_t vertex_offset{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto& majors_segments = active_majors_segments[i]; - // Calculate local degree offsets - auto active_major_count = majors_segments.back() - majors_segments.front(); - active_majors_out_offsets.set_element_to_zero_async(0, handle.get_stream()); - local_major_degree(handle, - partition, - active_majors, - majors_segments, - *(graph_view.local_edge_partition_segment_offsets(i)), - 1 + active_majors_out_offsets.data()); - thrust::inclusive_scan(handle.get_thrust_policy(), - active_majors_out_offsets.begin() + 1, - active_majors_out_offsets.begin() + 1 + active_major_count, - active_majors_out_offsets.begin() + 1); - active_majors_out_offsets.resize(1 + active_major_count, handle.get_stream()); - partially_decompress_edge_partition_to_fill_edgelist( - handle, - partition, - active_majors.cbegin(), - active_majors_out_offsets.cbegin(), - majors_segments, - output_offset + majors.data(), - output_offset + minors.data(), - thrust::nullopt, - thrust::make_optional( - thrust::make_tuple(active_major_property.cbegin(), output_offset + minor_prop_ids.data())), - thrust::make_optional( - thrust::make_tuple(global_adjacency_list_offsets.cbegin() + vertex_offset, - output_offset + minor_map.begin()))); - output_offset += active_majors_out_offsets.back_element(handle.get_stream()); - vertex_offset += partition.major_range_size(); - } - - return std::make_tuple( - std::move(majors), std::move(minors), std::move(minor_prop_ids), std::move(minor_map)); -} - -} // namespace original -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/sampling/detail/graph_functions.hpp b/cpp/src/sampling/detail/graph_functions.hpp index f0b1580b88e..8eef9c83d61 100644 --- a/cpp/src/sampling/detail/graph_functions.hpp +++ b/cpp/src/sampling/detail/graph_functions.hpp @@ -150,8 +150,7 @@ gather_local_edges( const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, typename GraphViewType::edge_type indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); /** * @brief Gather edge list for specified vertices diff --git a/cpp/src/sampling/detail/sampling_utils_impl.cuh b/cpp/src/sampling/detail/sampling_utils_impl.cuh index 478f75095c1..793df64a8d6 100644 --- a/cpp/src/sampling/detail/sampling_utils_impl.cuh +++ b/cpp/src/sampling/detail/sampling_utils_impl.cuh @@ -135,73 +135,6 @@ rmm::device_uvector compute_local_major_degre return local_degrees; } -template -rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - GraphViewType const& graph_view, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_out_degrees) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - rmm::device_uvector global_adjacency_list_offsets(global_degree_offsets.size(), - handle.get_stream()); - - if constexpr (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_size = col_comm.get_size(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_rank = row_comm.get_rank(); - auto const row_size = row_comm.get_size(); - - edge_t edge_count_in_all_previous_partitions{0}; - vertex_t vertex_offset{0}; - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_counts = - cugraph::host_scalar_allgather(comm, edge_partition.number_of_edges(), handle.get_stream()); - edge_t partial_edge_count{0}; - for (int r = 0; r < row_rank; ++r) { - for (int c = 0; c < col_size; ++c) { - partial_edge_count += edge_counts[r + c * row_size]; - } - } - thrust::exclusive_scan( - handle.get_thrust_policy(), - global_out_degrees.cbegin() + vertex_offset, - global_out_degrees.cbegin() + vertex_offset + edge_partition.major_range_size(), - global_adjacency_list_offsets.begin() + vertex_offset); - - thrust::transform( - handle.get_thrust_policy(), - global_adjacency_list_offsets.cbegin() + vertex_offset, - global_adjacency_list_offsets.cbegin() + vertex_offset + edge_partition.major_range_size(), - global_degree_offsets.cbegin() + vertex_offset, - global_adjacency_list_offsets.begin() + vertex_offset, - [offset = edge_count_in_all_previous_partitions + partial_edge_count] __device__( - auto val0, auto val1) { return val0 + val1 + offset; }); - - edge_count_in_all_previous_partitions += - std::accumulate(edge_counts.begin(), edge_counts.end(), edge_t{0}); - vertex_offset += edge_partition.major_range_size(); - } - } else { - thrust::fill(handle.get_thrust_policy(), - global_adjacency_list_offsets.begin(), - global_adjacency_list_offsets.end(), - edge_t{0}); - } - - return global_adjacency_list_offsets; -} - template std::tuple, rmm::device_uvector> @@ -268,8 +201,10 @@ rmm::device_uvector allgather_active_majors(raft::handle_t const& hand { auto const& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); size_t source_count = d_in.size(); + auto external_source_counts = cugraph::host_scalar_allgather(col_comm, source_count, handle.get_stream()); + auto total_external_source_count = std::accumulate(external_source_counts.begin(), external_source_counts.end(), size_t{0}); std::vector displacements(external_source_counts.size(), size_t{0}); @@ -453,8 +388,7 @@ gather_local_edges( const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, typename GraphViewType::edge_type indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets) + const rmm::device_uvector& global_degree_offsets) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -479,19 +413,18 @@ gather_local_edges( handle.get_thrust_policy(), thrust::make_counting_iterator(0), thrust::make_counting_iterator(edge_count), - [edge_index_first = minor_map.begin(), - active_majors = active_majors.data(), - id_begin = id_begin.data(), - id_end = id_end.data(), - id_seg_count = id_begin.size(), - vertex_count_offsets = vertex_count_offsets.data(), - glbl_degree_offsets = global_degree_offsets.data(), - glbl_adj_list_offsets = global_adjacency_list_offsets.data(), - majors = majors.data(), - minors = minors.data(), - weights = weights ? weights->data() : nullptr, - partitions = partitions.data(), - hypersparse_begin = hypersparse_begin.data(), + [edge_index_first = minor_map.begin(), + active_majors = active_majors.data(), + id_begin = id_begin.data(), + id_end = id_end.data(), + id_seg_count = id_begin.size(), + vertex_count_offsets = vertex_count_offsets.data(), + glbl_degree_offsets = global_degree_offsets.data(), + majors = majors.data(), + minors = minors.data(), + weights = weights ? weights->data() : nullptr, + partitions = partitions.data(), + hypersparse_begin = hypersparse_begin.data(), invalid_vertex_id, indices_per_major] __device__(auto index) { // major which this edge index refers to @@ -510,7 +443,6 @@ gather_local_edges( if (major < hypersparse_begin[partition_id]) { location_in_segment = major - id_begin[partition_id]; local_out_degree = offset_ptr[location_in_segment + 1] - offset_ptr[location_in_segment]; - ; } else { auto row_hypersparse_idx = partitions[partition_id].major_hypersparse_idx_from_major_nocheck(major); @@ -520,7 +452,6 @@ gather_local_edges( (hypersparse_begin[partition_id] - id_begin[partition_id]) + *row_hypersparse_idx; local_out_degree = offset_ptr[location_in_segment + 1] - offset_ptr[location_in_segment]; - ; } } @@ -855,7 +786,10 @@ gather_one_hop_edgelist( output_offset + minors.data(), weights ? thrust::make_optional(output_offset + weights->data()) : thrust::nullopt, thrust::nullopt, - thrust::nullopt); + thrust::nullopt, + // FIXME: When PR 2365 is merged, this parameter can be removed + graph_view.local_edge_partition_segment_offsets(i)); + output_offset += active_majors_out_offsets.back_element(handle.get_stream()); vertex_offset += partition.major_range_size(); } @@ -911,7 +845,9 @@ gather_one_hop_edgelist( minors.data(), weights ? thrust::make_optional(weights->data()) : thrust::nullopt, thrust::nullopt, - thrust::nullopt); + thrust::nullopt, + // FIXME: When PR 2365 is merged, this parameter can be removed + std::nullopt); } return std::make_tuple(std::move(majors), std::move(minors), std::move(weights)); diff --git a/cpp/src/sampling/detail/sampling_utils_mg.cu b/cpp/src/sampling/detail/sampling_utils_mg.cu index ffcead02cf9..726309e5370 100644 --- a/cpp/src/sampling/detail/sampling_utils_mg.cu +++ b/cpp/src/sampling/detail/sampling_utils_mg.cu @@ -46,42 +46,6 @@ get_global_degree_information( raft::handle_t const& handle, graph_view_t const& graph_view); -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - template rmm::device_uvector allgather_active_majors(raft::handle_t const& handle, rmm::device_uvector&& d_in); @@ -186,8 +150,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -197,8 +160,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -208,8 +170,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -219,8 +180,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -230,8 +190,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -241,8 +200,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, diff --git a/cpp/src/sampling/detail/sampling_utils_sg.cu b/cpp/src/sampling/detail/sampling_utils_sg.cu index 52f2f9245b9..ae2980e5f10 100644 --- a/cpp/src/sampling/detail/sampling_utils_sg.cu +++ b/cpp/src/sampling/detail/sampling_utils_sg.cu @@ -49,42 +49,6 @@ get_global_degree_information( raft::handle_t const& handle, graph_view_t const& graph_view); -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - -template rmm::device_uvector get_global_adjacency_offset( - raft::handle_t const& handle, - graph_view_t const& graph_view, - rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_out_degrees); - template rmm::device_uvector get_active_major_global_degrees( raft::handle_t const& handle, graph_view_t const& graph_view, @@ -129,8 +93,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -140,8 +103,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -151,8 +113,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -162,8 +123,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int32_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -173,8 +133,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, @@ -184,8 +143,7 @@ gather_local_edges(raft::handle_t const& handle, const rmm::device_uvector& active_majors, rmm::device_uvector&& minor_map, int64_t indices_per_major, - const rmm::device_uvector& global_degree_offsets, - const rmm::device_uvector& global_adjacency_list_offsets); + const rmm::device_uvector& global_degree_offsets); template std::tuple, rmm::device_uvector, diff --git a/cpp/src/sampling/nbr_sampling_impl.cuh b/cpp/src/sampling/nbr_sampling_impl.cuh deleted file mode 100644 index 7b0402057ad..00000000000 --- a/cpp/src/sampling/nbr_sampling_impl.cuh +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Andrei Schaffer, aschaffer@nvidia.com -// -#pragma once - -#include -#include -#include - -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rw_traversals.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace cugraph { -namespace detail { -namespace original { - -/** - * @brief Projects zip input onto the lower dim zip output, where lower dimension components are - * specified by tuple indices; e.g., extracts the (destination_vertex_id, rank_to_send_it_to) - * components from the quadruplet (vertex_t source_vertex, vertex_t destination_vertex, int rank, - * edge_t index) via indices {1,2}; - * @tparam vertex_index non-type template parameter specifying index in the input tuple where vertex - * IDs are stored; - * @tparam rank_index non-type template parameter specifying index in the input tuple where rank IDs - * are stored; - * @tparam zip_in_it_t zip Type for the input tuple; - * @tparam zip_out_it_t zip Type for the output tuple; - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param[in] begin zip begin iterator of quadruplets from which new input is extracted; typically - * (vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index) - * @param[in] end zip end iterator of quadruplets from which new input is extracted; - * @param[out] result begin of result zip iterator of pairs for next iteration; typically - * (vertex_t source_vertex, int rank) - */ -template -void project(raft::handle_t const& handle, zip_in_it_t begin, zip_in_it_t end, zip_out_it_t result) -{ - thrust::transform(handle.get_thrust_policy(), begin, end, result, [] __device__(auto const& tpl) { - return thrust::make_tuple(thrust::get(tpl), thrust::get(tpl)); - }); -} - -/** - * @brief Shuffles zipped pairs of vertex IDs and ranks IDs to the GPU's that the vertex IDs belong - * to. The assumption is that the return provides a per-GPU coalesced set of pairs, with - * corresponding counts vector. To limit the result to the self-GPU one needs additional filtering - * to extract the corresponding set from the coalesced set of sets and using the corresponding - * counts entry. - * @tparam graph_view_t Type of graph view. - * @tparam zip_iterator_t zip Type for the zipped tuple (vertexID, rank); - * @tparam gpu_t Type used for storing GPU rank IDs; - * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, - * and handles to various CUDA libraries) to run graph algorithms. - * @param[in] graph_view Graph View object to generate NBR Sampling on. - * @param[in] begin zip begin iterator of (vertexID, rank) pairs. - * @param[in] end zip end iterator of (vertexID, rank) pairs. - * @param[in] unnamed tag used for template tag dispatching - * @return tuple pair of coalesced pairs and counts - */ -template -std::tuple, device_vec_t>, - std::vector> -shuffle_to_gpus(raft::handle_t const& handle, - graph_view_t const& graph_view, - zip_iterator_t begin, - zip_iterator_t end, - gpu_t) -{ - using vertex_t = typename graph_view_t::vertex_type; - using edge_t = typename graph_view_t::edge_type; - - auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); - device_vec_t d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); - - return groupby_gpu_id_and_shuffle_values( - handle.get_comms(), - begin, - end, - [vertex_partition_range_lasts = d_vertex_partition_range_lasts.data(), - num_vertex_partitions = d_vertex_partition_range_lasts.size()] __device__(auto tpl_v_r) { - return static_cast( - thrust::distance(vertex_partition_range_lasts, - thrust::lower_bound(thrust::seq, - vertex_partition_range_lasts, - vertex_partition_range_lasts + num_vertex_partitions, - thrust::get<0>(tpl_v_r)))); - }, - handle.get_stream()); -} - -/** - * @brief Updates pair of vertex IDs and ranks IDs to the GPU's that the vertex IDs belong - * to. - * @tparam graph_view_t Type of graph view. - * @tparam zip_iterator_t zip Type for the zipped tuple (vertexID, rank). - * @tparam gpu_t Type used for storing GPU rank IDs; - * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, - * and handles to various CUDA libraries) to run graph algorithms. - * @param[in] graph_view Graph View object to generate NBR Sampling on. - * @param[in] begin zip begin iterator of (vertexID, rank) pairs. - * @param[in] end zip end iterator of (vertexID, rank) pairs. - * @param[in] rank for which data is to be extracted. - * @param[out] d_in vertex set to be updated. - * @param[out] d_ranks corresponding rank set to be updated. - * @param[in] unnamed tag used for template tag dispatching. - */ -template -void update_input_by_rank(raft::handle_t const& handle, - graph_view_t const& graph_view, - zip_iterator_t begin, - zip_iterator_t end, - size_t rank, - device_vec_t& d_in, - device_vec_t& d_ranks, - gpu_t) -{ - auto&& [rx_tpl_v_r, rx_counts] = - detail::original::shuffle_to_gpus(handle, graph_view, begin, end, gpu_t{}); - - // filter rx_tpl_v_r and rx_counts vector by rank: - // - decltype(rx_counts) rx_offsets(rx_counts.size()); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_offsets.begin(), 0); - - // resize d_in, d_ranks: - // - auto new_in_sz = rx_counts.at(rank); - d_in.resize(new_in_sz, handle.get_stream()); - d_ranks.resize(new_in_sz, handle.get_stream()); - - // project output onto input: - // zip d_in, d_ranks - // - auto new_in_zip = thrust::make_zip_iterator( - thrust::make_tuple(d_in.begin(), d_ranks.begin())); // result start_zip - - auto&& d_new_dests = std::get<0>(rx_tpl_v_r); - auto&& d_new_ranks = std::get<1>(rx_tpl_v_r); - auto offset = rx_offsets.at(rank); - - auto tpl_in_it_begin = thrust::make_zip_iterator( - thrust::make_tuple(d_new_dests.begin() + offset, d_new_ranks.begin() + offset)); - project<0, 1>(handle, tpl_in_it_begin, tpl_in_it_begin + new_in_sz, new_in_zip); -} - -/** - * @brief Shuffles zipped tuples of (vertex_t source_vertex, vertex_t destination_vertex, int rank, - * index_t index) to specified target GPU's. - * @tparam vertex_t Type of vertex IDs. - * @tparam gpu_t Type used for storing GPU rank IDs. - * @tparam index_t Type used for indexing; typically edge_t. - * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, - * and handles to various CUDA libraries) to run graph algorithms. - * @param[in] d_src source vertex IDs; shuffle prims require it be mutable. - * @param[in] d_dst destination vertex IDs; must be mutable. - * @param[in] d_gpu_id_keys target GPU IDs (ranks); must be mutable. - * @param[in] d_indices indices of destination vertices; must be mutable. - * @return tuple of tuple of device vectors and counts: - * ((vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index), rx_counts) - */ -template -std::tuple, - device_vec_t, - device_vec_t, - device_vec_t>, - std::vector> -shuffle_to_target_gpu_ids(raft::handle_t const& handle, - device_vec_t& d_src, - device_vec_t& d_dst, - device_vec_t& d_gpu_id_keys, - device_vec_t& d_indices) -{ - auto zip_it_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_src.begin(), d_dst.begin(), d_indices.begin())); - - thrust::sort_by_key( - handle.get_thrust_policy(), d_gpu_id_keys.begin(), d_gpu_id_keys.end(), zip_it_begin); - - rmm::device_uvector tx_counts(handle.get_comms().get_size(), handle.get_stream()); - - thrust::tabulate( - handle.get_thrust_policy(), - tx_counts.begin(), - tx_counts.end(), - [gpu_id_first = d_gpu_id_keys.begin(), gpu_id_last = d_gpu_id_keys.end()] __device__(size_t i) { - return static_cast(thrust::distance( - gpu_id_first, - thrust::upper_bound(thrust::seq, gpu_id_first, gpu_id_last, static_cast(i)))); - }); - - thrust::adjacent_difference( - handle.get_thrust_policy(), tx_counts.begin(), tx_counts.end(), tx_counts.begin()); - - std::vector h_tx_counts(tx_counts.size()); - raft::update_host(h_tx_counts.data(), tx_counts.data(), tx_counts.size(), handle.get_stream()); - - handle.sync_stream(); - - return // [rx_tuple, rx_counts] - shuffle_values(handle.get_comms(), - thrust::make_zip_iterator(thrust::make_tuple( - d_src.begin(), d_dst.begin(), d_gpu_id_keys.begin(), d_indices.begin())), - h_tx_counts, - handle.get_stream()); -} - -/** - * @brief Multi-GPU Uniform Neighborhood Sampling. The outline of the algorithm: - * - * uniform_nbr_sample(J[p][], L, K[], flag_unique) { - * Out[p][] = {}; // initialize output result - * (empty) - * - * loop level in {0,…, L-1} { // 1 tree level / iteration - * n_per_level = |J| * L^ (level+1); // size of output per level - * - * J[] = union(J[], {J[partition_row], - * for partition_row same as `p`}; - * - * for each pair (s, _) in J[] { // cache out-degrees of src_v - * set; d_out_deg[s] = mnmg_get_out_deg(graph, s); - * } - * - * d_indices[] = segmented_random_generator(d_out_degs[], // sizes[] to define range to - * // sample from; - * K[level], // fanout per-level - * flag_unique); - * // for each (s, _) in J[]{ - * // generate {0,…,out-deg(s)};} - * - * d_out[] = gather_nbr(J[], d_indices[], level, K[level]); // {(s, d, r),…} MNMG prim that - * // gathers the NBR for current - * // level of each src_v; - * // output is set of triplets - * // (src_v, dst_v, - * rank_to_send_to) Out[p][] = union(Out[p][], d_out[]); // append local - * output to result d_out[] = shuffle(d_out[]); // reshuffle output - * to - * // corresponding rank - * J[] = project(d_out[], []((s,d,r)){ return (d,r);}); // extract the (d, r) from (s,d, - * r) - * // for next iter - * } - * return Out[p][]; - * } - * - * @tparam graph_view_t Type of graph view. - * @tparam gpu_t Type used for storing GPU rank IDs; - * @tparam index_t Type used for indexing; typically edge_t. - * @tparam seeder_t Type for generating random engine seeds. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Graph View object to generate NBR Sampling on. - * @param d_in Device vector of starting vertex IDs for the NBR Sampling. Must be non-const for - * shuffling. - * @param d_ranks Device vector of ranks for which corresponding vertex ID data must be sent to. The - * pairs (vertex_ID, rank) must be shuffled together. Must be non-const for shuffling. - * @param h_fan_out vector of branching out (fan-out) degree per source vertex for each level - * @param global_degree_offsets local partition of global out-degree cache; pass-through - * parameter used for obtaining local out-degree information - * @param flag_replacement boolean flag specifying if random sampling is done without replacement - * (true); or, with replacement (false); default = true; - * @return tuple of device vectors: - * (vertex_t source_vertex, vertex_t destination_vertex, int rank, edge_t index) - */ -template > -std::tuple, - device_vec_t, - device_vec_t, - device_vec_t> -uniform_nbr_sample_impl( - raft::handle_t const& handle, - graph_view_t const& graph_view, - device_vec_t& d_in, - device_vec_t& d_ranks, - std::vector const& h_fan_out, - device_vec_t const& global_out_degrees, - device_vec_t const& global_degree_offsets, - device_vec_t const& global_adjacency_list_offsets, - bool flag_replacement) -{ - using vertex_t = typename graph_view_t::vertex_type; - using edge_t = typename graph_view_t::edge_type; - using return_t = std::tuple, - device_vec_t, - device_vec_t, - device_vec_t>; - namespace cugraph_ops = cugraph::ops::gnn::graph; - - if constexpr (graph_view_t::is_multi_gpu) { - size_t num_starting_vs = d_in.size(); - - CUGRAPH_EXPECTS(num_starting_vs == d_ranks.size(), - "Sets of input vertices and ranks must have same sizes."); - - auto num_levels = h_fan_out.size(); - - CUGRAPH_EXPECTS(num_levels > 0, "Invalid input argument: number of levels must be non-zero."); - - // Output quad of accumulators to collect results into: - // (all start as empty) - // - device_vec_t d_acc_src(0, handle.get_stream()); - device_vec_t d_acc_dst(0, handle.get_stream()); - device_vec_t d_acc_ranks(0, handle.get_stream()); - device_vec_t d_acc_indices(0, handle.get_stream()); - - auto&& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto&& row_rank = row_comm.get_rank(); - - auto&& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto&& col_rank = col_comm.get_rank(); - - auto const self_rank = handle.get_comms().get_rank(); - - size_t level{0l}; - for (auto&& k_level : h_fan_out) { - // prep step for extracting out-degs(sources): - // - auto&& [d_new_in, d_new_rank] = - gather_active_majors(handle, graph_view, d_in.cbegin(), d_in.cend(), d_ranks.cbegin()); - - rmm::device_uvector d_out_src(0, handle.get_stream()); - rmm::device_uvector d_out_dst(0, handle.get_stream()); - rmm::device_uvector d_out_ranks(0, handle.get_stream()); - rmm::device_uvector d_indices(0, handle.get_stream()); - - if (k_level != 0) { - // extract out-degs(sources): - // - auto&& d_out_degs = - get_active_major_global_degrees(handle, graph_view, d_new_in, global_out_degrees); - - // segemented-random-generation of indices: - // - device_vec_t d_rnd_indices(d_new_in.size() * k_level, handle.get_stream()); - - raft::random::RngState rng_state(row_rank + level); - cugraph_ops::get_sampling_index(detail::original::raw_ptr(d_rnd_indices), - rng_state, - detail::original::raw_const_ptr(d_out_degs), - static_cast(d_out_degs.size()), - static_cast(k_level), - flag_replacement, - handle.get_stream()); - - // gather edges step: - // invalid entries (not found, etc.) filtered out in result; - // d_indices[] filtered out in-place (to avoid copies+moves); - // - auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = - gather_local_edges(handle, - graph_view, - d_new_in, - d_new_rank, - std::move(d_rnd_indices), - static_cast(k_level), - global_degree_offsets, - global_adjacency_list_offsets); - d_out_src = std::move(temp_d_out_src); - d_out_dst = std::move(temp_d_out_dst); - d_out_ranks = std::move(temp_d_out_ranks); - d_indices = std::move(temp_d_indices); - } else { - auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = - gather_one_hop_edgelist( - handle, graph_view, d_new_in, d_new_rank, global_adjacency_list_offsets); - d_out_src = std::move(temp_d_out_src); - d_out_dst = std::move(temp_d_out_dst); - d_out_ranks = std::move(temp_d_out_ranks); - d_indices = std::move(temp_d_indices); - } - - // resize accumulators: - // - auto old_sz = d_acc_dst.size(); - auto add_sz = d_out_dst.size(); - auto new_sz = old_sz + add_sz; - - d_acc_src.resize(new_sz, handle.get_stream()); - d_acc_dst.resize(new_sz, handle.get_stream()); - d_acc_ranks.resize(new_sz, handle.get_stream()); - d_acc_indices.resize(new_sz, handle.get_stream()); - - // zip quad; must be done after resizing, - // because they grow from one iteration to another, - // so iterators could be invalidated: - // - auto acc_zip_it = - thrust::make_zip_iterator(thrust::make_tuple(d_acc_src.begin() + old_sz, - d_acc_dst.begin() + old_sz, - d_acc_ranks.begin() + old_sz, - d_acc_indices.begin() + old_sz)); - - // union step: - // - auto out_zip_it = thrust::make_zip_iterator(thrust::make_tuple( - d_out_src.begin(), d_out_dst.begin(), d_out_ranks.begin(), d_indices.begin())); - - thrust::copy_n(handle.get_thrust_policy(), out_zip_it, add_sz, acc_zip_it); - - // shuffle step: update input for self_rank - // zipping is necessary to preserve rank info during shuffle! - // - auto next_in_zip_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.begin(), d_out_ranks.begin())); - auto next_in_zip_end = - thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.end(), d_out_ranks.end())); - - update_input_by_rank(handle, - graph_view, - next_in_zip_begin, - next_in_zip_end, - static_cast(self_rank), - d_in, - d_ranks, - gpu_t{}); - - ++level; - } - - return std::make_tuple( - std::move(d_acc_src), std::move(d_acc_dst), std::move(d_acc_ranks), std::move(d_acc_indices)); - } else { - CUGRAPH_FAIL("Neighborhood sampling functionality is supported only for the multi-gpu case."); - } -} - -} // namespace original -} // namespace detail - -template -std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& graph_view, - typename graph_view_t::vertex_type const* ptr_d_start, - gpu_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement) -{ - using vertex_t = typename graph_view_t::vertex_type; - using edge_t = typename graph_view_t::edge_type; - - size_t const self_rank = handle.get_comms().get_rank(); - - // shuffle input data to its corresponding rank; - // (Note: shuffle prims require mutable iterators) - // - detail::original::device_vec_t d_start_vs(num_starting_vs, handle.get_stream()); - detail::original::device_vec_t d_ranks(num_starting_vs, handle.get_stream()); - // ...hence copy required: - // - thrust::copy_n(handle.get_thrust_policy(), ptr_d_start, num_starting_vs, d_start_vs.begin()); - thrust::copy_n(handle.get_thrust_policy(), ptr_d_ranks, num_starting_vs, d_ranks.begin()); - - // shuffle data to local rank: - // - auto next_in_zip_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_start_vs.begin(), d_ranks.begin())); - - auto next_in_zip_end = - thrust::make_zip_iterator(thrust::make_tuple(d_start_vs.end(), d_ranks.end())); - - detail::original::update_input_by_rank(handle, - graph_view, - next_in_zip_begin, - next_in_zip_end, - self_rank, - d_start_vs, - d_ranks, - gpu_t{}); - - // preamble step for out-degree info: - // - auto&& [global_degree_offsets, global_out_degrees] = - detail::original::get_global_degree_information(handle, graph_view); - auto&& global_adjacency_list_offsets = detail::original::get_global_adjacency_offset( - handle, graph_view, global_degree_offsets, global_out_degrees); - - // extract output quad SOA: - // - auto&& [d_src, d_dst, d_gpus, d_indices] = - detail::original::uniform_nbr_sample_impl(handle, - graph_view, - d_start_vs, - d_ranks, - h_fan_out, - global_out_degrees, - global_degree_offsets, - global_adjacency_list_offsets, - flag_replacement); - - // shuffle quad SOA by d_gpus: - // - return detail::original::shuffle_to_target_gpu_ids(handle, d_src, d_dst, d_gpus, d_indices); -} - -} // namespace cugraph diff --git a/cpp/src/sampling/nbr_sampling_mg.cu b/cpp/src/sampling/nbr_sampling_mg.cu deleted file mode 100644 index efb79d3995a..00000000000 --- a/cpp/src/sampling/nbr_sampling_mg.cu +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include "nbr_sampling_impl.cuh" - -namespace cugraph { -// template explicit instantiation directives (EIDir's): -// -// SG FP32{ -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int32_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int32_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int64_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); -//} -// -// SG FP64{ -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int32_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int32_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); - -template std::tuple, - rmm::device_uvector, - rmm::device_uvector, - rmm::device_uvector>, - std::vector> -uniform_nbr_sample(raft::handle_t const& handle, - graph_view_t const& gview, - int64_t const* ptr_d_start, - int32_t const* ptr_d_ranks, - size_t num_starting_vs, - std::vector const& h_fan_out, - bool flag_replacement); -//} - -} // namespace cugraph diff --git a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp index 2748d75d2fc..9fe22d8a8d5 100644 --- a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp @@ -52,7 +52,6 @@ uniform_nbr_sample_impl( raft::host_span h_fan_out, rmm::device_uvector const& global_out_degrees, rmm::device_uvector const& global_degree_offsets, - rmm::device_uvector const& global_adjacency_list_offsets, bool with_replacement, uint64_t seed) { @@ -123,8 +122,7 @@ uniform_nbr_sample_impl( d_in, std::move(d_rnd_indices), static_cast(k_level), - global_degree_offsets, - global_adjacency_list_offsets); + global_degree_offsets); } else { std::tie(d_out_src, d_out_dst, d_out_indices) = gather_one_hop_edgelist(handle, graph_view, d_in); @@ -183,8 +181,6 @@ uniform_nbr_sample( // auto&& [global_degree_offsets, global_out_degrees] = detail::get_global_degree_information(handle, graph_view); - auto&& global_adjacency_list_offsets = detail::get_global_adjacency_offset( - handle, graph_view, global_degree_offsets, global_out_degrees); return detail::uniform_nbr_sample_impl(handle, graph_view, @@ -192,7 +188,6 @@ uniform_nbr_sample( fan_out, global_out_degrees, global_degree_offsets, - global_adjacency_list_offsets, with_replacement, seed); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 26fc60e2cd6..a9841403723 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -610,10 +610,6 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG GATHER_ONE_HOP tests --------------------------------------------------------------- ConfigureTestMG(MG_GATHER_ONE_HOP_TEST sampling/detail/mg_gather_one_hop.cu) - ########################################################################################### - # - MG NBR SAMPLING tests ----------------------------------------------------------------- - ConfigureTestMG(MG_NBR_SAMPLING_TEST sampling/detail/mg_nbr_sampling.cu) - ########################################################################################### # - MG NBR SAMPLING tests ----------------------------------------------------------------- ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cu) diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c index 8fbd80a90c0..46a54b74cc3 100644 --- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -25,18 +25,18 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle, - vertex_t* h_src, - vertex_t* h_dst, - edge_t* h_idx, - size_t num_vertices, - size_t num_edges, - vertex_t* h_start, - size_t num_starts, - int* fan_out, - size_t max_depth, - bool_t with_replacement, - bool_t store_transposed) +int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + edge_t* h_idx, + size_t num_vertices, + size_t num_edges, + vertex_t* h_start, + size_t num_starts, + int* fan_out, + size_t max_depth, + bool_t with_replacement, + bool_t store_transposed) { int test_ret_value = 0; @@ -66,7 +66,7 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); - ret_code = cugraph_experimental_uniform_neighbor_sample( + ret_code = cugraph_uniform_neighbor_sample( handle, graph, d_start_view, h_fan_out_view, with_replacement, FALSE, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -123,140 +123,6 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han return test_ret_value; } -int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle, - vertex_t* h_src, - vertex_t* h_dst, - weight_t* h_wgt, - size_t num_vertices, - size_t num_edges, - vertex_t* h_start, - int* h_start_label, - size_t num_starts, - int* fan_out, - size_t max_depth, - bool_t with_replacement, - bool_t store_transposed) -{ - int test_ret_value = 0; - - cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; - cugraph_error_t* ret_error = NULL; - - cugraph_graph_t* graph = NULL; - cugraph_sample_result_t* result = NULL; - - cugraph_type_erased_device_array_t* d_start = NULL; - cugraph_type_erased_device_array_view_t* d_start_view = NULL; - cugraph_type_erased_device_array_t* d_start_label = NULL; - cugraph_type_erased_device_array_view_t* d_start_label_view = NULL; - cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; - - ret_code = create_mg_test_graph( - handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &graph, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); - - ret_code = - cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed."); - - d_start_view = cugraph_type_erased_device_array_view(d_start); - - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, d_start_view, (byte_t*)h_start, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - - ret_code = - cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed."); - - d_start_label_view = cugraph_type_erased_device_array_view(d_start_label); - - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, d_start_label_view, (byte_t*)h_start_label, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - - h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); - - ret_code = cugraph_uniform_neighbor_sample(handle, - graph, - d_start_view, - d_start_label_view, - h_fan_out_view, - with_replacement, - FALSE, - &result, - &ret_error); - - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); - - cugraph_type_erased_device_array_view_t* srcs; - cugraph_type_erased_device_array_view_t* dsts; - cugraph_type_erased_device_array_view_t* labels; - cugraph_type_erased_device_array_view_t* index; - cugraph_type_erased_host_array_view_t* counts; - - srcs = cugraph_sample_result_get_sources(result); - dsts = cugraph_sample_result_get_destinations(result); - labels = cugraph_sample_result_get_start_labels(result); - index = cugraph_sample_result_get_index(result); - counts = cugraph_sample_result_get_counts(result); - - size_t result_size = cugraph_type_erased_device_array_view_size(srcs); - - vertex_t h_srcs[result_size]; - vertex_t h_dsts[result_size]; - int h_labels[result_size]; - edge_t h_index[result_size]; - size_t* h_counts; - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_labels, labels, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts); - - // NOTE: The C++ tester does a more thorough validation. For our purposes - // here we will do a simpler validation, merely checking that all edges - // are actually part of the graph - weight_t M[num_vertices][num_vertices]; - - for (int i = 0; i < num_vertices; ++i) - for (int j = 0; j < num_vertices; ++j) - M[i][j] = 0.0; - - for (int i = 0; i < num_edges; ++i) - M[h_src[i]][h_dst[i]] = h_wgt[i]; - - for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_srcs[i]][h_dsts[i]] > 0.0, - "uniform_neighbor_sample got edge that doesn't exist"); - - bool_t found = FALSE; - for (int j = 0; j < num_starts; ++j) - found = found || (h_labels[i] == h_start_label[j]); - - TEST_ASSERT(test_ret_value, found, "invalid label"); - } - - cugraph_type_erased_host_array_view_free(h_fan_out_view); - - return test_ret_value; -} - int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle) { size_t num_edges = 8; @@ -264,21 +130,19 @@ int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle) size_t fan_out_size = 2; size_t num_starts = 2; - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t start[] = {2, 2}; - vertex_t start_labels[] = {0, 1}; - int fan_out[] = {1, 2}; + vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + edge_t idx[] = {0, 1, 2, 3, 4, 5, 6, 7}; + vertex_t start[] = {2, 2}; + int fan_out[] = {1, 2}; return generic_uniform_neighbor_sample_test(handle, src, dst, - wgt, + idx, num_vertices, num_edges, start, - start_labels, num_starts, fan_out, fan_out_size, @@ -286,33 +150,6 @@ int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle) FALSE); } -int test_experimental_uniform_neighbor_sample(const cugraph_resource_handle_t* handle) -{ - size_t num_edges = 8; - size_t num_vertices = 6; - size_t fan_out_size = 2; - size_t num_starts = 2; - - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - edge_t idx[] = {0, 1, 2, 3, 4, 5, 6, 7}; - vertex_t start[] = {2, 2}; - int fan_out[] = {1, 2}; - - return generic_experimental_uniform_neighbor_sample_test(handle, - src, - dst, - idx, - num_vertices, - num_edges, - start, - num_starts, - fan_out, - fan_out_size, - TRUE, - FALSE); -} - /******************************************************************************/ int main(int argc, char** argv) @@ -340,7 +177,6 @@ int main(int argc, char** argv) if (result == 0) { result |= RUN_MG_TEST(test_uniform_neighbor_sample, handle); - result |= RUN_MG_TEST(test_experimental_uniform_neighbor_sample, handle); cugraph_free_resource_handle(handle); } diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 428ccbec7a9..180ab96566a 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -110,18 +110,18 @@ int create_test_graph_with_edge_ids(const cugraph_resource_handle_t* p_handle, return test_ret_value; } -int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, - vertex_t* h_dst, - edge_t* h_ids, - size_t num_vertices, - size_t num_edges, - vertex_t* h_start, - size_t num_starts, - int* fan_out, - size_t max_depth, - bool_t with_replacement, - bool_t renumber, - bool_t store_transposed) +int generic_uniform_neighbor_sample_test(vertex_t* h_src, + vertex_t* h_dst, + edge_t* h_ids, + size_t num_vertices, + size_t num_edges, + vertex_t* h_start, + size_t num_starts, + int* fan_out, + size_t max_depth, + bool_t with_replacement, + bool_t renumber, + bool_t store_transposed) { int test_ret_value = 0; @@ -155,7 +155,7 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); - ret_code = cugraph_experimental_uniform_neighbor_sample( + ret_code = cugraph_uniform_neighbor_sample( handle, graph, d_start_view, h_fan_out_view, with_replacement, FALSE, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -210,151 +210,6 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, return test_ret_value; } -int generic_uniform_neighbor_sample_test(vertex_t* h_src, - vertex_t* h_dst, - weight_t* h_wgt, - size_t num_vertices, - size_t num_edges, - vertex_t* h_start, - int* h_start_label, - size_t num_starts, - int* fan_out, - size_t max_depth, - bool_t with_replacement, - bool_t renumber, - bool_t store_transposed) -{ - int test_ret_value = 0; - - cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; - cugraph_error_t* ret_error = NULL; - - cugraph_resource_handle_t* handle = NULL; - cugraph_graph_t* graph = NULL; - cugraph_sample_result_t* result = NULL; - - cugraph_type_erased_device_array_t* d_start = NULL; - cugraph_type_erased_device_array_view_t* d_start_view = NULL; - cugraph_type_erased_device_array_t* d_start_label = NULL; - cugraph_type_erased_device_array_view_t* d_start_label_view = NULL; - cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; - - handle = cugraph_create_resource_handle(NULL); - TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); - - ret_code = create_test_graph( - handle, h_src, h_dst, h_wgt, num_edges, store_transposed, renumber, FALSE, &graph, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); - - ret_code = - cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed."); - - d_start_view = cugraph_type_erased_device_array_view(d_start); - - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, d_start_view, (byte_t*)h_start, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - - ret_code = - cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed."); - - d_start_label_view = cugraph_type_erased_device_array_view(d_start_label); - - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, d_start_label_view, (byte_t*)h_start_label, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start_label copy_from_host failed."); - - h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); - - ret_code = cugraph_uniform_neighbor_sample(handle, - graph, - d_start_view, - d_start_label_view, - h_fan_out_view, - with_replacement, - FALSE, - &result, - &ret_error); - - TEST_ASSERT(test_ret_value, - ret_code != CUGRAPH_SUCCESS, - "cugraph_uniform_neighbor_sample expected to fail in SG test"); - -#if 0 - // FIXME: cugraph_uniform_neighbor_sample does not support SG - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); - - cugraph_type_erased_device_array_view_t* srcs; - cugraph_type_erased_device_array_view_t* dsts; - cugraph_type_erased_device_array_view_t* labels; - cugraph_type_erased_device_array_view_t* index; - cugraph_type_erased_host_array_view_t* counts; - - srcs = cugraph_sample_result_get_sources(result); - dsts = cugraph_sample_result_get_destinations(result); - labels = cugraph_sample_result_get_start_labels(result); - index = cugraph_sample_result_get_index(result); - counts = cugraph_sample_result_get_counts(result); - - size_t result_size = cugraph_type_erased_device_array_view_size(srcs); - - vertex_t h_srcs[result_size]; - vertex_t h_dsts[result_size]; - int h_labels[result_size]; - edge_t h_index[result_size]; - size_t* h_counts; - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_labels, labels, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts); - - // NOTE: The C++ tester does a more thorough validation. For our purposes - // here we will do a simpler validation, merely checking that all edges - // are actually part of the graph - weight_t M[num_vertices][num_vertices]; - - for (int i = 0; i < num_vertices; ++i) - for (int j = 0; j < num_vertices; ++j) - M[i][j] = 0.0; - - for (int i = 0; i < num_edges; ++i) - M[h_src[i]][h_dst[i]] = h_wgt[i]; - - for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_srcs[i]][h_dsts[i]] > 0.0, - "uniform_neighbor_sample got edge that doesn't exist"); - - bool_t found = FALSE; - for (int j = 0; j < num_starts; ++j) - found = found || (h_labels[i] == h_start_label[j]); - - TEST_ASSERT(test_ret_value, found, "invalid label"); - } - - cugraph_type_erased_host_array_view_free(h_fan_out_view); -#endif - - return test_ret_value; -} - int test_uniform_neighbor_sample() { size_t num_edges = 8; @@ -362,20 +217,18 @@ int test_uniform_neighbor_sample() size_t fan_out_size = 2; size_t num_starts = 2; - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t start[] = {2, 2}; - vertex_t start_labels[] = {0, 1}; - int fan_out[] = {1, 2}; + vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7}; + vertex_t start[] = {2, 2}; + int fan_out[] = {1, 2}; return generic_uniform_neighbor_sample_test(src, dst, - wgt, + edge_ids, num_vertices, num_edges, start, - start_labels, num_starts, fan_out, fan_out_size, @@ -384,37 +237,9 @@ int test_uniform_neighbor_sample() FALSE); } -int test_experimental_uniform_neighbor_sample() -{ - size_t num_edges = 8; - size_t num_vertices = 6; - size_t fan_out_size = 2; - size_t num_starts = 2; - - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7}; - vertex_t start[] = {2, 2}; - int fan_out[] = {1, 2}; - - return generic_experimental_uniform_neighbor_sample_test(src, - dst, - edge_ids, - num_vertices, - num_edges, - start, - num_starts, - fan_out, - fan_out_size, - TRUE, - FALSE, - FALSE); -} - int main(int argc, char** argv) { int result = 0; result |= RUN_TEST(test_uniform_neighbor_sample); - result |= RUN_TEST(test_experimental_uniform_neighbor_sample); return result; } diff --git a/cpp/tests/sampling/detail/mg_gather_one_hop.cu b/cpp/tests/sampling/detail/mg_gather_one_hop.cu index 7e7dc50bb09..11e3df78f8e 100644 --- a/cpp/tests/sampling/detail/mg_gather_one_hop.cu +++ b/cpp/tests/sampling/detail/mg_gather_one_hop.cu @@ -15,6 +15,9 @@ */ #include "nbr_sampling_utils.cuh" + +#include + #include #include @@ -92,49 +95,43 @@ class Tests_MG_GatherEdges // Generate random vertex ids in the range of current gpu auto [global_degree_offsets, global_out_degrees] = - cugraph::detail::original::get_global_degree_information(handle, mg_graph_view); - auto global_adjacency_list_offsets = cugraph::detail::original::get_global_adjacency_offset( - handle, mg_graph_view, global_degree_offsets, global_out_degrees); + cugraph::detail::get_global_degree_information(handle, mg_graph_view); // Generate random sources to gather on auto random_sources = cugraph::test::random_vertex_ids(handle, mg_graph_view.local_vertex_partition_range_first(), mg_graph_view.local_vertex_partition_range_last(), - source_sample_count, + std::min(mg_graph_view.local_vertex_partition_range_size() * + (repetitions_per_vertex + vertex_t{1}), + source_sample_count), repetitions_per_vertex); - rmm::device_uvector random_source_gpu_ids(random_sources.size(), handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - random_source_gpu_ids.begin(), - random_source_gpu_ids.end(), - comm_rank); - - auto [active_sources, active_source_gpu_ids] = - cugraph::detail::original::gather_active_majors(handle, - mg_graph_view, - random_sources.cbegin(), - random_sources.cend(), - random_source_gpu_ids.cbegin()); - - auto [src, dst, gpu_ids, edge_ids] = cugraph::detail::original::gather_one_hop_edgelist( - handle, mg_graph_view, active_sources, active_source_gpu_ids, global_adjacency_list_offsets); + + // FIXME: allgather is probably a poor name for this function. + // It's really an allgather across the row communicator + auto active_sources = + cugraph::detail::allgather_active_majors(handle, std::move(random_sources)); + + auto [src, dst, edge_ids] = + cugraph::detail::gather_one_hop_edgelist(handle, mg_graph_view, active_sources); if (prims_usecase.check_correctness) { - // Gather outputs - auto mg_out_srcs = cugraph::test::device_gatherv(handle, src.data(), src.size()); - auto mg_out_dsts = cugraph::test::device_gatherv(handle, dst.data(), dst.size()); - auto mg_out_prop = cugraph::test::device_gatherv(handle, gpu_ids.data(), gpu_ids.size()); - - auto mg_out_edge_ids = - cugraph::test::device_gatherv(handle, edge_ids.data(), edge_ids.size()); - - // Gather inputs - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_rank = col_comm.get_rank(); - auto sg_active_sources = cugraph::test::device_gatherv( - handle, active_sources.data(), col_rank == 0 ? active_sources.size() : 0); - auto sg_active_sources_gpu_ids = cugraph::test::device_gatherv( - handle, active_source_gpu_ids.data(), col_rank == 0 ? active_source_gpu_ids.size() : 0); + // Gather outputs to gpu 0 + auto mg_out_srcs = cugraph::test::device_gatherv( + handle, raft::device_span{src.data(), src.size()}); + auto mg_out_dsts = cugraph::test::device_gatherv( + handle, raft::device_span{dst.data(), dst.size()}); + + // Gather relevant edges from graph + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_rank = col_comm.get_rank(); + auto all_active_sources = cugraph::test::device_allgatherv( + handle, + raft::device_span{active_sources.data(), + col_rank == 0 ? active_sources.size() : 0}); + + thrust::sort( + handle.get_thrust_policy(), all_active_sources.begin(), all_active_sources.end()); // Gather input graph edgelist rmm::device_uvector sg_src(0, handle.get_stream()); @@ -142,42 +139,57 @@ class Tests_MG_GatherEdges std::tie(sg_src, sg_dst, std::ignore) = mg_graph_view.decompress_to_edgelist(handle, std::nullopt); - auto aggregated_sg_src = cugraph::test::device_gatherv(handle, sg_src.begin(), sg_src.size()); - auto aggregated_sg_dst = cugraph::test::device_gatherv(handle, sg_dst.begin(), sg_dst.size()); - - sort_coo(handle, mg_out_srcs, mg_out_prop, mg_out_dsts); - - if (handle.get_comms().get_rank() == int{0}) { - cugraph::graph_t sg_graph(handle); - auto aggregated_edge_iter = thrust::make_zip_iterator( - thrust::make_tuple(aggregated_sg_src.begin(), aggregated_sg_dst.begin())); - thrust::sort(handle.get_thrust_policy(), - aggregated_edge_iter, - aggregated_edge_iter + aggregated_sg_src.size()); - auto sg_graph_properties = - cugraph::graph_properties_t{mg_graph_view.is_symmetric(), mg_graph_view.is_multigraph()}; - - std::tie(sg_graph, std::ignore) = - cugraph::create_graph_from_edgelist( - handle, - std::nullopt, - std::move(aggregated_sg_src), - std::move(aggregated_sg_dst), - std::nullopt, - sg_graph_properties, - false); - auto sg_graph_view = sg_graph.view(); - // Call single gpu gather - auto [sg_out_srcs, sg_out_dsts, sg_out_prop] = - sg_gather_edges(handle, sg_graph_view, sg_active_sources, sg_active_sources_gpu_ids); - sort_coo(handle, sg_out_srcs, sg_out_prop, sg_out_dsts); - - auto passed = thrust::equal( - handle.get_thrust_policy(), sg_out_srcs.begin(), sg_out_srcs.end(), mg_out_srcs.begin()); - passed &= thrust::equal( - handle.get_thrust_policy(), sg_out_dsts.begin(), sg_out_dsts.end(), mg_out_dsts.begin()); - ASSERT_TRUE(passed); - } + auto begin_iter = thrust::make_zip_iterator(sg_src.begin(), sg_dst.begin()); + auto new_end = thrust::remove_if( + handle.get_thrust_policy(), + begin_iter, + begin_iter + sg_src.size(), + [sources = all_active_sources.data(), size = all_active_sources.size()] __device__(auto t) { + auto src = thrust::get<0>(t); + return !thrust::binary_search(thrust::seq, sources, sources + size, src); + }); + + sg_src.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + sg_dst.resize(thrust::distance(begin_iter, new_end), handle.get_stream()); + + auto aggregated_sg_src = cugraph::test::device_gatherv( + handle, raft::device_span{sg_src.begin(), sg_src.size()}); + auto aggregated_sg_dst = cugraph::test::device_gatherv( + handle, raft::device_span{sg_dst.begin(), sg_dst.size()}); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()), + thrust::make_zip_iterator(mg_out_srcs.end(), mg_out_dsts.end())); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(aggregated_sg_src.begin(), aggregated_sg_dst.begin()), + thrust::make_zip_iterator(aggregated_sg_src.end(), aggregated_sg_dst.end())); + + // FIXME: This is ignoring the case of the same seed being specified multiple + // times. Not sure that's worth worrying about, so taking the easy way out here. + auto unique_end = + thrust::unique(handle.get_thrust_policy(), + thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()), + thrust::make_zip_iterator(mg_out_srcs.end(), mg_out_dsts.end())); + + mg_out_srcs.resize( + thrust::distance(thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()), + unique_end), + handle.get_stream()); + mg_out_dsts.resize( + thrust::distance(thrust::make_zip_iterator(mg_out_srcs.begin(), mg_out_dsts.begin()), + unique_end), + handle.get_stream()); + + auto passed = thrust::equal(handle.get_thrust_policy(), + mg_out_srcs.begin(), + mg_out_srcs.end(), + aggregated_sg_src.begin()); + passed &= thrust::equal(handle.get_thrust_policy(), + mg_out_dsts.begin(), + mg_out_dsts.end(), + aggregated_sg_dst.begin()); + ASSERT_TRUE(passed); } } }; diff --git a/cpp/tests/sampling/detail/mg_gather_utils.cu b/cpp/tests/sampling/detail/mg_gather_utils.cu index 6ea0e40c60a..dc0a2fb2cc1 100644 --- a/cpp/tests/sampling/detail/mg_gather_utils.cu +++ b/cpp/tests/sampling/detail/mg_gather_utils.cu @@ -15,9 +15,14 @@ */ #include "nbr_sampling_utils.cuh" + +#include + #include #include +#include + #include #include @@ -30,6 +35,102 @@ struct Prims_Usecase { bool check_correctness{true}; }; +template +std::tuple, std::vector> test_gather_local_edges( + raft::handle_t const& handle, + cugraph::graph_view_t const& mg_graph_view, + rmm::device_uvector const& sources, + rmm::device_uvector const& destination_offsets, + edge_t indices_per_source) +{ + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_rank = col_comm.get_rank(); + + // logic relies on gather_one_hop not having duplicates + rmm::device_uvector sources_copy(sources.size(), handle.get_stream()); + raft::copy(sources_copy.data(), sources.data(), sources.size(), handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), sources_copy.begin(), sources_copy.end()); + auto sources_copy_end = + thrust::unique(handle.get_thrust_policy(), sources_copy.begin(), sources_copy.end()); + sources_copy.resize(thrust::distance(sources_copy.begin(), sources_copy_end), + handle.get_stream()); + + auto [one_hop_src, one_hop_dst, one_hop_edge_ids] = + cugraph::detail::gather_one_hop_edgelist(handle, mg_graph_view, sources_copy); + + rmm::device_uvector one_hop_gpu_id(one_hop_src.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + one_hop_gpu_id.begin(), + one_hop_gpu_id.end(), + handle.get_comms().get_rank()); + + // Pull everything to rank 0 + auto sg_src = cugraph::test::device_gatherv( + handle, raft::device_span{one_hop_src.data(), one_hop_src.size()}); + auto sg_dst = cugraph::test::device_gatherv( + handle, raft::device_span{one_hop_dst.data(), one_hop_dst.size()}); + auto sg_gpu_id = cugraph::test::device_gatherv( + handle, raft::device_span{one_hop_gpu_id.data(), one_hop_gpu_id.size()}); + auto sg_sources = cugraph::test::device_gatherv( + handle, raft::device_span{sources.data(), col_rank == 0 ? sources.size() : 0}); + auto sg_destination_offsets = cugraph::test::device_gatherv( + handle, + raft::device_span{destination_offsets.data(), + col_rank == 0 ? destination_offsets.size() : 0}); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(sg_src.begin(), sg_gpu_id.begin(), sg_dst.begin()), + thrust::make_zip_iterator(sg_src.end(), sg_gpu_id.end(), sg_dst.end())); + + std::vector h_sources(sg_sources.size()); + std::vector h_src(sg_src.size()); + std::vector h_dst(sg_dst.size()); + std::vector h_result_src(sg_destination_offsets.size()); + std::vector h_result_dst(sg_destination_offsets.size()); + std::vector h_destination_offsets(sg_destination_offsets.size()); + + raft::update_host(h_sources.data(), sg_sources.data(), sg_sources.size(), handle.get_stream()); + raft::update_host(h_src.data(), sg_src.data(), sg_src.size(), handle.get_stream()); + raft::update_host(h_dst.data(), sg_dst.data(), sg_dst.size(), handle.get_stream()); + raft::update_host(h_destination_offsets.data(), + sg_destination_offsets.data(), + sg_destination_offsets.size(), + handle.get_stream()); + + thrust::for_each(thrust::host, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(sg_destination_offsets.size()), + [&] __host__(auto i) { + h_result_src[i] = h_sources[i / indices_per_source]; + h_result_dst[i] = mg_graph_view.number_of_vertices(); + edge_t offset = h_destination_offsets[i]; + + for (size_t j = 0; j < h_src.size(); ++j) { + if (h_result_src[i] == h_src[j]) { + if (offset == 0) { + h_result_dst[i] = h_dst[j]; + break; + } + --offset; + } + } + }); + + auto new_end = + thrust::remove_if(thrust::host, + thrust::make_zip_iterator(h_result_src.begin(), h_result_dst.begin()), + thrust::make_zip_iterator(h_result_src.end(), h_result_dst.end()), + [invalid_vertex = mg_graph_view.number_of_vertices()] __host__(auto p) { + return (thrust::get<1>(p) == invalid_vertex); + }); + + h_result_src.resize(thrust::distance( + thrust::make_zip_iterator(h_result_src.begin(), h_result_dst.begin()), new_end)); + h_result_dst.resize(h_result_src.size()); + + return std::make_tuple(std::move(h_result_src), std::move(h_result_dst)); +} + template class Tests_MG_GatherEdges : public ::testing::TestWithParam> { @@ -93,118 +194,77 @@ class Tests_MG_GatherEdges // Generate random vertex ids in the range of current gpu auto [global_degree_offsets, global_out_degrees] = - cugraph::detail::original::get_global_degree_information(handle, mg_graph_view); - auto global_adjacency_list_offsets = cugraph::detail::original::get_global_adjacency_offset( - handle, mg_graph_view, global_degree_offsets, global_out_degrees); + cugraph::detail::get_global_degree_information(handle, mg_graph_view); // Generate random sources to gather on - auto random_sources = random_vertex_ids(handle, - mg_graph_view.local_vertex_partition_range_first(), - mg_graph_view.local_vertex_partition_range_last(), - source_sample_count, - repetitions_per_vertex); - rmm::device_uvector random_source_gpu_ids(random_sources.size(), handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - random_source_gpu_ids.begin(), - random_source_gpu_ids.end(), - comm_rank); - - auto [active_sources, active_source_gpu_ids] = - cugraph::detail::original::gather_active_majors(handle, - mg_graph_view, - random_sources.cbegin(), - random_sources.cend(), - random_source_gpu_ids.cbegin()); + auto random_sources = + random_vertex_ids(handle, + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last(), + std::min(mg_graph_view.local_vertex_partition_range_size() * + (repetitions_per_vertex + vertex_t{1}), + source_sample_count), + repetitions_per_vertex); + + // FIXME: allgather is probably a poor name for this function. + // It's really an allgather across the row communicator + auto active_sources = + cugraph::detail::allgather_active_majors(handle, std::move(random_sources)); // get source global out degrees to generate indices - auto active_source_degrees = cugraph::detail::original::get_active_major_global_degrees( + auto active_source_degrees = cugraph::detail::get_active_major_global_degrees( handle, mg_graph_view, active_sources, global_out_degrees); - auto random_destination_indices = + auto random_destination_offsets = generate_random_destination_indices(handle, active_source_degrees, mg_graph_view.number_of_vertices(), - mg_graph_view.number_of_edges(), + edge_t{-1}, indices_per_source); - rmm::device_uvector input_destination_indices(random_destination_indices.size(), + + rmm::device_uvector input_destination_offsets(random_destination_offsets.size(), handle.get_stream()); - raft::update_device(input_destination_indices.data(), - random_destination_indices.data(), - random_destination_indices.size(), - handle.get_stream()); - - auto [src, dst, gpu_ids, dst_map] = - cugraph::detail::original::gather_local_edges(handle, - mg_graph_view, - active_sources, - active_source_gpu_ids, - std::move(input_destination_indices), - indices_per_source, - global_degree_offsets, - global_adjacency_list_offsets); + raft::copy(input_destination_offsets.data(), + random_destination_offsets.data(), + random_destination_offsets.size(), + handle.get_stream()); + + auto [src, dst, dst_map] = + cugraph::detail::gather_local_edges(handle, + mg_graph_view, + active_sources, + std::move(random_destination_offsets), + indices_per_source, + global_degree_offsets); if (prims_usecase.check_correctness) { - // Gather outputs - auto mg_out_srcs = cugraph::test::device_gatherv(handle, src.data(), src.size()); - auto mg_out_dsts = cugraph::test::device_gatherv(handle, dst.data(), dst.size()); - - // Gather inputs - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_rank = col_comm.get_rank(); - auto sg_random_srcs = cugraph::test::device_gatherv( - handle, active_sources.data(), col_rank == 0 ? active_sources.size() : 0); - auto sg_random_dst_indices = - cugraph::test::device_gatherv(handle, - random_destination_indices.data(), - col_rank == 0 ? random_destination_indices.size() : 0); - - // Gather input graph edgelist - rmm::device_uvector sg_src(0, handle.get_stream()); - rmm::device_uvector sg_dst(0, handle.get_stream()); - std::tie(sg_src, sg_dst, std::ignore) = - mg_graph_view.decompress_to_edgelist(handle, std::nullopt); - - auto aggregated_sg_src = cugraph::test::device_gatherv(handle, sg_src.begin(), sg_src.size()); - auto aggregated_sg_dst = cugraph::test::device_gatherv(handle, sg_dst.begin(), sg_dst.size()); - - sort_coo(handle, mg_out_srcs, mg_out_dsts); - - if (handle.get_comms().get_rank() == int{0}) { - cugraph::graph_t sg_graph(handle); - auto aggregated_edge_iter = thrust::make_zip_iterator( - thrust::make_tuple(aggregated_sg_src.begin(), aggregated_sg_dst.begin())); - thrust::sort(handle.get_thrust_policy(), - aggregated_edge_iter, - aggregated_edge_iter + aggregated_sg_src.size()); - auto sg_graph_properties = - cugraph::graph_properties_t{mg_graph_view.is_symmetric(), mg_graph_view.is_multigraph()}; - - std::tie(sg_graph, std::ignore) = - cugraph::create_graph_from_edgelist( - handle, - std::nullopt, - std::move(aggregated_sg_src), - std::move(aggregated_sg_dst), - std::nullopt, - sg_graph_properties, - false); - auto sg_graph_view = sg_graph.view(); - // Call single gpu gather - auto [sg_out_srcs, sg_out_dsts] = sg_gather_edges(handle, - sg_graph_view, - sg_random_srcs.begin(), - sg_random_srcs.end(), - sg_random_dst_indices.begin(), - sg_graph_view.number_of_vertices(), - indices_per_source); - sort_coo(handle, sg_out_srcs, sg_out_dsts); - - auto passed = thrust::equal( - handle.get_thrust_policy(), sg_out_srcs.begin(), sg_out_srcs.end(), mg_out_srcs.begin()); - passed &= thrust::equal( - handle.get_thrust_policy(), sg_out_dsts.begin(), sg_out_dsts.end(), mg_out_dsts.begin()); - ASSERT_TRUE(passed); - } + // NOTE: This test assumes that edgea within the data structure are sorted + // We'll use gather_one_hop_edgelist to pull out the relevant edges + auto [h_src, h_dst] = test_gather_local_edges( + handle, mg_graph_view, active_sources, input_destination_offsets, indices_per_source); + + auto agg_src = cugraph::test::device_gatherv( + handle, raft::device_span{src.data(), src.size()}); + auto agg_dst = cugraph::test::device_gatherv( + handle, raft::device_span{dst.data(), dst.size()}); + + thrust::sort(handle.get_thrust_policy(), + thrust::make_zip_iterator(agg_src.begin(), agg_dst.begin()), + thrust::make_zip_iterator(agg_src.end(), agg_dst.end())); + thrust::sort(thrust::host, + thrust::make_zip_iterator(h_src.begin(), h_dst.begin()), + thrust::make_zip_iterator(h_src.end(), h_dst.end())); + + std::vector h_agg_src(agg_src.size()); + std::vector h_agg_dst(agg_dst.size()); + raft::update_host(h_agg_src.data(), agg_src.data(), agg_src.size(), handle.get_stream()); + raft::update_host(h_agg_dst.data(), agg_dst.data(), agg_dst.size(), handle.get_stream()); + + // FIXME: Why are the randomly selected vertices on each GPU so similar?? + + auto passed = thrust::equal(thrust::host, h_src.begin(), h_src.end(), h_agg_src.begin()); + passed &= thrust::equal(thrust::host, h_dst.begin(), h_dst.end(), h_agg_dst.begin()); + ASSERT_TRUE(passed); } } }; diff --git a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh index 96e6d129b24..093c231dfce 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_utils.cuh +++ b/cpp/tests/sampling/detail/nbr_sampling_utils.cuh @@ -19,7 +19,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu index a036dda5fb1..12a387d4589 100644 --- a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu +++ b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu @@ -90,12 +90,15 @@ class Tests_MG_Nbr_Sampling constexpr vertex_t source_sample_count = 3; // Generate random vertex ids in the range of current gpu - auto random_sources = random_vertex_ids(handle, - mg_graph_view.local_vertex_partition_range_first(), - mg_graph_view.local_vertex_partition_range_last(), - source_sample_count, - repetitions_per_vertex, - comm_rank); + auto random_sources = + random_vertex_ids(handle, + mg_graph_view.local_vertex_partition_range_first(), + mg_graph_view.local_vertex_partition_range_last(), + std::min(mg_graph_view.local_vertex_partition_range_size() * + (repetitions_per_vertex + vertex_t{1}), + source_sample_count), + repetitions_per_vertex, + comm_rank); std::vector h_fan_out{indices_per_source}; // depth = 1 @@ -108,14 +111,14 @@ class Tests_MG_Nbr_Sampling if (prims_usecase.check_correctness) { // Consolidate results on GPU 0 - auto d_mg_start_src = - cugraph::test::device_gatherv(handle, random_sources.data(), random_sources.size()); - auto d_mg_aggregate_src = - cugraph::test::device_gatherv(handle, d_src_out.data(), d_src_out.size()); - auto d_mg_aggregate_dst = - cugraph::test::device_gatherv(handle, d_dst_out.data(), d_dst_out.size()); - auto d_mg_aggregate_indices = - cugraph::test::device_gatherv(handle, d_indices.data(), d_indices.size()); + auto d_mg_start_src = cugraph::test::device_gatherv( + handle, raft::device_span{random_sources.data(), random_sources.size()}); + auto d_mg_aggregate_src = cugraph::test::device_gatherv( + handle, raft::device_span{d_src_out.data(), d_src_out.size()}); + auto d_mg_aggregate_dst = cugraph::test::device_gatherv( + handle, raft::device_span{d_dst_out.data(), d_dst_out.size()}); + auto d_mg_aggregate_indices = cugraph::test::device_gatherv( + handle, raft::device_span{d_indices.data(), d_indices.size()}); #if 0 // FIXME: extract_induced_subgraphs not currently support MG, so we'll skip this validation diff --git a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu index 241de0bf747..346c6e1d449 100644 --- a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu +++ b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu @@ -72,7 +72,9 @@ class Tests_Uniform_Neighbor_Sampling cugraph::test::random_vertex_ids(handle, graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last(), - source_sample_count, + std::min(graph_view.local_vertex_partition_range_size() * + (repetitions_per_vertex + vertex_t{1}), + source_sample_count), repetitions_per_vertex, uint64_t{0}); diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu index 9937e59273d..9ce1f35a33c 100644 --- a/cpp/tests/utilities/device_comm_wrapper.cu +++ b/cpp/tests/utilities/device_comm_wrapper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,11 +26,13 @@ namespace cugraph { namespace test { template -rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size) +rmm::device_uvector device_gatherv(raft::handle_t const& handle, + raft::device_span d_input) + { bool is_root = handle.get_comms().get_rank() == int{0}; auto rx_sizes = - cugraph::host_scalar_gather(handle.get_comms(), size, int{0}, handle.get_stream()); + cugraph::host_scalar_gather(handle.get_comms(), d_input.size(), int{0}, handle.get_stream()); std::vector rx_displs(is_root ? static_cast(handle.get_comms().get_size()) : size_t{0}); if (is_root) { std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); } @@ -39,9 +41,9 @@ rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_i is_root ? std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream()); cugraph::device_gatherv(handle.get_comms(), - d_input, + d_input.data(), gathered_v.data(), - size, + d_input.size(), rx_sizes, rx_displs, int{0}, @@ -50,23 +52,53 @@ rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_i return gathered_v; } +template +rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input) +{ + auto rx_sizes = + cugraph::host_scalar_allgather(handle.get_comms(), d_input.size(), handle.get_stream()); + std::vector rx_displs(static_cast(handle.get_comms().get_size())); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + + rmm::device_uvector gathered_v(std::reduce(rx_sizes.begin(), rx_sizes.end()), + handle.get_stream()); + + cugraph::device_allgatherv(handle.get_comms(), + d_input.data(), + gathered_v.data(), + rx_sizes, + rx_displs, + handle.get_stream()); + + return gathered_v; +} + // explicit instantiation template rmm::device_uvector device_gatherv(raft::handle_t const& handle, - int32_t const* d_input, - size_t size); + raft::device_span d_input); template rmm::device_uvector device_gatherv(raft::handle_t const& handle, - int64_t const* d_input, - size_t size); + raft::device_span d_input); template rmm::device_uvector device_gatherv(raft::handle_t const& handle, - float const* d_input, - size_t size); + raft::device_span d_input); template rmm::device_uvector device_gatherv(raft::handle_t const& handle, - double const* d_input, - size_t size); + raft::device_span d_input); + +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/device_comm_wrapper.hpp b/cpp/tests/utilities/device_comm_wrapper.hpp index 55145edd71b..c1d7b6b8250 100644 --- a/cpp/tests/utilities/device_comm_wrapper.hpp +++ b/cpp/tests/utilities/device_comm_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,14 +16,35 @@ #pragma once +#include #include + #include namespace cugraph { namespace test { template -rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size); +rmm::device_uvector device_gatherv(raft::handle_t const& handle, + raft::device_span d_input); + +template +rmm::device_uvector device_gatherv(raft::handle_t const& handle, T const* d_input, size_t size) +{ + return device_gatherv(handle, raft::device_span{d_input, size}); +} + +template +rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + +template +rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + T const* d_input, + size_t size) +{ + return device_allgatherv(handle, raft::device_span{d_input, size}); +} } // namespace test } // namespace cugraph diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 396b73afee5..cea48f5c420 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -183,7 +183,7 @@ cdef extern from "cugraph_c/algorithms.h": # uniform neighborhood sampling cdef cugraph_error_code_t \ - cugraph_experimental_uniform_neighbor_sample( + cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 8dfea32d821..e23a35396fb 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -37,7 +37,7 @@ from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, ) from pylibcugraph._cugraph_c.algorithms cimport ( - cugraph_experimental_uniform_neighbor_sample, + cugraph_uniform_neighbor_sample, cugraph_sample_result_t, cugraph_sample_result_get_sources, cugraph_sample_result_get_destinations, @@ -130,7 +130,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, len(h_fan_out), get_c_type_from_numpy_type(h_fan_out.dtype)) - error_code = cugraph_experimental_uniform_neighbor_sample( + error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, c_graph_ptr, start_ptr,