CAGRA: Separate graph index sorting functionality from prune function #1471

Merged May 10, 2023 (16 commits)
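In short, this PR splits the kNN-graph sorting step out of cagra::prune into a new cagra::sort_knn_graph function, and prune now operates on the graph alone (the dataset argument and the data-type template parameter are removed). The sketch below is adapted from the docstring example added in cagra.cuh; the concrete types (float/uint32_t), the graph degrees (128 and 64), and the external kNN builder call are illustrative assumptions, not part of this PR.

// Sketch: building a CAGRA index from an externally constructed kNN graph
// using the API after this PR. Types, graph degrees, and the external
// builder are illustrative assumptions.
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/neighbors/cagra.cuh>

using namespace raft::neighbors::experimental;

auto build_from_external_graph(raft::device_resources const& res,
                               raft::device_matrix_view<const float, uint32_t> dataset)
{
  cagra::index_params build_params;

  // kNN graph produced by some external builder. A graph produced by
  // cagra::build_knn_graph is already sorted and does not need sort_knn_graph.
  auto knn_graph = raft::make_host_matrix<uint32_t, uint32_t>(dataset.extent(0), 128);
  // external_knn_build(dataset, knn_graph.view());  // hypothetical builder

  // New in this PR: sorting the graph is an explicit, separate step.
  cagra::sort_knn_graph(res, dataset, knn_graph.view());

  // prune() now takes only the input and output graphs, not the dataset.
  auto pruned_graph = raft::make_host_matrix<uint32_t, uint32_t>(dataset.extent(0), 64);
  cagra::prune(res, knn_graph.view(), pruned_graph.view());

  // Construct the index from the dataset and the pruned graph.
  return cagra::index<float, uint32_t>(res, build_params.metric, dataset, pruned_graph.view());
}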
65 changes: 51 additions & 14 deletions cpp/include/raft/neighbors/cagra.cuh
@@ -52,8 +52,8 @@ namespace raft::neighbors::experimental::cagra {
* @code{.cpp}
* using namespace raft::neighbors;
* // use default index parameters
* ivf_pq::index_params build_params;
* ivf_pq::search_params search_params
* cagra::index_params build_params;
* cagra::search_params search_params
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // create knn graph
* cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
@@ -84,34 +84,71 @@ void build_knn_graph(raft::device_resources const& res,
detail::build_knn_graph(res, dataset, knn_graph, refine_rate, build_params, search_params);
}

/**
* @brief Sort a KNN graph index.
* Preprocessing step for `cagra::prune`: If a KNN graph is not built using
* `cagra::build_knn_graph`, then it is necessary to call this function before calling
* `cagra::prune`. If the graph is built by `cagra::build_knn_graph`, it is already sorted and you
* do not need to call this function.
*
* Usage example:
* @code{.cpp}
* using namespace raft::neighbors;
* cagra::index_params build_params;
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // build KNN graph not using `cagra::build_knn_graph`
* // build(knn_graph, dataset, ...);
* // sort graph index
* sort_knn_graph(res, dataset.view(), knn_graph.view());
* // prune graph
* cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view());
* // Construct an index from dataset and pruned knn_graph
* auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset, pruned_graph.view());
* @endcode
*
* @tparam DataT type of the data in the source dataset
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in,out] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
*/
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
detail::graph::sort_knn_graph(res, dataset, knn_graph);
}

/**
* @brief Prune a KNN graph.
*
* Decrease the number of neighbors for each node.
*
* See [cagra::build_knn_graph](#cagra::build_knn_graph) for usage example
*
* @tparam T data element type
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
* @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree]
*/
template <class DATA_T,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
detail::graph::prune(res, dataset, knn_graph, new_graph);
detail::graph::prune(res, knn_graph, new_graph);
}

/**
@@ -138,11 +175,11 @@ void prune(raft::device_resources const& res,
* // create and fill the index from a [N, D] dataset
* auto index = cagra::build(res, index_params, dataset);
* // use default search parameters
* ivf_pq::search_params search_params;
* cagra::search_params search_params;
* // search K nearest neighbours
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* ivf_pq::search(res, search_params, index, queries, neighbors, distances);
* cagra::search(res, search_params, index, queries, neighbors, distances);
* @endcode
*
* @tparam T data element type
@@ -178,7 +215,7 @@ index<T, IdxT> build(raft::device_resources const& res,

auto cagra_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), params.graph_degree);

prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());

// Construct an index from dataset and pruned knn graph.
return index<T, IdxT>(res, params.metric, dataset, cagra_graph.view());
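For existing callers, the build() hunk above shows the corresponding call-site update: the data-type template argument and the dataset argument are dropped. Restated outside the diff (knn_graph and cagra_graph are the local host matrices from build()):

// Before this PR:
//   prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
// After this PR:
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());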
131 changes: 84 additions & 47 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -405,36 +405,24 @@ void shift_array(T* array, uint64_t num)
}
}

/** Input arrays can be both host and device*/
template <class DATA_T,
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
"dataset size is expected to have the same number of graph index size");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const DataT* dataset_ptr = dataset.data_handle();

RAFT_EXPECTS(
dataset.extent(0) == knn_graph.extent(0) && knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
const DATA_T* dataset_ptr = dataset.data_handle();
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
float scale = 1.0f / raft::spatial::knn::detail::utils::config<DATA_T>::kDivisor;
const std::size_t graph_size = dataset_size;
size_t array_size;
const uint32_t input_graph_degree = knn_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();

// Setup GPUs
int num_gpus = 0;
@@ -451,46 +439,48 @@ void prune(raft::device_resources const& res,
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
const uint32_t graph_size = knn_graph.extent(0);
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
const uint32_t graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

uint32_t dataset_chunk_size = dataset_size;
DATA_T*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
DataT*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
const uint32_t dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
assert(dataset_chunk_size == graph_chunk_size);
d_dataset_ptr = mgpu_alloc<DATA_T>(num_gpus, dataset_chunk_size, dataset_dim);
d_dataset_ptr = mgpu_alloc<DataT>(num_gpus, dataset_chunk_size, dataset_dim);

mgpu_H2D<DATA_T>(
const float scale = 1.0f / raft::spatial::knn::detail::utils::config<DataT>::kDivisor;

mgpu_H2D<DataT>(
d_dataset_ptr, dataset_ptr, num_gpus, dataset_size, dataset_chunk_size, dataset_dim);

//
// Sorting kNN graph
//
double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");
mgpu_H2D<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_H2D<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
void (*kernel_sort)(
DATA_T**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
DataT**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
constexpr int numElementsPerThread = 4;
dim3 threads_sort(1, 1, 1);
if (input_graph_degree <= numElementsPerThread * 32) {
constexpr int blockDim_x = 32;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 64) {
constexpr int blockDim_x = 64;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 128) {
constexpr int blockDim_x = 128;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 256) {
constexpr int blockDim_x = 256;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else {
fprintf(stderr,
Expand All @@ -510,21 +500,68 @@ void prune(raft::device_resources const& res,
dataset_dim,
scale,
d_input_graph_ptr[i_gpu],
graph_size,
dataset_size,
graph_chunk_size,
input_graph_degree,
i_gpu);
}
RAFT_CUDA_TRY(cudaSetDevice(0));
RAFT_CUDA_TRY(cudaDeviceSynchronize());
RAFT_LOG_DEBUG(".");
mgpu_D2H<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_D2H<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
RAFT_LOG_DEBUG("\n");
double time_sort_end = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN graph time: %.1lf sec\n", time_sort_end - time_sort_start);

mgpu_free<DATA_T>(d_dataset_ptr, num_gpus);
mgpu_free<DataT>(d_dataset_ptr, num_gpus);
}

/** Input arrays can be both host and device*/
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
const std::size_t graph_size = new_graph.extent(0);
size_t array_size;

// Setup GPUs
int num_gpus = 0;

// Setup GPUs
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus));
RAFT_LOG_DEBUG("# num_gpus: %d\n", num_gpus);
for (int self = 0; self < num_gpus; self++) {
RAFT_CUDA_TRY(cudaSetDevice(self));
for (int peer = 0; peer < num_gpus; peer++) {
if (self == peer) { continue; }
RAFT_CUDA_TRY(cudaDeviceEnablePeerAccess(peer, 0));
}
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

//
uint8_t* detour_count; // [graph_size, input_graph_degree]
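Both sort_knn_graph and the new dataset-free prune shown above split their work across all visible GPUs, with per-GPU chunk sizes computed by the ceiling division (graph_size + num_gpus - 1) / num_gpus, so the last chunk may be only partially filled. A standalone sketch of that arithmetic (the row count below is illustrative):

// Illustrates the per-GPU chunking used in graph_core.cuh above.
#include <cstdio>

int main()
{
  const unsigned graph_size   = 1000003;  // illustrative number of graph rows
  const int      gpu_counts[] = {1, 2, 4, 8};
  for (int num_gpus : gpu_counts) {
    // Same ceiling division as in sort_knn_graph/prune: ceil(graph_size / num_gpus).
    const unsigned chunk = (graph_size + num_gpus - 1) / num_gpus;
    std::printf("num_gpus=%d -> graph_chunk_size=%u (total capacity %u rows)\n",
                num_gpus, chunk, chunk * static_cast<unsigned>(num_gpus));
  }
  return 0;
}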