CAGRA: Separate graph index sorting functionality from prune function #1471

Merged May 10, 2023 (16 commits)
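In short, this PR splits the kNN-graph sorting step out of cagra::prune into a new cagra::sort_knn_graph function, and prune now operates on the graph alone (the dataset argument and the data-type template parameter are removed). The sketch below is adapted from the docstring example added in cagra.cuh; the concrete types (float/uint32_t), the graph degrees (128 and 64), and the external kNN builder call are illustrative assumptions, not part of this PR.

// Sketch: building a CAGRA index from an externally constructed kNN graph
// using the API after this PR. Types, graph degrees, and the external
// builder are illustrative assumptions.
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>
#include <raft/neighbors/cagra.cuh>

using namespace raft::neighbors::experimental;

auto build_from_external_graph(raft::device_resources const& res,
                               raft::device_matrix_view<const float, uint32_t> dataset)
{
  cagra::index_params build_params;

  // kNN graph produced by some external builder. A graph produced by
  // cagra::build_knn_graph is already sorted and does not need sort_knn_graph.
  auto knn_graph = raft::make_host_matrix<uint32_t, uint32_t>(dataset.extent(0), 128);
  // external_knn_build(dataset, knn_graph.view());  // hypothetical builder

  // New in this PR: sorting the graph is an explicit, separate step.
  cagra::sort_knn_graph(res, dataset, knn_graph.view());

  // prune() now takes only the input and output graphs, not the dataset.
  auto pruned_graph = raft::make_host_matrix<uint32_t, uint32_t>(dataset.extent(0), 64);
  cagra::prune(res, knn_graph.view(), pruned_graph.view());

  // Construct the index from the dataset and the pruned graph.
  return cagra::index<float, uint32_t>(res, build_params.metric, dataset, pruned_graph.view());
}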
65 changes: 51 additions & 14 deletions cpp/include/raft/neighbors/cagra.cuh
@@ -52,8 +52,8 @@ namespace raft::neighbors::experimental::cagra {
* @code{.cpp}
* using namespace raft::neighbors;
* // use default index parameters
* ivf_pq::index_params build_params;
* ivf_pq::search_params search_params
* cagra::index_params build_params;
* cagra::search_params search_params
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // create knn graph
* cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
@@ -84,34 +84,71 @@ void build_knn_graph(raft::device_resources const& res,
detail::build_knn_graph(res, dataset, knn_graph, refine_rate, build_params, search_params);
}

/**
* @brief Sort a KNN graph index.
* Preprocessing step for `cagra::prune`: If a KNN graph is not built using
* `cagra::build_knn_graph`, then it is necessary to call this function before calling
* `cagra::prune`. If the graph is built by `cagra::build_knn_graph`, it is already sorted and you
* do not need to call this function.
*
* Usage example:
* @code{.cpp}
* using namespace raft::neighbors;
* cagra::index_params build_params;
* auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
* // build KNN graph not using `cagra::build_knn_graph`
* // build(knn_graph, dataset, ...);
* // sort graph index
* sort_knn_graph(res, dataset.view(), knn_graph.view());
* // prune graph
* cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view());
* // Construct an index from dataset and pruned knn_graph
* auto index = cagra::index<T, IdxT>(res, build_params.metric(), dataset, pruned_graph.view());
* @endcode
*
* @tparam DataT type of the data in the source dataset
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in,out] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
*/
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
detail::graph::sort_knn_graph(res, dataset, knn_graph);
}

/**
* @brief Prune a KNN graph.
*
* Decrease the number of neighbors for each node.
*
* See [cagra::build_knn_graph](#cagra::build_knn_graph) for usage example
*
* @tparam T data element type
* @tparam IdxT type of the indices in the source dataset
*
* @param[in] res raft resources
* @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
* @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
* knn_graph_degree]
* @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree]
*/
template <class DATA_T,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
detail::graph::prune(res, dataset, knn_graph, new_graph);
detail::graph::prune(res, knn_graph, new_graph);
}

/**
@@ -138,11 +175,11 @@ void prune(raft::device_resources const& res,
* // create and fill the index from a [N, D] dataset
* auto index = cagra::build(res, index_params, dataset);
* // use default search parameters
* ivf_pq::search_params search_params;
* cagra::search_params search_params;
* // search K nearest neighbours
* auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
* auto distances = raft::make_device_matrix<float>(res, n_queries, k);
* ivf_pq::search(res, search_params, index, queries, neighbors, distances);
* cagra::search(res, search_params, index, queries, neighbors, distances);
* @endcode
*
* @tparam T data element type
@@ -178,7 +215,7 @@ index<T, IdxT> build(raft::device_resources const& res,

auto cagra_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), params.graph_degree);

prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());

// Construct an index from dataset and pruned knn graph.
return index<T, IdxT>(res, params.metric, dataset, cagra_graph.view());
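For existing callers, the build() hunk above shows the corresponding call-site update: the data-type template argument and the dataset argument are dropped. Restated outside the diff (knn_graph and cagra_graph are the local host matrices from build()):

// Before this PR:
//   prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
// After this PR:
prune<IdxT>(res, knn_graph.view(), cagra_graph.view());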
131 changes: 84 additions & 47 deletions cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -405,36 +405,24 @@ void shift_array(T* array, uint64_t num)
}
}

/** Input arrays can be both host and device*/
template <class DATA_T,
template <typename DataT,
typename IdxT = uint32_t,
typename d_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
host_device_accessor<std::experimental::default_accessor<DataT>, memory_type::device>,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void sort_knn_graph(raft::device_resources const& res,
mdspan<const DataT, matrix_extent<IdxT>, row_major, d_accessor> dataset,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
"dataset size is expected to have the same number of graph index size");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const DataT* dataset_ptr = dataset.data_handle();

RAFT_EXPECTS(
dataset.extent(0) == knn_graph.extent(0) && knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t dataset_size = dataset.extent(0);
const uint32_t dataset_dim = dataset.extent(1);
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
const DATA_T* dataset_ptr = dataset.data_handle();
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
float scale = 1.0f / raft::spatial::knn::detail::utils::config<DATA_T>::kDivisor;
const std::size_t graph_size = dataset_size;
size_t array_size;
const uint32_t input_graph_degree = knn_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();

// Setup GPUs
int num_gpus = 0;
@@ -451,46 +439,48 @@ void prune(raft::device_resources const& res,
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
const uint32_t graph_size = knn_graph.extent(0);
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
const uint32_t graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

uint32_t dataset_chunk_size = dataset_size;
DATA_T*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
DataT*** d_dataset_ptr = NULL; // [num_gpus+1][...][...]
const uint32_t dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
assert(dataset_chunk_size == graph_chunk_size);
d_dataset_ptr = mgpu_alloc<DATA_T>(num_gpus, dataset_chunk_size, dataset_dim);
d_dataset_ptr = mgpu_alloc<DataT>(num_gpus, dataset_chunk_size, dataset_dim);

mgpu_H2D<DATA_T>(
const float scale = 1.0f / raft::spatial::knn::detail::utils::config<DataT>::kDivisor;

mgpu_H2D<DataT>(
d_dataset_ptr, dataset_ptr, num_gpus, dataset_size, dataset_chunk_size, dataset_dim);

//
// Sorting kNN graph
//
double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");
mgpu_H2D<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_H2D<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
void (*kernel_sort)(
DATA_T**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
DataT**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
constexpr int numElementsPerThread = 4;
dim3 threads_sort(1, 1, 1);
if (input_graph_degree <= numElementsPerThread * 32) {
constexpr int blockDim_x = 32;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 64) {
constexpr int blockDim_x = 64;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 128) {
constexpr int blockDim_x = 128;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else if (input_graph_degree <= numElementsPerThread * 256) {
constexpr int blockDim_x = 256;
kernel_sort = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
kernel_sort = kern_sort<DataT, blockDim_x, numElementsPerThread>;
threads_sort.x = blockDim_x;
} else {
fprintf(stderr,
Expand All @@ -510,21 +500,68 @@ void prune(raft::device_resources const& res,
dataset_dim,
scale,
d_input_graph_ptr[i_gpu],
graph_size,
dataset_size,
graph_chunk_size,
input_graph_degree,
i_gpu);
}
RAFT_CUDA_TRY(cudaSetDevice(0));
RAFT_CUDA_TRY(cudaDeviceSynchronize());
RAFT_LOG_DEBUG(".");
mgpu_D2H<uint32_t>(
d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
mgpu_D2H<uint32_t>(d_input_graph_ptr,
input_graph_ptr,
num_gpus,
dataset_size,
graph_chunk_size,
input_graph_degree);
RAFT_LOG_DEBUG("\n");
double time_sort_end = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN graph time: %.1lf sec\n", time_sort_end - time_sort_start);

mgpu_free<DATA_T>(d_dataset_ptr, num_gpus);
mgpu_free<DataT>(d_dataset_ptr, num_gpus);
}

/** Input arrays can be both host and device*/
template <typename IdxT = uint32_t,
typename g_accessor =
host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
void prune(raft::device_resources const& res,
mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
"output graph cannot have more columns than input graph");
const uint32_t input_graph_degree = knn_graph.extent(1);
const uint32_t output_graph_degree = new_graph.extent(1);
uint32_t* input_graph_ptr = (uint32_t*)knn_graph.data_handle();
uint32_t* output_graph_ptr = new_graph.data_handle();
const std::size_t graph_size = new_graph.extent(0);
size_t array_size;

// Setup GPUs
int num_gpus = 0;

// Setup GPUs
RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus));
RAFT_LOG_DEBUG("# num_gpus: %d\n", num_gpus);
for (int self = 0; self < num_gpus; self++) {
RAFT_CUDA_TRY(cudaSetDevice(self));
for (int peer = 0; peer < num_gpus; peer++) {
if (self == peer) { continue; }
RAFT_CUDA_TRY(cudaDeviceEnablePeerAccess(peer, 0));
}
}
RAFT_CUDA_TRY(cudaSetDevice(0));

uint32_t graph_chunk_size = graph_size;
uint32_t*** d_input_graph_ptr = NULL; // [...][num_gpus][graph_chunk_size, input_graph_degree]
graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
d_input_graph_ptr = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);

//
uint8_t* detour_count; // [graph_size, input_graph_degree]
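Both sort_knn_graph and the new dataset-free prune shown above split their work across all visible GPUs, with per-GPU chunk sizes computed by the ceiling division (graph_size + num_gpus - 1) / num_gpus, so the last chunk may be only partially filled. A standalone sketch of that arithmetic (the row count below is illustrative):

// Illustrates the per-GPU chunking used in graph_core.cuh above.
#include <cstdio>

int main()
{
  const unsigned graph_size   = 1000003;  // illustrative number of graph rows
  const int      gpu_counts[] = {1, 2, 4, 8};
  for (int num_gpus : gpu_counts) {
    // Same ceiling division as in sort_knn_graph/prune: ceil(graph_size / num_gpus).
    const unsigned chunk = (graph_size + num_gpus - 1) / num_gpus;
    std::printf("num_gpus=%d -> graph_chunk_size=%u (total capacity %u rows)\n",
                num_gpus, chunk, chunk * static_cast<unsigned>(num_gpus));
  }
  return 0;
}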