From f1368d7e5d3d6c345135058339da1155cf81a3a0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 17:11:48 +0200 Subject: [PATCH 1/5] Use utils::mapping for scaling --- .../detail/cagra/compute_distance.hpp | 8 ++- .../neighbors/detail/cagra/device_common.hpp | 26 +-------- .../detail/cagra/search_multi_cta.cuh | 19 ++++--- .../detail/cagra/search_multi_kernel.cuh | 55 +++++++++++-------- .../detail/cagra/search_single_cta.cuh | 26 +++++---- .../raft/spatial/knn/detail/ann_utils.cuh | 22 ++++++-- 6 files changed, 81 insertions(+), 75 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp index 29c841c0b5..3b85c7c93b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include "device_common.hpp" #include "hashmap.hpp" #include "utils.hpp" @@ -51,7 +53,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( INDEX_T* const result_indices_ptr, // [num_pickup] DISTANCE_T* const result_distances_ptr, // [num_pickup] const float* const query_buffer, - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, const std::size_t num_pickup, @@ -102,7 +104,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( const uint32_t kv = k + v; // if (kv >= dataset_dim) break; DISTANCE_T diff = query_buffer[device::swizzling(kv)]; - diff -= static_cast(dl_buff[e].data[v]) * device::fragment_scale(); + diff -= spatial::knn::detail::utils::mapping{}(dl_buff[e].data[v]); norm2 += diff * diff; } } @@ -229,7 +231,7 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in const unsigned kv = k + v; diff = query_buffer[device::swizzling(kv)]; } - diff -= static_cast(dl_buff[e].data[v]) * device::fragment_scale(); + diff -= spatial::knn::detail::utils::mapping{}(dl_buff[e].data[v]); norm2 += diff * diff; } } diff --git a/cpp/include/raft/neighbors/detail/cagra/device_common.hpp b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp index 20f30d9f11..f9c81f3d25 100644 --- a/cpp/include/raft/neighbors/detail/cagra/device_common.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp @@ -27,30 +27,6 @@ namespace device { // warpSize for compile time calculation constexpr unsigned warp_size = 32; -// scaling factor for distance computation -template -_RAFT_HOST_DEVICE constexpr float fragment_scale(); -template <> -_RAFT_HOST_DEVICE constexpr float fragment_scale() -{ - return 1.0; -}; -template <> -_RAFT_HOST_DEVICE constexpr float fragment_scale() -{ - return 1.0; -}; -template <> -_RAFT_HOST_DEVICE constexpr float fragment_scale() -{ - return 1.0 / 256.0; -}; -template <> -_RAFT_HOST_DEVICE constexpr float fragment_scale() -{ - return 1.0 / 128.0; -}; - /** Xorshift rondem number generator. * * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference. @@ -73,4 +49,4 @@ _RAFT_DEVICE inline T swizzling(T x) } } // namespace device -} // namespace raft::neighbors::experimental::cagra::detail \ No newline at end of file +} // namespace raft::neighbors::experimental::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh index 6148441bd0..909e8198c2 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh @@ -14,6 +14,9 @@ * limitations under the License. */ #pragma once + +#include + #include #include #include @@ -43,7 +46,7 @@ namespace multi_cta_search { template __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [num_parents] const uint32_t num_parents, - INDEX_T* const itopk_indices, // [num_itopk] + INDEX_T* const itopk_indices, // [num_itopk] const size_t num_itopk, uint32_t* const terminate_flag) { @@ -80,8 +83,8 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [num } template -__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] - uint32_t* indices, // [num_elements] +__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] + uint32_t* indices, // [num_elements] const uint32_t num_elements, const uint32_t num_itopk // num_itopk <= num_elements ) @@ -137,7 +140,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel( const uint32_t graph_degree, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, uint32_t* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] const uint32_t hash_bitlen, @@ -204,7 +207,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel( for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) { unsigned j = device::swizzling(i); if (i < dataset_dim) { - query_buffer[j] = static_cast(query_ptr[i]) * device::fragment_scale(); + query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); } else { query_buffer[j] = 0.0; } @@ -561,9 +564,9 @@ struct search : public search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] uint32_t* const num_executed_iterations, // [num_queries,] diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index 629bed2aee..374109d08d 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -14,6 +14,9 @@ * limitations under the License. */ #pragma once + +#include + #include #include #include @@ -93,7 +96,7 @@ __global__ void random_pickup_kernel( const std::size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, INDEX_T* const result_indices_ptr, // [num_queries, ldr] DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] @@ -124,10 +127,12 @@ __global__ void random_pickup_kernel( random_data_frag, dataset_ptr + (dataset_dim * seed_index), dataset_dim); // Compute the norm of two data - const auto norm2 = - device::norm2(query_frag, random_data_frag, device::fragment_scale() - /*, scale*/ - ); + const auto norm2 = device::norm2( + query_frag, + random_data_frag, + static_cast(1.0 / spatial::knn::detail::utils::config::kDivisor) + /*, scale*/ + ); if (norm2 < best_norm2_team_local) { best_norm2_team_local = norm2; @@ -162,7 +167,7 @@ void random_pickup(const DATA_T* const dataset_ptr, // [dataset_size, dataset_d const std::size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, INDEX_T* const result_indices_ptr, // [num_queries, ldr] DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] @@ -300,17 +305,17 @@ template = num_parents * graph_degree + INDEX_T* const result_indices_ptr, // [num_queries, ldd] + DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] + const std::uint32_t ldd // (*) ldd >= num_parents * graph_degree ) { const uint32_t ldb = hashmap::get_size(hash_bitlen); @@ -335,8 +340,10 @@ __global__ void compute_distance_to_child_nodes_kernel( device::fragment frag_query; device::load_vector_sync(frag_query, query_ptr + blockIdx.y * data_dim, data_dim); - const auto norm2 = - device::norm2(frag_target, frag_query, device::fragment_scale()); + const auto norm2 = device::norm2( + frag_target, + frag_query, + static_cast(1.0 / spatial::knn::detail::utils::config::kDivisor)); if (threadIdx.x % TEAM_SIZE == 0) { result_indices_ptr[ldd * blockIdx.y + global_team_id] = child_id; @@ -357,18 +364,18 @@ template = num_parents * graph_degree + INDEX_T* const result_indices_ptr, // [num_queries, ldd] + DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] + const std::uint32_t ldd, // (*) ldd >= num_parents * graph_degree cudaStream_t cuda_stream = 0) { const auto block_size = 128; @@ -419,7 +426,7 @@ void remove_parent_bit(const std::uint32_t num_queries, } template -__global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_dst] +__global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_dst] const uint64_t ld_dst, const T* const src, // [batch_size, ld_src] const uint64_t ld_src, @@ -434,7 +441,7 @@ __global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_ds } template -void batched_memcpy(T* const dst, // [batch_size, ld_dst] +void batched_memcpy(T* const dst, // [batch_size, ld_dst] const uint64_t ld_dst, const T* const src, // [batch_size, ld_src] const uint64_t ld_src, @@ -578,9 +585,9 @@ struct search : search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] uint32_t* const num_executed_iterations, // [num_queries,] diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh index fc87b952b0..d9bf369496 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh @@ -14,6 +14,9 @@ * limitations under the License. */ #pragma once + +#include + #include #include #include @@ -89,7 +92,8 @@ struct topk_by_radix_sort_base { static constexpr std::uint32_t vecLen = 2; // TODO }; template -struct topk_by_radix_sort : topk_by_radix_sort_base {}; +struct topk_by_radix_sort : topk_by_radix_sort_base { +}; template struct topk_by_radix_sort __device__ inline void topk_by_bitonic_sort_2nd( - float* itopk_distances, // [num_itopk] - std::uint32_t* itopk_indices, // [num_itopk] + float* itopk_distances, // [num_itopk] + std::uint32_t* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, float* candidate_distances, // [num_candidates] std::uint32_t* candidate_indices, // [num_candidates] @@ -464,8 +468,8 @@ template -__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] - std::uint32_t* itopk_indices, // [num_itopk] +__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] + std::uint32_t* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, float* candidate_distances, // [num_candidates] std::uint32_t* candidate_indices, // [num_candidates] @@ -526,7 +530,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(INDEX_T* const result_indices_ptr, // [num_queries, top_k] DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -534,7 +538,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ const std::uint32_t graph_degree, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, std::uint32_t* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] const std::uint32_t internal_topk, @@ -592,7 +596,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) { unsigned j = device::swizzling(i); if (i < dataset_dim) { - query_buffer[j] = static_cast(query_ptr[i]) * device::fragment_scale(); + query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); } else { query_buffer[j] = 0.0; } @@ -1109,9 +1113,9 @@ struct search : search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const result_indices_ptr, // [num_queries, topk] - DISTANCE_T* const result_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const result_indices_ptr, // [num_queries, topk] + DISTANCE_T* const result_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] std::uint32_t* const num_executed_iterations, // [num_queries] diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index dd291251b4..2212465941 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -29,6 +29,8 @@ #include #include +#include + namespace raft::spatial::knn::detail::utils { /** Whether pointers are accessible on the device or on the host. */ @@ -44,7 +46,8 @@ enum class pointer_residency { }; template -struct pointer_residency_count {}; +struct pointer_residency_count { +}; template <> struct pointer_residency_count<> { @@ -134,14 +137,25 @@ struct with_mapped_memory_t { }; template -struct config {}; +struct config { +}; +template <> +struct config { + using value_t = double; + static constexpr double kDivisor = 1.0; +}; template <> struct config { using value_t = float; static constexpr double kDivisor = 1.0; }; template <> +struct config { + using value_t = half; + static constexpr double kDivisor = 1.0; +}; +template <> struct config { using value_t = uint32_t; static constexpr double kDivisor = 256.0; @@ -169,13 +183,13 @@ struct mapping { * @{ */ template - HDI auto operator()(const S& x) const -> std::enable_if_t, T> + HDI constexpr auto operator()(const S& x) const -> std::enable_if_t, T> { return x; }; template - HDI auto operator()(const S& x) const -> std::enable_if_t, T> + HDI constexpr auto operator()(const S& x) const -> std::enable_if_t, T> { constexpr double kMult = config::kDivisor / config::kDivisor; if constexpr (std::is_floating_point_v) { return static_cast(x * static_cast(kMult)); } From a8eec2a506d4836da275ad4a52949334e2b73317 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 17:42:30 +0200 Subject: [PATCH 2/5] Add tests for int8/uint8 data types --- cpp/test/CMakeLists.txt | 2 ++ cpp/test/neighbors/ann_cagra.cuh | 8 ++++-- .../ann_cagra/test_int8_t_uint32_t.cu | 28 +++++++++++++++++++ .../ann_cagra/test_uint8_t_uint32_t.cu | 28 +++++++++++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu create mode 100644 cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 7f45a6dd22..88ad7772c2 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -314,6 +314,8 @@ if(BUILD_TESTS) NEIGHBORS_TEST PATH test/neighbors/ann_cagra/test_float_uint32_t.cu + test/neighbors/ann_cagra/test_int8_t_uint32_t.cu + test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu test/neighbors/ann_ivf_flat/test_float_int64_t.cu test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 8b8aa21fc9..ff8a896bab 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -82,6 +82,10 @@ class AnnCagraTest : public ::testing::TestWithParam { protected: void testCagra() { + if (ps.dim * sizeof(DataT) % 8 != 0) { + GTEST_SKIP() + << "CAGRA requires the input data rows to be aligned at least to 8 bytes for now."; + } size_t queries_size = ps.n_queries * ps.k; std::vector indices_Cagra(queries_size); std::vector indices_naive(queries_size); @@ -221,7 +225,7 @@ inline std::vector generate_inputs() {100}, {1000}, {8}, - {1, 16, 33}, // k + {1, 16, 33}, // k {search_algo::SINGLE_CTA, search_algo::MULTI_KERNEL}, {1, 10, 100}, // query size {0}, @@ -310,4 +314,4 @@ inline std::vector generate_inputs() const std::vector inputs = generate_inputs(); -} // namespace raft::neighbors::experimental::cagra \ No newline at end of file +} // namespace raft::neighbors::experimental::cagra diff --git a/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu new file mode 100644 index 0000000000..f148ebc186 --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/test_int8_t_uint32_t.cu @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../ann_cagra.cuh" + +namespace raft::neighbors::experimental::cagra { + +typedef AnnCagraTest AnnCagraTestI8; +TEST_P(AnnCagraTestI8, AnnCagra) { this->testCagra(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestI8, ::testing::ValuesIn(inputs)); + +} // namespace raft::neighbors::experimental::cagra diff --git a/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu new file mode 100644 index 0000000000..087d7cec71 --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../ann_cagra.cuh" + +namespace raft::neighbors::experimental::cagra { + +typedef AnnCagraTest AnnCagraTestU8; +TEST_P(AnnCagraTestU8, AnnCagra) { this->testCagra(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestU8, ::testing::ValuesIn(inputs)); + +} // namespace raft::neighbors::experimental::cagra From 43b6d2b7714cb89cb4ed8d1fa9eda4532d025cc3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 17:43:40 +0200 Subject: [PATCH 3/5] Scale the output distances in a post-processing step --- .../neighbors/detail/cagra/cagra_search.cuh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 79cbb6198f..58ff036c93 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include #include @@ -94,6 +97,22 @@ void search_main(raft::device_resources const& res, _num_executed_iterations, topk); } + + static_assert(std::is_same_v, + "only float distances are supported at the moment"); + float* dist_out = distances.data_handle(); + const DistanceT* dist_in = distances.data_handle(); + // We're converting the data from T to DistanceT during distance computation; + // hence now we convert it back (DistanceT -> T). + constexpr float kScale = spatial::knn::detail::utils::config::kDivisor / + spatial::knn::detail::utils::config::kDivisor; + ivf_pq::detail::postprocess_distances(dist_out, + dist_in, + index.metric(), + distances.extent(0), + distances.extent(1), + kScale, + res.get_stream()); } /** @} */ // end group cagra From f1093849bab07c8e893b2a76bfad195c23c7074f Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 May 2023 17:58:41 +0200 Subject: [PATCH 4/5] Run clang-format via the updated pre-commit hook --- .../detail/cagra/compute_distance.hpp | 2 +- .../detail/cagra/search_multi_cta.cuh | 14 ++++---- .../detail/cagra/search_multi_kernel.cuh | 36 +++++++++---------- .../detail/cagra/search_single_cta.cuh | 21 ++++++----- .../raft/spatial/knn/detail/ann_utils.cuh | 6 ++-- cpp/test/neighbors/ann_cagra.cuh | 2 +- 6 files changed, 39 insertions(+), 42 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp index 3b85c7c93b..52e5c62169 100644 --- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp @@ -53,7 +53,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( INDEX_T* const result_indices_ptr, // [num_pickup] DISTANCE_T* const result_distances_ptr, // [num_pickup] const float* const query_buffer, - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, const std::size_t num_pickup, diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh index 909e8198c2..99553632ac 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh @@ -46,7 +46,7 @@ namespace multi_cta_search { template __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [num_parents] const uint32_t num_parents, - INDEX_T* const itopk_indices, // [num_itopk] + INDEX_T* const itopk_indices, // [num_itopk] const size_t num_itopk, uint32_t* const terminate_flag) { @@ -83,8 +83,8 @@ __device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [num } template -__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] - uint32_t* indices, // [num_elements] +__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] + uint32_t* indices, // [num_elements] const uint32_t num_elements, const uint32_t num_itopk // num_itopk <= num_elements ) @@ -140,7 +140,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel( const uint32_t graph_degree, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, uint32_t* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] const uint32_t hash_bitlen, @@ -564,9 +564,9 @@ struct search : public search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] uint32_t* const num_executed_iterations, // [num_queries,] diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index 374109d08d..e3e9c8a655 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -96,7 +96,7 @@ __global__ void random_pickup_kernel( const std::size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, INDEX_T* const result_indices_ptr, // [num_queries, ldr] DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] @@ -167,7 +167,7 @@ void random_pickup(const DATA_T* const dataset_ptr, // [dataset_size, dataset_d const std::size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, INDEX_T* const result_indices_ptr, // [num_queries, ldr] DISTANCE_T* const result_distances_ptr, // [num_queries, ldr] @@ -305,17 +305,17 @@ template = num_parents * graph_degree + INDEX_T* const result_indices_ptr, // [num_queries, ldd] + DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] + const std::uint32_t ldd // (*) ldd >= num_parents * graph_degree ) { const uint32_t ldb = hashmap::get_size(hash_bitlen); @@ -364,18 +364,18 @@ template = num_parents * graph_degree + INDEX_T* const result_indices_ptr, // [num_queries, ldd] + DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] + const std::uint32_t ldd, // (*) ldd >= num_parents * graph_degree cudaStream_t cuda_stream = 0) { const auto block_size = 128; @@ -426,7 +426,7 @@ void remove_parent_bit(const std::uint32_t num_queries, } template -__global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_dst] +__global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_dst] const uint64_t ld_dst, const T* const src, // [batch_size, ld_src] const uint64_t ld_src, @@ -441,7 +441,7 @@ __global__ void batched_memcpy_kernel(T* const dst, // [batch_size, ld_dst] } template -void batched_memcpy(T* const dst, // [batch_size, ld_dst] +void batched_memcpy(T* const dst, // [batch_size, ld_dst] const uint64_t ld_dst, const T* const src, // [batch_size, ld_src] const uint64_t ld_src, @@ -585,9 +585,9 @@ struct search : search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const topk_indices_ptr, // [num_queries, topk] - DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] uint32_t* const num_executed_iterations, // [num_queries,] diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh index d9bf369496..531b30ba85 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh @@ -92,8 +92,7 @@ struct topk_by_radix_sort_base { static constexpr std::uint32_t vecLen = 2; // TODO }; template -struct topk_by_radix_sort : topk_by_radix_sort_base { -}; +struct topk_by_radix_sort : topk_by_radix_sort_base {}; template struct topk_by_radix_sort __device__ inline void topk_by_bitonic_sort_2nd( - float* itopk_distances, // [num_itopk] - std::uint32_t* itopk_indices, // [num_itopk] + float* itopk_distances, // [num_itopk] + std::uint32_t* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, float* candidate_distances, // [num_candidates] std::uint32_t* candidate_indices, // [num_candidates] @@ -468,8 +467,8 @@ template -__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] - std::uint32_t* itopk_indices, // [num_itopk] +__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] + std::uint32_t* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, float* candidate_distances, // [num_candidates] std::uint32_t* candidate_indices, // [num_candidates] @@ -530,7 +529,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(INDEX_T* const result_indices_ptr, // [num_queries, top_k] DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] const std::uint32_t top_k, - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -538,7 +537,7 @@ __launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ const std::uint32_t graph_degree, const unsigned num_distilation, const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const INDEX_T* seed_ptr, // [num_queries, num_seeds] const uint32_t num_seeds, std::uint32_t* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] const std::uint32_t internal_topk, @@ -1113,9 +1112,9 @@ struct search : search_plan_impl { void operator()(raft::device_resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, - INDEX_T* const result_indices_ptr, // [num_queries, topk] - DISTANCE_T* const result_distances_ptr, // [num_queries, topk] - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + INDEX_T* const result_indices_ptr, // [num_queries, topk] + DISTANCE_T* const result_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::uint32_t num_queries, const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] std::uint32_t* const num_executed_iterations, // [num_queries] diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 2212465941..850b741dfd 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -46,8 +46,7 @@ enum class pointer_residency { }; template -struct pointer_residency_count { -}; +struct pointer_residency_count {}; template <> struct pointer_residency_count<> { @@ -137,8 +136,7 @@ struct with_mapped_memory_t { }; template -struct config { -}; +struct config {}; template <> struct config { diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index ff8a896bab..f9df1f724f 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -225,7 +225,7 @@ inline std::vector generate_inputs() {100}, {1000}, {8}, - {1, 16, 33}, // k + {1, 16, 33}, // k {search_algo::SINGLE_CTA, search_algo::MULTI_KERNEL}, {1, 10, 100}, // query size {0}, From 92302bed3995f376b75225d11889b8bbfc32d24e Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Tue, 9 May 2023 19:54:19 +0200 Subject: [PATCH 5/5] Update cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh Co-authored-by: Tamas Bela Feher --- cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 58ff036c93..5902d1405f 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -102,8 +102,8 @@ void search_main(raft::device_resources const& res, "only float distances are supported at the moment"); float* dist_out = distances.data_handle(); const DistanceT* dist_in = distances.data_handle(); - // We're converting the data from T to DistanceT during distance computation; - // hence now we convert it back (DistanceT -> T). + // We're converting the data from T to DistanceT during distance computation + // and divide the values by kDivisor. Here we restore the original scale. constexpr float kScale = spatial::knn::detail::utils::config::kDivisor / spatial::knn::detail::utils::config::kDivisor; ivf_pq::detail::postprocess_distances(dist_out,