diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50ee1a0ce2..5606ff85f0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -458,13 +458,6 @@ if(BUILD_SHARED_LIBS) src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu src/neighbors/ivf_pq/detail/ivf_pq_contiguous_list_data.cu src/neighbors/ivf_pq/detail/ivf_pq_process_and_fill_codes.cu src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 2e91f3c15f..70da22a09d 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -485,13 +485,13 @@ namespace filtering { enum class FilterType { None, Bitmap, Bitset }; struct base_filter { - ~base_filter() = default; + virtual ~base_filter() = default; virtual FilterType get_filter_type() const = 0; }; /* A filter that filters nothing. This is the default behavior. */ struct none_sample_filter : public base_filter { - inline _RAFT_HOST_DEVICE bool operator()( + constexpr __forceinline__ _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the current inverted list index @@ -499,7 +499,7 @@ struct none_sample_filter : public base_filter { // the index of the current sample inside the current inverted list const uint32_t sample_ix) const; - inline _RAFT_HOST_DEVICE bool operator()( + constexpr __forceinline__ _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the index of the current sample @@ -517,7 +517,7 @@ struct none_sample_filter : public base_filter { * @tparam filter_t */ template -struct ivf_to_sample_filter { +struct ivf_to_sample_filter : public base_filter { const index_t* const* inds_ptrs_; const filter_t next_filter_; @@ -534,6 +534,8 @@ struct ivf_to_sample_filter { const uint32_t cluster_ix, // the index of the current sample inside the current inverted list const uint32_t sample_ix) const; + + FilterType get_filter_type() const override { return next_filter_.get_filter_type(); } }; /** @@ -577,8 +579,8 @@ struct bitset_filter : public base_filter { // View of the bitset to use as a filter const view_t bitset_view_; - bitset_filter(const view_t bitset_for_filtering); - inline _RAFT_HOST_DEVICE bool operator()( + _RAFT_HOST_DEVICE bitset_filter(const view_t bitset_for_filtering); + constexpr __forceinline__ _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the index of the current sample diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu index 33e7067a70..a5c726b6ee 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,43 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - float, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(float, float); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu deleted file mode 100644 index 8c28e2c5b9..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - float, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu index 4cb3424cb5..b65b26fc8c 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,44 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + float, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu deleted file mode 100644 index e3f330dccb..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu index 75f8f2cf8a..30f7eaa30c 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,44 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + float, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu deleted file mode 100644 index 3d0c610dd2..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu index 9a288a1c7c..1545374419 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,43 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(float, half); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu deleted file mode 100644 index 07af7815e2..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu index 8886b01bef..00707e5ecc 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,44 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + half, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu deleted file mode 100644 index a921545f80..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu index 05241135b5..7de8719957 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,44 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + half, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu deleted file mode 100644 index 2ebd3d5270..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu index 95a6f0bedd..94406c844a 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,50 +24,43 @@ #include "../ivf_pq_compute_similarity_impl.cuh" #include "../ivf_pq_fp_8bit.cuh" -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + \ + template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + cuvs::distance::DistanceType metric, \ + cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + const cuvs::neighbors::filtering::base_filter& sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(half, half); diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu deleted file mode 100644 index a0be0b392f..0000000000 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_pq_compute_similarity.py - * Make changes there and run in this directory: - * > python generate_ivf_pq_compute_similarity.py - */ - -#include "../../sample_filter.cuh" -#include "../ivf_pq_compute_similarity_impl.cuh" -#include "../ivf_pq_fp_8bit.cuh" - -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - cuvs::distance::DistanceType metric, \ - cuvs::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - IvfSampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ - uint32_t* _out_indices); - -#define COMMA , -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity.cuh index 235f0aacb3..6f9d97dbf1 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,8 @@ #include "../sample_filter.cuh" // none_sample_filter #include "ivf_pq_fp_8bit.cuh" // cuvs::neighbors::ivf_pq::detail::fp_8bit -#include // cuvs::distance::DistanceType +#include // cuvs::distance::DistanceType +#include #include // cuvs::neighbors::ivf_pq::codebook_gen #include // RAFT_WEAK_FUNCTION #include // rmm::cuda_stream_view @@ -36,7 +37,6 @@ auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, ui template +template using compute_similarity_kernel_t = - decltype(&compute_similarity_kernel); + decltype(&compute_similarity_kernel); -template +template struct selected { - compute_similarity_kernel_t kernel; + compute_similarity_kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; size_t device_lut_size; }; -template -void compute_similarity_run(selected s, +template +void compute_similarity_run(selected s, rmm::cuda_stream_view stream, uint32_t dim, uint32_t n_probes, @@ -97,7 +97,7 @@ void compute_similarity_run(selected s, const float* queries, const uint32_t* index_list, float* query_kths, - IvfSampleFilterT sample_filter, + const filtering::base_filter& sample_filter, LutT* lut_scores, OutT* _out_scores, uint32_t* _out_indices); @@ -116,7 +116,7 @@ void compute_similarity_run(selected s, * beyond this limit do not consider increasing the number of active blocks per SM * would improve locality anymore. */ -template +template auto compute_similarity_select(const cudaDeviceProp& dev_props, bool manage_local_topk, int locality_hint, @@ -126,14 +126,12 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t precomp_data_count, uint32_t n_queries, uint32_t n_probes, - uint32_t topk) -> selected; + uint32_t topk) -> selected; } // namespace cuvs::neighbors::ivf_pq::detail -#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, IvfSampleFilterT) \ - extern template auto \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ +#define instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \ + extern template auto cuvs::neighbors::ivf_pq::detail::compute_similarity_select( \ const cudaDeviceProp& dev_props, \ bool manage_local_topk, \ int locality_hint, \ @@ -143,11 +141,10 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t precomp_data_count, \ uint32_t n_queries, \ uint32_t n_probes, \ - uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ + uint32_t topk) -> cuvs::neighbors::ivf_pq::detail::selected; \ \ - extern template void \ - cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ - cuvs::neighbors::ivf_pq::detail::selected s, \ + extern template void cuvs::neighbors::ivf_pq::detail::compute_similarity_run( \ + cuvs::neighbors::ivf_pq::detail::selected s, \ rmm::cuda_stream_view stream, \ uint32_t dim, \ uint32_t n_probes, \ @@ -166,82 +163,23 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, const float* queries, \ const uint32_t* index_list, \ float* query_kths, \ - IvfSampleFilterT sample_filter, \ + const filtering::base_filter& sample_filter, \ LutT* lut_scores, \ OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + half, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>); instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + half, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(half, half); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(float, half); +instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select(float, float); instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); + float, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>); instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - float, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::none_sample_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - half, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - half, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - float, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); -instantiate_cuvs_neighbors_ivf_pq_detail_compute_similarity_select( - float, - cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - cuvs::neighbors::filtering::ivf_to_sample_filter< - int64_t COMMA cuvs::neighbors::filtering::bitset_filter>); + float, cuvs::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>); #undef COMMA diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh index fbbdd06c24..2bb5b342e8 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,10 @@ #pragma once -#include "../ivf_common.cuh" // dummy_block_sort_t -#include "../sample_filter.cuh" // none_sample_filter -#include // cuvs::distance::DistanceType +#include "../ivf_common.cuh" // dummy_block_sort_t +#include "../sample_filter.cuh" // none_sample_filter +#include // cuvs::distance::DistanceType +#include #include // codebook_gen #include // matrix::detail::select::warpsort::warp_sort_distributed #include // RAFT_CUDA_TRY @@ -247,8 +248,7 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, * query_kths keep the current state of the filtering - atomically updated distances to the * k-th closest neighbors for each query [n_queries]. * @param sample_filter - * A filter that selects samples for a given query. Use an instance of none_sample_filter to - * provide a green light for every sample. + * A filter that selects samples for a given query. * @param lut_scores * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits]. * Ignored when `EnableSMemLut == true`. @@ -264,7 +264,6 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, */ template +template using compute_similarity_kernel_t = - decltype(&compute_similarity_kernel); + decltype(&compute_similarity_kernel); // The config struct lifts the runtime parameters to the template parameters -template +template struct compute_similarity_kernel_config { public: - static auto get(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + static auto get(uint32_t pq_bits, uint32_t k_max) -> compute_similarity_kernel_t { return kernel_choose_bits(pq_bits, k_max); } private: static auto kernel_choose_bits(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { switch (pq_bits) { case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max); @@ -549,8 +541,7 @@ struct compute_similarity_kernel_config { } template - static auto kernel_try_capacity(uint32_t k_max) - -> compute_similarity_kernel_t + static auto kernel_try_capacity(uint32_t k_max) -> compute_similarity_kernel_t { if constexpr (Capacity > 0) { if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } @@ -558,36 +549,23 @@ struct compute_similarity_kernel_config { if constexpr (Capacity > 1) { if (k_max * 2 <= Capacity) { return kernel_try_capacity(k_max); } } - return compute_similarity_kernel; + return compute_similarity_kernel; } }; // A standalone accessor function was necessary to make sure template // instantiation work correctly. This accessor function is not used anymore and // may be removed. -template +template auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { - return compute_similarity_kernel_config::get(pq_bits, k_max); + return compute_similarity_kernel_config::get(pq_bits, + k_max); } /** Estimate the occupancy for the given kernel on the given device. */ -template +template struct occupancy_t { using shmem_unit = raft::Pow2<128>; @@ -598,7 +576,7 @@ struct occupancy_t { inline occupancy_t() = default; inline occupancy_t(size_t smem, uint32_t n_threads, - compute_similarity_kernel_t kernel, + compute_similarity_kernel_t kernel, const cudaDeviceProp& dev_props) { RAFT_CUDA_TRY( @@ -609,19 +587,17 @@ struct occupancy_t { } }; -template +template struct selected { - compute_similarity_kernel_t kernel; + compute_similarity_kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; size_t device_lut_size; }; -template -void compute_similarity_run(selected s, +template +void compute_similarity_run(selected s, rmm::cuda_stream_view stream, uint32_t dim, uint32_t n_probes, @@ -640,33 +616,63 @@ void compute_similarity_run(selected s, const float* queries, const uint32_t* index_list, float* query_kths, - IvfSampleFilterT sample_filter, + const filtering::base_filter& sample_filter_ref, LutT* lut_scores, OutT* _out_scores, uint32_t* _out_indices) { - s.kernel<<>>(dim, - n_probes, - pq_dim, - n_queries, - queries_offset, - metric, - codebook_kind, - topk, - max_samples, - cluster_centers, - pq_centers, - pq_dataset, - cluster_labels, - _chunk_indices, - queries, - index_list, - query_kths, - sample_filter, - lut_scores, - _out_scores, - _out_indices); - RAFT_CHECK_CUDA(stream); + auto launch_kernel = [&](filtering::ivf_filter_dev sample_filter) { + s.kernel<<>>(dim, + n_probes, + pq_dim, + n_queries, + queries_offset, + metric, + codebook_kind, + topk, + max_samples, + cluster_centers, + pq_centers, + pq_dataset, + cluster_labels, + _chunk_indices, + queries, + index_list, + query_kths, + sample_filter, + lut_scores, + _out_scores, + _out_indices); + RAFT_CHECK_CUDA(stream); + }; + + switch (sample_filter_ref.get_filter_type()) { + case filtering::FilterType::None: { + try { + auto& typed_sample_filter = dynamic_cast< + const filtering::ivf_to_sample_filter&>( + sample_filter_ref); + filtering::ivf_filter_dev sample_filter{filtering::none_filter_args_t{}}; + launch_kernel(sample_filter); + } catch (const std::bad_cast& e) { + } + break; + } + case filtering::FilterType::Bitset: { + try { + auto& typed_sample_filter = dynamic_cast< + const filtering::ivf_to_sample_filter>&>( + sample_filter_ref); + filtering::ivf_filter_dev sample_filter{filtering::bitset_filter_args_t{ + typed_sample_filter.inds_ptrs_, typed_sample_filter.next_filter_.bitset_view_}}; + launch_kernel(sample_filter); + } catch (const std::bad_cast& e) { + } + break; + } + default: RAFT_FAIL("Unsupported filter type"); + } } /** @@ -683,9 +689,7 @@ void compute_similarity_run(selected s, * beyond this limit do not consider increasing the number of active blocks per SM * would improve locality anymore. */ -template +template auto compute_similarity_select(const cudaDeviceProp& dev_props, bool manage_local_topk, int locality_hint, @@ -695,7 +699,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t precomp_data_count, uint32_t n_queries, uint32_t n_probes, - uint32_t topk) -> selected + uint32_t topk) -> selected { // Shared memory for storing the lookup table size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); @@ -796,9 +800,9 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further optimize occupancy and data locality for the L1 cache. */ - auto conf_fast = get_compute_similarity_kernel; - auto conf_no_basediff = get_compute_similarity_kernel; - auto conf_no_smem_lut = get_compute_similarity_kernel; + auto conf_fast = get_compute_similarity_kernel; + auto conf_no_basediff = get_compute_similarity_kernel; + auto conf_no_smem_lut = get_compute_similarity_kernel; auto topk_or_zero = manage_local_topk ? topk : 0u; std::array candidates{ std::make_tuple(conf_fast(pq_bits, topk_or_zero), @@ -814,8 +818,8 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // we may allow slightly lower than 100% occupancy; constexpr double kTargetOccupancy = 0.75; // This struct is used to select the better candidate - occupancy_t selected_perf{}; - selected selected_config; + occupancy_t selected_perf{}; + selected selected_config; for (auto [kernel, smem_size_f, lut_is_in_shmem] : candidates) { if (smem_size_f(raft::WarpSize) > dev_props.sharedMemPerBlockOptin) { // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate. @@ -852,7 +856,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, continue; } - occupancy_t cur(smem_size, n_threads, kernel, dev_props); + occupancy_t cur(smem_size, n_threads, kernel, dev_props); if (cur.blocks_per_sm <= 0) { // For some reason, we still cannot make this kernel run. Skip the candidate. continue; @@ -867,8 +871,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, if (n_threads_tmp < n_threads) { while (n_threads_tmp >= n_threads_min) { auto smem_size_tmp = smem_size_f(n_threads_tmp); - occupancy_t tmp( - smem_size_tmp, n_threads_tmp, kernel, dev_props); + occupancy_t tmp(smem_size_tmp, n_threads_tmp, kernel, dev_props); bool select_it = false; if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) { // Normally, the smaller the block the better for L1 cache hit rate. diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh index 8d9e606671..15ab026cee 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh @@ -417,10 +417,7 @@ template struct search_kernel_cache { /** Number of matmul invocations to cache. */ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, - selected> + raft::cache::lru, selected> value{kDefaultSize}; }; @@ -557,7 +554,7 @@ void ivfpq_search_worker(raft::resources const& handle, } break; } - selected search_instance; + selected search_instance; search_kernel_key search_key{manage_local_topk, coresidency, preferred_shmem_carveout, @@ -571,17 +568,17 @@ void ivfpq_search_worker(raft::resources const& handle, raft::resource::get_custom_resource>(handle) ->value; if (!cache.get(search_key, &search_instance)) { - search_instance = compute_similarity_select( - raft::resource::get_device_properties(handle), - manage_local_topk, - coresidency, - preferred_shmem_carveout, - index.pq_bits(), - index.pq_dim(), - precomp_data_count, - n_queries, - n_probes, - topK); + search_instance = + compute_similarity_select(raft::resource::get_device_properties(handle), + manage_local_topk, + coresidency, + preferred_shmem_carveout, + index.pq_bits(), + index.pq_dim(), + precomp_data_count, + n_queries, + n_probes, + topK); cache.set(search_key, search_instance); } diff --git a/cpp/src/neighbors/sample_filter.cuh b/cpp/src/neighbors/sample_filter.cuh index b0c61f924b..4df4794206 100644 --- a/cpp/src/neighbors/sample_filter.cuh +++ b/cpp/src/neighbors/sample_filter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ namespace cuvs::neighbors::filtering { /* A filter that filters nothing. This is the default behavior. */ -inline _RAFT_HOST_DEVICE bool none_sample_filter::operator()( +constexpr __forceinline__ _RAFT_HOST_DEVICE bool none_sample_filter::operator()( // query index const uint32_t query_ix, // the current inverted list index @@ -40,7 +40,7 @@ inline _RAFT_HOST_DEVICE bool none_sample_filter::operator()( } /* A filter that filters nothing. This is the default behavior. */ -inline _RAFT_HOST_DEVICE bool none_sample_filter::operator()( +constexpr __forceinline__ _RAFT_HOST_DEVICE bool none_sample_filter::operator()( // query index const uint32_t query_ix, // the index of the current sample @@ -93,14 +93,14 @@ inline _RAFT_HOST_DEVICE bool ivf_to_sample_filter::operator( } template -bitset_filter::bitset_filter( +_RAFT_HOST_DEVICE bitset_filter::bitset_filter( const cuvs::core::bitset_view bitset_for_filtering) : bitset_view_{bitset_for_filtering} { } template -inline _RAFT_HOST_DEVICE bool bitset_filter::operator()( +constexpr __forceinline__ _RAFT_HOST_DEVICE bool bitset_filter::operator()( // query index const uint32_t query_ix, // the index of the current sample @@ -140,4 +140,49 @@ void bitmap_filter::to_csr(raft::resources const& handle, csr raft::sparse::convert::bitmap_to_csr(handle, bitmap_view_, csr); } +struct none_filter_args_t {}; +using bitset_filter_args_t = + std::tuple>; + +struct ivf_filter_dev { + filtering::FilterType tag_; + + union ivf_filter_dev_args_variant { + none_filter_args_t none_filter_args; + bitset_filter_args_t bitset_filter_args; + + _RAFT_HOST_DEVICE explicit ivf_filter_dev_args_variant(const none_filter_args_t& args) + : none_filter_args(args) + { + } + + _RAFT_HOST_DEVICE explicit ivf_filter_dev_args_variant(const bitset_filter_args_t& args) + : bitset_filter_args(args) + { + } + } args_; + + _RAFT_HOST_DEVICE ivf_filter_dev(none_filter_args_t args = {}) + : tag_(FilterType::None), args_(args) {}; + + _RAFT_HOST_DEVICE ivf_filter_dev(bitset_filter_args_t args) + : tag_(FilterType::Bitset), args_(args) {}; + + constexpr __forceinline__ _RAFT_HOST_DEVICE bool operator()(const uint32_t query_ix, + const uint32_t cluster_ix, + const uint32_t sample_ix) + { + switch (tag_) { + case FilterType::None: + return filtering::none_sample_filter{}(query_ix, cluster_ix, sample_ix); + case FilterType::Bitset: { + auto& [inds_ptrs_, bitset_view_] = args_.bitset_filter_args; + return filtering::bitset_filter(bitset_view_)( + query_ix, inds_ptrs_[cluster_ix][sample_ix]); + } + default: return true; + } + } +}; + } // namespace cuvs::neighbors::filtering