Merge branch 'branch-24.10' into doc-2410-cuvs_migration
cjnolet authored Sep 26, 2024
2 parents bafff63 + f37c41c commit 61a3b08
Showing 26 changed files with 359 additions and 51 deletions.
4 changes: 2 additions & 2 deletions ci/release/update-version.sh
@@ -25,8 +25,8 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
 NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
 
 # Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
+NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -36,7 +36,7 @@ dependencies:
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,7 +36,7 @@ dependencies:
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-aarch64=11.8
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-64=11.8
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -29,7 +29,7 @@ dependencies:
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -29,7 +29,7 @@ dependencies:
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
2 changes: 1 addition & 1 deletion conda/recipes/libraft/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
 - ">=3.26.4,!=3.30.0"
 
 nccl_version:
-- ">=2.9.9"
+- ">=2.18.1.1"
 
 glog_version:
 - ">=0.6.0"
2 changes: 1 addition & 1 deletion conda/recipes/raft-ann-bench/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
 - ">=3.26.4,!=3.30.0"
 
 nccl_version:
-- ">=2.9.9"
+- ">=2.18.1.1"
 
 glog_version:
 - ">=0.6.0"
3 changes: 3 additions & 0 deletions conda/recipes/raft-dask/conda_build_config.yaml
@@ -24,3 +24,6 @@ ucxx_version:
 
 cmake_version:
 - ">=3.26.4,!=3.30.0"
+
+nccl_version:
+- ">=2.18.1.1"
4 changes: 2 additions & 2 deletions conda/recipes/raft-dask/meta.yaml
@@ -50,7 +50,7 @@ requirements:
 {% endif %}
 - cuda-version ={{ cuda_version }}
 - cython >=3.0.0
-- nccl >=2.9.9
+- nccl {{ nccl_version }}
 - pylibraft {{ version }}
 - python x.x
 - rmm ={{ minor_version }}
@@ -68,7 +68,7 @@
 - dask-cuda ={{ minor_version }}
 - rapids-dask-dependency ={{ minor_version }}
 - joblib >=0.11
-- nccl >=2.9.9
+- nccl {{ nccl_version }}
 - pylibraft {{ version }}
 - python x.x
 - rmm ={{ minor_version }}
7 changes: 3 additions & 4 deletions cpp/bench/prims/util/popc.cu
@@ -89,10 +89,9 @@ struct popc_bench : public fixture {
       auto bits_view =
         raft::make_device_vector_view<const bits_t, index_t>(bits_d.data_handle(), bits_d.size());
 
-      index_t max_len   = params.n_rows * params.n_cols;
-      auto max_len_view = raft::make_host_scalar_view<index_t>(&max_len);
-      auto nnz_actual_view =
-        nnz_actual_d.view();  // raft::make_device_scalar_view<index_t>(nnz_actual_d.data_handle());
+      index_t max_len      = params.n_rows * params.n_cols;
+      auto max_len_view    = raft::make_host_scalar_view<const index_t, index_t>(&max_len);
+      auto nnz_actual_view = nnz_actual_d.view();
       raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
     });
   }
107 changes: 106 additions & 1 deletion cpp/include/raft/core/bitset.cuh
@@ -26,6 +26,8 @@
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/popc.cuh>
 
+#include <rmm/device_scalar.hpp>
+
 #include <thrust/for_each.h>
 
 namespace raft::core {
@@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index
   }
 }
 
+template <typename bitset_t, typename index_t>
+void bitset_view<bitset_t, index_t>::count(const raft::resources& res,
+                                           raft::device_scalar_view<index_t> count_gpu_scalar) const
+{
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
+  auto values  = raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  raft::popc(res, values, max_len, count_gpu_scalar);
+}
+
+template <typename bitset_t, typename index_t>
+RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src,
+                                 bitset_t* output,
+                                 index_t src_bit_len,
+                                 index_t repeat_times)
+{
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+  int output_idx                     = blockIdx.x * blockDim.x + threadIdx.x;
+
+  index_t total_bits  = src_bit_len * repeat_times;
+  index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
+  index_t src_size    = (src_bit_len + bits_per_element - 1) / bits_per_element;
+
+  if (output_idx < output_size) {
+    bitset_t result     = 0;
+    index_t bit_written = 0;
+
+    index_t start_bit = output_idx * bits_per_element;
+
+    while (bit_written < bits_per_element && start_bit + bit_written < total_bits) {
+      index_t bit_idx      = (start_bit + bit_written) % src_bit_len;
+      index_t src_word_idx = bit_idx / bits_per_element;
+      index_t src_offset   = bit_idx % bits_per_element;
+
+      index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx);
+
+      bitset_t src_value = (src[src_word_idx] >> src_offset);
+
+      if (src_offset + remaining_bits > bits_per_element) {
+        bitset_t next_value = src[(src_word_idx + 1) % src_size];
+        src_value |= (next_value << (bits_per_element - src_offset));
+      }
+      src_value &= ((bitset_t{1} << remaining_bits) - 1);
+      result |= (src_value << bit_written);
+      bit_written += remaining_bits;
+    }
+    output[output_idx] = result;
+  }
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_repeat(raft::resources const& handle,
+                   const bitset_t* d_src,
+                   bitset_t* d_output,
+                   index_t src_bit_len,
+                   index_t repeat_times)
+{
+  if (src_bit_len == 0 || repeat_times == 0) return;
+  auto stream = resource::get_cuda_stream(handle);
+
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+  const index_t total_bits           = src_bit_len * repeat_times;
+  const index_t output_size          = (total_bits + bits_per_element - 1) / bits_per_element;
+
+  int threadsPerBlock = 128;
+  int blocksPerGrid   = (output_size + threadsPerBlock - 1) / threadsPerBlock;
+  bitset_repeat_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+    d_src, d_output, src_bit_len, repeat_times);
+
+  return;
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
+                                            index_t times,
+                                            bitset_t* output_device_ptr) const
+{
+  auto thrust_policy                 = raft::resource::get_thrust_policy(res);
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+
+  if (bitset_len_ % bits_per_element == 0) {
+    index_t num_elements_to_copy = bitset_len_ / bits_per_element;
+
+    for (index_t i = 0; i < times; ++i) {
+      raft::copy(output_device_ptr + i * num_elements_to_copy,
+                 bitset_ptr_,
+                 num_elements_to_copy,
+                 raft::resource::get_cuda_stream(res));
+    }
+  } else {
+    bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times);
+  }
+}
+
+template <typename bitset_t, typename index_t>
+double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) const
+{
+  index_t size_h = this->size();
+  if (0 == size_h) { return static_cast<double>(1.0); }
+  index_t count_h = this->count(res);
+
+  return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
+}
+
 template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
@@ -155,7 +260,7 @@ template <typename bitset_t, typename index_t>
 void bitset<bitset_t, index_t>::count(const raft::resources& res,
                                       raft::device_scalar_view<index_t> count_gpu_scalar)
 {
-  auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
   auto values =
     raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
   raft::popc(res, values, max_len, count_gpu_scalar);
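Taken together, the additions above give `bitset_view` a host-callable count, a sparsity helper, and a repeat that stays tightly bit-packed. A minimal sketch of how they compose; the demo function, handle, and sizes here are illustrative assumptions, not part of this commit:

    // Sketch only: composing the new bitset_view helpers added in this diff.
    #include <raft/core/bitset.cuh>
    #include <raft/core/device_mdarray.hpp>
    #include <raft/core/resources.hpp>

    #include <cstdint>
    #include <cstdio>

    void bitset_demo(raft::resources const& res)  // hypothetical wrapper
    {
      using bitset_t = uint32_t;
      using index_t  = uint32_t;

      // 1000 bits is deliberately not a multiple of 32, so repeat() takes the
      // bit-by-bit kernel path rather than the word-wise raft::copy path.
      index_t n_bits = 1000;
      raft::core::bitset<bitset_t, index_t> bits(res, n_bits);  // default: all bits set
      auto view = bits.view();

      index_t n_set   = view.count(res);     // syncs the stream, returns a host value
      double sparsity = view.sparsity(res);  // fraction of unset bits
      std::printf("set=%u sparsity=%f\n", n_set, sparsity);

      // Repeat the 1000-bit pattern 3 times, tightly packed across words.
      index_t times  = 3;
      auto out_words = raft::core::bitset_view<bitset_t, index_t>::eval_n_elements(
        static_cast<size_t>(n_bits) * times);
      auto output =
        raft::make_device_vector<bitset_t, index_t>(res, static_cast<index_t>(out_words));
      view.repeat(res, times, output.data_handle());
    }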
76 changes: 76 additions & 0 deletions cpp/include/raft/core/bitset.hpp
@@ -22,6 +22,8 @@
 #include <raft/core/resources.hpp>
 #include <raft/util/integer_utils.hpp>
 
+#include <cmath>
+
 namespace raft::core {
 /**
  * @defgroup bitset Bitset
@@ -103,6 +105,80 @@ struct bitset_view {
   {
     return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
   }
+  /**
+   * @brief Returns the number of bits set to true in count_gpu_scalar.
+   *
+   * @param[in] res RAFT resources
+   * @param[out] count_gpu_scalar Device scalar to store the count
+   */
+  void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar) const;
+  /**
+   * @brief Returns the number of bits set to true.
+   *
+   * @param res RAFT resources
+   * @return index_t Number of bits set to true
+   */
+  auto count(const raft::resources& res) const -> index_t
+  {
+    auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
+    count(res, count_gpu_scalar.view());
+    index_t count_cpu = 0;
+    raft::update_host(
+      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+    resource::sync_stream(res);
+    return count_cpu;
+  }
+
+  /**
+   * @brief Repeats the bitset data and copies it to the output device pointer.
+   *
+   * This function takes the original bitset data stored in the device memory
+   * and repeats it a specified number of times into a new location in the device memory.
+   * The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_)
+   * is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are
+   * tightly packed without any gaps between rows.
+   *
+   * @param res RAFT resources for managing CUDA streams and execution policies.
+   * @param times Number of times the bitset data should be repeated in the output.
+   * @param output_device_ptr Device pointer where the repeated bitset data will be stored.
+   *
+   * The caller must ensure that the output device pointer has enough memory allocated
+   * to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original
+   * bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on
+   * the GPU.
+   */
+  void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const;
+
+  /**
+   * @brief Calculate the sparsity (fraction of 0s) of the bitset.
+   *
+   * This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s)
+   * to the total number of bits in the set. If the total number of bits is zero, the function
+   * returns 1.0, indicating the set is fully sparse.
+   *
+   * @param res RAFT resources for managing CUDA streams and execution policies.
+   * @return double The sparsity of the bitset, i.e., the fraction of unset bits.
+   *
+   * This API will synchronize on the stream of `res`.
+   */
+  double sparsity(const raft::resources& res) const;
+
+  /**
+   * @brief Calculates the number of `bitset_t` elements required to store a bitset.
+   *
+   * This function computes the number of `bitset_t` elements needed to store a bitset, ensuring
+   * that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size
+   * (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t`
+   * element.
+   *
+   * @param bitset_len The total length of the bitset in bits.
+   * @return size_t The number of `bitset_t` elements required to store the bitset.
+   */
+  static inline size_t eval_n_elements(size_t bitset_len)
+  {
+    const size_t bits_per_element = sizeof(bitset_t) * 8;
+    return (bitset_len + bits_per_element - 1) / bits_per_element;
+  }
+
  private:
   bitset_t* bitset_ptr_;
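The tight-packing contract in the `repeat` docblock is the subtle part: when the bit length is not a multiple of the word size, each repetition starts mid-word rather than on a word boundary. A small host-only model of that contract, sized with the same rounding as `eval_n_elements` (plain C++ for illustration, not the CUDA kernel from this commit):

    // Host-side model of the tight bit-packing that repeat() promises:
    // a 5-bit pattern repeated 3 times occupies exactly 15 consecutive bits.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<uint8_t> pattern = {1, 0, 1, 1, 0};  // bitset_len_ = 5
      size_t times = 3;

      // Expand bit-by-bit, then pack LSB-first into 32-bit words, sized the
      // same way eval_n_elements does: (15 + 31) / 32 == 1 word.
      size_t total_bits = pattern.size() * times;
      std::vector<uint32_t> words((total_bits + 31) / 32, 0);
      for (size_t i = 0; i < total_bits; ++i) {
        if (pattern[i % pattern.size()]) { words[i / 32] |= uint32_t{1} << (i % 32); }
      }
      std::printf("packed word = 0x%08x\n", words[0]);  // prints 0x000035ad
      return 0;
    }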
6 changes: 3 additions & 3 deletions cpp/include/raft/util/detail/popc.cuh
@@ -36,12 +36,12 @@ namespace raft::detail {
  */
 template <typename value_t, typename index_t>
 void popc(const raft::resources& res,
-          device_vector_view<value_t, index_t> values,
-          raft::host_scalar_view<index_t> max_len,
+          device_vector_view<const value_t, index_t> values,
+          raft::host_scalar_view<const index_t, index_t> max_len,
           raft::device_scalar_view<index_t> counter)
 {
   auto values_size   = values.size();
-  auto values_matrix = raft::make_device_matrix_view<value_t, index_t, col_major>(
+  auto values_matrix = raft::make_device_matrix_view<const value_t, index_t, col_major>(
     values.data_handle(), values_size, 1);
   auto counter_vector = raft::make_device_vector_view<index_t, index_t>(counter.data_handle(), 1);
 
4 changes: 2 additions & 2 deletions cpp/include/raft/util/popc.cuh
@@ -31,8 +31,8 @@ namespace raft {
  */
 template <typename value_t, typename index_t>
 void popc(const raft::resources& res,
-          device_vector_view<value_t, index_t> values,
-          raft::host_scalar_view<index_t> max_len,
+          device_vector_view<const value_t, index_t> values,
+          raft::host_scalar_view<const index_t, index_t> max_len,
           raft::device_scalar_view<index_t> counter)
 {
   detail::popc(res, values, max_len, counter);
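With `values` and `max_len` now const-qualified, call sites construct the views explicitly, as the updated benchmark above does. A minimal sketch of a call under the new signature; the wrapper function, includes, and sizes are assumptions for illustration:

    // Sketch only: a host-side wrapper around the const-qualified raft::popc.
    #include <raft/core/device_mdarray.hpp>
    #include <raft/core/device_mdspan.hpp>
    #include <raft/core/host_mdspan.hpp>
    #include <raft/core/resource/cuda_stream.hpp>
    #include <raft/util/cudart_utils.hpp>
    #include <raft/util/popc.cuh>

    #include <cstdint>

    uint32_t count_set_bits(raft::resources const& res,
                            raft::device_vector_view<const uint32_t, uint32_t> bits,
                            uint32_t valid_bits)
    {
      // max_len is read-only, so both template arguments are spelled out,
      // mirroring the updated call sites in this diff.
      auto max_len = raft::make_host_scalar_view<const uint32_t, uint32_t>(&valid_bits);
      auto counter = raft::make_device_scalar<uint32_t>(res, 0);
      raft::popc(res, bits, max_len, counter.view());

      // Copy the device-side count back to the host and wait for it.
      uint32_t count_h = 0;
      raft::update_host(&count_h, counter.data_handle(), 1,
                        raft::resource::get_cuda_stream(res));
      raft::resource::sync_stream(res);
      return count_h;
    }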
4 changes: 3 additions & 1 deletion cpp/test/CMakeLists.txt
@@ -440,7 +440,9 @@ if(BUILD_TESTS)
     neighbors/ann_nn_descent/test_float_uint32_t.cu
     neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
     neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-    neighbors/ann_nn_descent/test_batch_float_uint32_t.cu
+    # TODO: Investigate why this test is failing. Reference issue:
+    # https://github.com/rapidsai/raft/issues/2450
+    # neighbors/ann_nn_descent/test_batch_float_uint32_t.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
     GPUS