From cc9cbd3f82f094a0c3bb197e4052ecd1f4c2abd8 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 30 Jun 2023 17:03:57 -0700
Subject: [PATCH 01/22] Unpack list data kernel

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  |  65 ++++++
 .../raft/neighbors/ivf_flat_helpers.cuh       | 216 ++++++++++++++++++
 2 files changed, 281 insertions(+)
 create mode 100644 cpp/include/raft/neighbors/ivf_flat_helpers.cuh
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 7c2fa05bfe..35135de7e7 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -31,6 +31,7 @@
 #include <raft/neighbors/ivf_list_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/stats/histogram.cuh>
+#include <raft/util/fast_int_div.cuh>
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -416,4 +417,68 @@ inline void fill_refinement_index(raft::resources const& handle,
                                          refinement_index->veclen());
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
+
+
+
+template <typename T, typename IdxT, uint32_t BlockSize>
+__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32(
+  T* out_codes,
+  T* in_list_data,
+  uint32_t n_rows,
+  uint32_t dim,
+  uint32_t veclen)
+{
+  const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
+  if (i >= n_rows * dim) { return; }
+
+  auto col = i % dim;
+  auto row = i / n_rows;
+
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+  auto group_offset       = interleaved_group::roundDown(row);
+  auto ingroup_id         = interleaved_group::mod(row) * veclen;
+
+  // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
+  auto within_group_offset = Pow2<4>::quot(col);
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  out_codes[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = in_list_data[i];
+}
+
+/**
+ * Unpack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
+ *
+ * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] packed_list_data the packed ivf::list data.
+ * @param[in] row_offset how many rows in the list to skip.
+ * @param[in] stream
+ */
+template<typename T, typename IdxT>
+inline void unpack_list_data_float32(
+  raft::resources const& handle,
+  device_matrix_view<T, uint32_t, row_major> codes,
+  device_matrix_view<T, uint32_t, row_major> packed_list_data,
+  uint32_t row_offset)
+{
+  auto stream = raft::resource::get_cuda_stream(handle);
+  auto n_rows = packed_list_data.extent(0);
+  if (n_rows == 0) { return; }
+
+  auto dim = packed_list_data.extent(1);
+
+  n_rows -= row_offset;
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = unpack_list_data_kernel_float32<T, IdxT, kBlockSize>;
+  kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
+  packed_list_data.data_handle(),
+  n_rows,
+  dim,
+  4);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 }  // namespace raft::neighbors::ivf_flat::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
new file mode 100644
index 0000000000..2eb8085b71
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/neighbors/detail/ivf_flat_build.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::neighbors::ivf_flat::helpers {
+/**
+ * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index
+ * @{
+ */
+
+namespace codepacker {
+/**
+ * @brief Unpack `n_take` consecutive records of a single non-interleaved list (cluster) starting at given `row_offset`.
+ *
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_take, index.pq_dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_flat::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view());
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[out] codes
+ */
+template <typename value_t, typename idx_t>
+inline void unpack(
+  raft::resources const& res,
+  device_matrix_view<value_t> list_data,
+  uint32_t offset,
+  device_matrix_view<value_t, uint32_t, row_major> codes)
+{
+  ivf_flat::detail::unpack_list_data_float32(res, codes, list_data, offset);
+}
+
+// /**
+//  * Write flat PQ codes into an existing list by the given offset.
+//  *
+//  * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+//  *
+//  * Usage example:
+//  * @code{.cpp}
+//  *   auto list_data  = index.lists()[label]->data.view();
+//  *   // allocate the buffer for the input codes
+//  *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
+//  *   ... prepare n_vecs to pack into the list in codes ...
+//  *   // write codes into the list starting from the 42nd position
+//  *   ivf_flat::helpers::codepacker::pack(
+//  *       res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data);
+//  * @endcode
+//  *
+//  * @param[in] res
+//  * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim]
+//  * @param[in] offset how many records to skip before writing the data into the list
+//  * @param[in] list_data block to write into
+//  */
+//  template <typename value_t, typename idx_t>
+// inline void pack(
+//   raft::resources const& res,
+//   device_matrix_view<const value_t, uint32_t, row_major> codes,
+//   uint32_t offset,
+//   device_mdspan<value_t, typename list_spec<uint32_t, value_t, idx_t>::list_extents, row_major> list_data)
+// {
+//   ivf_flat::detail::pack_list_data(list_data, codes, offset, resource::get_cuda_stream(res));
+// }
+// }  // namespace codepacker
+
+// /**
+//  * Write flat PQ codes into an existing list by the given offset.
+//  *
+//  * The list is identified by its label.
+//  *
+//  * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+//  *
+//  * Usage example:
+//  * @code{.cpp}
+//  *   // We will write into the 137th cluster
+//  *   uint32_t label = 137;
+//  *   // allocate the buffer for the input codes
+//  *   auto codes = raft::make_device_matrix<const uint8_t>(res, n_vec, index.pq_dim());
+//  *   ... prepare n_vecs to pack into the list in codes ...
+//  *   // write codes into the list starting from the 42nd position
+//  *   ivf_flat::helpers::pack_list_data(res, &index, codes_to_pack, label, 42);
+//  * @endcode
+//  *
+//  * @param[in] res
+//  * @param[inout] index IVF-PQ index.
+//  * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+//  * @param[in] label The id of the list (cluster) into which we write.
+//  * @param[in] offset how many records to skip before writing the data into the list
+//  */
+// template <typename IdxT>
+// void pack_list_data(raft::resources const& res,
+//                     index<IdxT>* index,
+//                     device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+//                     uint32_t label,
+//                     uint32_t offset)
+// {
+//   ivf_flat::detail::pack_list_data(res, index, codes, label, offset);
+// }
+
+// /**
+//  * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+//  * starting at given `offset`, one code per byte (independently of pq_bits).
+//  *
+//  * Usage example:
+//  * @code{.cpp}
+//  *   // We will unpack the fourth cluster
+//  *   uint32_t label = 3;
+//  *   // Get the list size
+//  *   uint32_t list_size = 0;
+//  *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1,
+//  * resource::get_cuda_stream(res)); resource::sync_stream(res);
+//  *   // allocate the buffer for the output
+//  *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
+//  *   // unpack the whole list
+//  *   ivf_flat::helpers::unpack_list_data(res, index, codes.view(), label, 0);
+//  * @endcode
+//  *
+//  * @tparam IdxT type of the indices in the source dataset
+//  *
+//  * @param[in] res
+//  * @param[in] index
+//  * @param[out] out_codes
+//  *   the destination buffer [n_take, index.pq_dim()].
+//  *   The length `n_take` defines how many records to unpack,
+//  *   it must be smaller than the list size.
+//  * @param[in] label
+//  *   The id of the list (cluster) to decode.
+//  * @param[in] offset
+//  *   How many records in the list to skip.
+//  */
+// template <typename IdxT>
+// void unpack_list_data(raft::resources const& res,
+//                       const index<IdxT>& index,
+//                       device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+//                       uint32_t label,
+//                       uint32_t offset)
+// {
+//   return ivf_flat::detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
+// }
+
+// /**
+//  * @brief Unpack a series of records of a single list (cluster) in the compressed index
+//  * by their in-list offsets, one code per byte (independently of pq_bits).
+//  *
+//  * Usage example:
+//  * @code{.cpp}
+//  *   // We will unpack the fourth cluster
+//  *   uint32_t label = 3;
+//  *   // Create the selection vector
+//  *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+//  *   ... fill the indices ...
+//  *   resource::sync_stream(res);
+//  *   // allocate the buffer for the output
+//  *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
+//  *   // decode the whole list
+//  *   ivf_flat::helpers::unpack_list_data(
+//  *       res, index, selected_indices.view(), codes.view(), label);
+//  * @endcode
+//  *
+//  * @tparam IdxT type of the indices in the source dataset
+//  *
+//  * @param[in] res
+//  * @param[in] index
+//  * @param[in] in_cluster_indices
+//  *   The offsets of the selected indices within the cluster.
+//  * @param[out] out_codes
+//  *   the destination buffer [n_take, index.pq_dim()].
+//  *   The length `n_take` defines how many records to unpack,
+//  *   it must be smaller than the list size.
+//  * @param[in] label
+//  *   The id of the list (cluster) to decode.
+//  */
+// template <typename IdxT>
+// void unpack_list_data(raft::resources const& res,
+//                       const index<IdxT>& index,
+//                       device_vector_view<const uint32_t> in_cluster_indices,
+//                       device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+//                       uint32_t label)
+// {
+//   return ivf_flat::detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
+// }
+
+/** @} */
+}  // namespace raft::neighbors::ivf_flat::helpers

From e39ee5647dfe7ebcb06897f6676d0cfe5cce61bd Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 5 Jul 2023 15:42:48 -0700
Subject: [PATCH 02/22] update packing and unpacking functions

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  | 185 +++++++++----
 .../raft/neighbors/ivf_flat_helpers.cuh       | 261 ++++++------------
 2 files changed, 206 insertions(+), 240 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 35135de7e7..fcca19be0e 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -420,65 +420,130 @@ inline void fill_refinement_index(raft::resources const& handle,
 
 
 
-template <typename T, typename IdxT, uint32_t BlockSize>
-__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32(
-  T* out_codes,
-  T* in_list_data,
-  uint32_t n_rows,
-  uint32_t dim,
-  uint32_t veclen)
-{
-  const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
-  if (i >= n_rows * dim) { return; }
-
-  auto col = i % dim;
-  auto row = i / n_rows;
-
-  // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
-  auto group_offset       = interleaved_group::roundDown(row);
-  auto ingroup_id         = interleaved_group::mod(row) * veclen;
-
-  // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
-  auto within_group_offset = Pow2<4>::quot(col);
-
-  // Interleave dimensions of the source vector while recording it.
-  // NB: such `veclen` is selected, that `dim % veclen == 0`
-  out_codes[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = in_list_data[i];
-}
-
-/**
- * Unpack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
- *
- * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim]
- * @param[in] packed_list_data the packed ivf::list data.
- * @param[in] row_offset how many rows in the list to skip.
- * @param[in] stream
- */
-template<typename T, typename IdxT>
-inline void unpack_list_data_float32(
-  raft::resources const& handle,
-  device_matrix_view<T, uint32_t, row_major> codes,
-  device_matrix_view<T, uint32_t, row_major> packed_list_data,
-  uint32_t row_offset)
-{
-  auto stream = raft::resource::get_cuda_stream(handle);
-  auto n_rows = packed_list_data.extent(0);
-  if (n_rows == 0) { return; }
-
-  auto dim = packed_list_data.extent(1);
-
-  n_rows -= row_offset;
-  constexpr uint32_t kBlockSize = 256;
-  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
-  dim3 threads(kBlockSize, 1, 1);
-  auto kernel = unpack_list_data_kernel_float32<T, IdxT, kBlockSize>;
-  kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
-  packed_list_data.data_handle(),
-  n_rows,
-  dim,
-  4);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
+// template <typename T, typename IdxT, uint32_t BlockSize>
+// __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32(
+//   T* out_codes,
+//   T* in_list_data,
+//   uint32_t n_rows,
+//   uint32_t dim,
+//   uint32_t veclen)
+// {
+//   const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
+//   if (i >= n_rows * dim) { return; }
+
+//   auto col = i % kIndexGroupSize * veclen;
+//   auto row = i / (kIndexGroupSize * veclen);
+
+//   auto vec = 
+
+//   // The data is written in interleaved groups of `index::kGroupSize` vectors
+//   using interleaved_group = Pow2<kIndexGroupSize>;
+//   auto group_offset       = interleaved_group::roundDown(row);
+//   auto ingroup_id         = interleaved_group::mod(row) * veclen;
+
+//   // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
+//   auto within_group_offset = Pow2<4>::quot(col);
+
+//   // Interleave dimensions of the source vector while recording it.
+//   // NB: such `veclen` is selected, that `dim % veclen == 0`
+//   out_codes[] = in_list_data[i];
+// }
+
+// /**
+//  * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
+//  *
+//  * @param[out] codes flat codes, [n_rows, dim]
+//  * @param[in] list_data the packed ivf::list data.
+//  * @param[in] row_offset how many rows in the list to skip.
+//  * @param[in] stream
+//  */
+// template<typename T, typename IdxT>
+// inline void unpack_list_data_float32(
+//   raft::resources const& handle,
+//   device_matrix_view<T, uint32_t, row_major> codes,
+//   device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
+//   uint32_t row_offset)
+// {
+//   auto stream = raft::resource::get_cuda_stream(handle);
+//   auto n_rows = codes.extent(0);
+//   if (n_rows == 0) { return; }
+
+//   auto dim = codes.extent(1);
+
+//   n_rows -= row_offset;
+//   constexpr uint32_t kBlockSize = 256;
+//   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+//   dim3 threads(kBlockSize, 1, 1);
+//   auto kernel = pack_list_data_kernel_float32<T, IdxT, kBlockSize>;
+//   kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
+//   list_data.data_handle(),
+//   n_rows,
+//   dim,
+//   4);
+//   RAFT_CUDA_TRY(cudaPeekAtLastError());
+// }
+
+
+
+// template <typename T, typename IdxT, uint32_t BlockSize>
+// __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel_float32(
+//   T* list_data,
+//   T* codes,
+//   uint32_t n_rows,
+//   uint32_t dim,
+//   uint32_t veclen)
+// {
+//   const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
+//   if (i >= n_rows * dim) { return; }
+
+//   auto col = i % dim;
+//   auto row = i / n_rows;
+
+//   // The data is written in interleaved groups of `index::kGroupSize` vectors
+//   using interleaved_group = Pow2<kIndexGroupSize>;
+//   auto group_offset       = interleaved_group::roundDown(row);
+//   auto ingroup_id         = interleaved_group::mod(row) * veclen;
+
+//   // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
+//   auto within_group_offset = Pow2<4>::quot(col);
+
+//   // Interleave dimensions of the source vector while recording it.
+//   // NB: such `veclen` is selected, that `dim % veclen == 0`
+//   list_data[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = codes[i];
+// }
+
+// /**
+//  * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
+//  *
+//  * @param[out] codes flat codes, [n_rows, dim]
+//  * @param[in] list_data the packed ivf::list data.
+//  * @param[in] row_offset how many rows in the list to skip.
+//  * @param[in] stream
+//  */
+// template<typename T, typename IdxT>
+// inline void pack_list_data_float32(
+//   raft::resources const& handle,
+//   device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
+//   device_matrix_view<const T, uint32_t, row_major> codes,
+//   uint32_t row_offset)
+// {
+//   auto stream = raft::resource::get_cuda_stream(handle);
+//   auto n_rows = codes.extent(0);
+//   if (n_rows == 0) { return; }
+
+//   auto dim = codes.extent(1);
+
+//   n_rows -= row_offset;
+//   constexpr uint32_t kBlockSize = 256;
+//   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+//   dim3 threads(kBlockSize, 1, 1);
+//   auto kernel = pack_list_data_kernel_float32<T, IdxT, kBlockSize>;
+//   kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
+//   list_data.data_handle(),
+//   n_rows,
+//   dim,
+//   4);
+//   RAFT_CUDA_TRY(cudaPeekAtLastError());
+// }
 
 }  // namespace raft::neighbors::ivf_flat::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index 2eb8085b71..da0990fdbf 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -29,188 +29,89 @@ namespace raft::neighbors::ivf_flat::helpers {
  * @{
  */
 
-namespace codepacker {
-/**
- * @brief Unpack `n_take` consecutive records of a single non-interleaved list (cluster) starting at given `row_offset`.
- *
- *
- * Usage example:
- * @code{.cpp}
- *   auto list_data = index.lists()[label]->data.view();
- *   // allocate the buffer for the output
- *   uint32_t n_take = 4;
- *   auto codes = raft::make_device_matrix<uint8_t>(res, n_take, index.pq_dim());
- *   uint32_t offset = 0;
- *   // unpack n_take elements from the list
- *   ivf_flat::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view());
- * @endcode
- *
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] res raft resource
- * @param[in] list_data block to read from
- * @param[in] offset
- *   How many records in the list to skip.
- * @param[out] codes
- */
-template <typename value_t, typename idx_t>
-inline void unpack(
-  raft::resources const& res,
-  device_matrix_view<value_t> list_data,
-  uint32_t offset,
-  device_matrix_view<value_t, uint32_t, row_major> codes)
-{
-  ivf_flat::detail::unpack_list_data_float32(res, codes, list_data, offset);
+template <typename T>
+void unpackInterleaved(
+        const T* in,
+        T* out,
+        int numVecs,
+        int dim,
+        int veclen) {
+
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  #pragma omp parallel for
+  for (int i = 0; i < numVecs; i++) {
+    auto group_offset = interleaved_group::roundDown(i);
+    auto ingroup_id = interleaved_group::mod(i) * veclen;
+
+    // Point to the location of the interleaved group of vectors
+    out += group_offset * dim;
+    for (uint32_t l = 0; l < dim; l += veclen) {
+      for (uint32_t j = 0; j < veclen; j++) {
+        out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j];
+      }
+    }
+  }
 }
 
-// /**
-//  * Write flat PQ codes into an existing list by the given offset.
-//  *
-//  * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
-//  *
-//  * Usage example:
-//  * @code{.cpp}
-//  *   auto list_data  = index.lists()[label]->data.view();
-//  *   // allocate the buffer for the input codes
-//  *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
-//  *   ... prepare n_vecs to pack into the list in codes ...
-//  *   // write codes into the list starting from the 42nd position
-//  *   ivf_flat::helpers::codepacker::pack(
-//  *       res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data);
-//  * @endcode
-//  *
-//  * @param[in] res
-//  * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim]
-//  * @param[in] offset how many records to skip before writing the data into the list
-//  * @param[in] list_data block to write into
-//  */
-//  template <typename value_t, typename idx_t>
-// inline void pack(
-//   raft::resources const& res,
-//   device_matrix_view<const value_t, uint32_t, row_major> codes,
-//   uint32_t offset,
-//   device_mdspan<value_t, typename list_spec<uint32_t, value_t, idx_t>::list_extents, row_major> list_data)
-// {
-//   ivf_flat::detail::pack_list_data(list_data, codes, offset, resource::get_cuda_stream(res));
-// }
-// }  // namespace codepacker
-
-// /**
-//  * Write flat PQ codes into an existing list by the given offset.
-//  *
-//  * The list is identified by its label.
-//  *
-//  * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
-//  *
-//  * Usage example:
-//  * @code{.cpp}
-//  *   // We will write into the 137th cluster
-//  *   uint32_t label = 137;
-//  *   // allocate the buffer for the input codes
-//  *   auto codes = raft::make_device_matrix<const uint8_t>(res, n_vec, index.pq_dim());
-//  *   ... prepare n_vecs to pack into the list in codes ...
-//  *   // write codes into the list starting from the 42nd position
-//  *   ivf_flat::helpers::pack_list_data(res, &index, codes_to_pack, label, 42);
-//  * @endcode
-//  *
-//  * @param[in] res
-//  * @param[inout] index IVF-PQ index.
-//  * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
-//  * @param[in] label The id of the list (cluster) into which we write.
-//  * @param[in] offset how many records to skip before writing the data into the list
-//  */
-// template <typename IdxT>
-// void pack_list_data(raft::resources const& res,
-//                     index<IdxT>* index,
-//                     device_matrix_view<const uint8_t, uint32_t, row_major> codes,
-//                     uint32_t label,
-//                     uint32_t offset)
-// {
-//   ivf_flat::detail::pack_list_data(res, index, codes, label, offset);
-// }
-
-// /**
-//  * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
-//  * starting at given `offset`, one code per byte (independently of pq_bits).
-//  *
-//  * Usage example:
-//  * @code{.cpp}
-//  *   // We will unpack the fourth cluster
-//  *   uint32_t label = 3;
-//  *   // Get the list size
-//  *   uint32_t list_size = 0;
-//  *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1,
-//  * resource::get_cuda_stream(res)); resource::sync_stream(res);
-//  *   // allocate the buffer for the output
-//  *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
-//  *   // unpack the whole list
-//  *   ivf_flat::helpers::unpack_list_data(res, index, codes.view(), label, 0);
-//  * @endcode
-//  *
-//  * @tparam IdxT type of the indices in the source dataset
-//  *
-//  * @param[in] res
-//  * @param[in] index
-//  * @param[out] out_codes
-//  *   the destination buffer [n_take, index.pq_dim()].
-//  *   The length `n_take` defines how many records to unpack,
-//  *   it must be smaller than the list size.
-//  * @param[in] label
-//  *   The id of the list (cluster) to decode.
-//  * @param[in] offset
-//  *   How many records in the list to skip.
-//  */
-// template <typename IdxT>
-// void unpack_list_data(raft::resources const& res,
-//                       const index<IdxT>& index,
-//                       device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-//                       uint32_t label,
-//                       uint32_t offset)
-// {
-//   return ivf_flat::detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
-// }
-
-// /**
-//  * @brief Unpack a series of records of a single list (cluster) in the compressed index
-//  * by their in-list offsets, one code per byte (independently of pq_bits).
-//  *
-//  * Usage example:
-//  * @code{.cpp}
-//  *   // We will unpack the fourth cluster
-//  *   uint32_t label = 3;
-//  *   // Create the selection vector
-//  *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
-//  *   ... fill the indices ...
-//  *   resource::sync_stream(res);
-//  *   // allocate the buffer for the output
-//  *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
-//  *   // decode the whole list
-//  *   ivf_flat::helpers::unpack_list_data(
-//  *       res, index, selected_indices.view(), codes.view(), label);
-//  * @endcode
-//  *
-//  * @tparam IdxT type of the indices in the source dataset
-//  *
-//  * @param[in] res
-//  * @param[in] index
-//  * @param[in] in_cluster_indices
-//  *   The offsets of the selected indices within the cluster.
-//  * @param[out] out_codes
-//  *   the destination buffer [n_take, index.pq_dim()].
-//  *   The length `n_take` defines how many records to unpack,
-//  *   it must be smaller than the list size.
-//  * @param[in] label
-//  *   The id of the list (cluster) to decode.
-//  */
-// template <typename IdxT>
-// void unpack_list_data(raft::resources const& res,
-//                       const index<IdxT>& index,
-//                       device_vector_view<const uint32_t> in_cluster_indices,
-//                       device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-//                       uint32_t label)
-// {
-//   return ivf_flat::detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
-// }
 
+template <typename T>
+void pack_host_interleaved(
+        const T* in,
+        T* out,
+        int numVecs,
+        int dim,
+        int veclen) {
+
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  #pragma omp parallel for
+  for (int i = 0; i < numVecs; i++) {
+    auto group_offset = interleaved_group::roundDown(i);
+    auto ingroup_id = interleaved_group::mod(i) * veclen;
+
+    // Point to the location of the interleaved group of vectors
+    out += group_offset * dim;
+    for (uint32_t l = 0; l < dim; l += veclen) {
+      for (uint32_t j = 0; j < veclen; j++) {
+        out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j];
+      }
+    }
+  }
+}
+
+template <typename T>
+void unpack_host_interleaved(
+        const T* in,
+        T* out,
+        int numVecs,
+        int dim,
+        int veclen) {
+
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  #pragma omp parallel for
+  for (int i = 0; i < numVecs; i++) {
+    auto group_offset = interleaved_group::roundDown(i);
+    auto ingroup_id = interleaved_group::mod(i) * veclen;
+
+    // Point to the location of the interleaved group of vectors
+    out += group_offset * dim;
+    for (uint32_t l = 0; l < dim; l += veclen) {
+      for (uint32_t j = 0; j < veclen; j++) {
+        out[i * dim + l + j] = in[l * kIndexGroupSize + ingroup_id + j];
+      }
+    }
+  }
+}
 /** @} */
 }  // namespace raft::neighbors::ivf_flat::helpers

From 78d63801d4b66c5e49aa832e163d768e8165c779 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 13 Jul 2023 21:53:46 -0700
Subject: [PATCH 03/22] Update codepacker

---
 .../raft/neighbors/ivf_flat_helpers.cuh       | 104 ++++++++----------
 .../raft/spatial/knn/detail/ann_quantized.cuh |  90 +++++++++++++++
 2 files changed, 135 insertions(+), 59 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index da0990fdbf..85272d01d1 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -23,95 +23,81 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
 
+// #include <omp.h>
+
 namespace raft::neighbors::ivf_flat::helpers {
 /**
  * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index
  * @{
  */
 
+namespace codepacker {
+/**
+ * Write one flat code into a block by the given offset. The offset indicates the id of the record in the list. This function interleaves the code and is intended to later copy the interleaved codes over to the IVF list on device.
+ * NB: no memory allocation happens here; the block must fit the record (offset + 1).
+ *
+ * @tparam T
+ *
+ * @param[in] flat_code input flat code
+ * @param[out] block block of memory to write interleaved codes to
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
 template <typename T>
-void unpackInterleaved(
-        const T* in,
-        T* out,
-        int numVecs,
-        int dim,
-        int veclen) {
-
-  // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
-
-  // Interleave dimensions of the source vector while recording it.
-  // NB: such `veclen` is selected, that `dim % veclen == 0`
-  #pragma omp parallel for
-  for (int i = 0; i < numVecs; i++) {
-    auto group_offset = interleaved_group::roundDown(i);
-    auto ingroup_id = interleaved_group::mod(i) * veclen;
-
-    // Point to the location of the interleaved group of vectors
-    out += group_offset * dim;
-    for (uint32_t l = 0; l < dim; l += veclen) {
-      for (uint32_t j = 0; j < veclen; j++) {
-        out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j];
-      }
-    }
-  }
-}
-
-
-template <typename T>
-void pack_host_interleaved(
-        const T* in,
-        T* out,
-        int numVecs,
-        int dim,
-        int veclen) {
-
+__host__ __device__ void pack_1_interleaved(
+        const T* flat_code,
+        T* block,
+        uint32_t dim,
+        uint32_t veclen,
+        uint32_t offset) {    
   // The data is written in interleaved groups of `index::kGroupSize` vectors
   using interleaved_group = Pow2<kIndexGroupSize>;
 
   // Interleave dimensions of the source vector while recording it.
   // NB: such `veclen` is selected, that `dim % veclen == 0`
-  #pragma omp parallel for
-  for (int i = 0; i < numVecs; i++) {
-    auto group_offset = interleaved_group::roundDown(i);
-    auto ingroup_id = interleaved_group::mod(i) * veclen;
+    auto group_offset = interleaved_group::roundDown(offset);
+    auto ingroup_id = interleaved_group::mod(offset) * veclen;
 
-    // Point to the location of the interleaved group of vectors
-    out += group_offset * dim;
     for (uint32_t l = 0; l < dim; l += veclen) {
       for (uint32_t j = 0; j < veclen; j++) {
-        out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j];
+        block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j];
       }
     }
-  }
 }
 
+/**
+ * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset indicates the id of the record. This function fetches one flat code from an interleaved code.
+ *
+ * @tparam T
+ *
+ * @param[in] block interleaved block. The block can be thought of as the whole inverted list in interleaved format.
+ * @param[out] flat_code output flat code
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset fetch the flat code by the given offset
+ */
 template <typename T>
-void unpack_host_interleaved(
-        const T* in,
-        T* out,
-        int numVecs,
-        int dim,
-        int veclen) {
+__host__ __device__ void unpack_1_interleaved(
+        const T* block,
+        T* flat_code,
+        uint32_t dim,
+        uint32_t veclen,
+        uint32_t offset) {
 
   // The data is written in interleaved groups of `index::kGroupSize` vectors
   using interleaved_group = Pow2<kIndexGroupSize>;
 
-  // Interleave dimensions of the source vector while recording it.
   // NB: such `veclen` is selected, that `dim % veclen == 0`
-  #pragma omp parallel for
-  for (int i = 0; i < numVecs; i++) {
-    auto group_offset = interleaved_group::roundDown(i);
-    auto ingroup_id = interleaved_group::mod(i) * veclen;
+  auto group_offset = interleaved_group::roundDown(offset);
+  auto ingroup_id = interleaved_group::mod(offset) * veclen;
 
-    // Point to the location of the interleaved group of vectors
-    out += group_offset * dim;
     for (uint32_t l = 0; l < dim; l += veclen) {
       for (uint32_t j = 0; j < veclen; j++) {
-        out[i * dim + l + j] = in[l * kIndexGroupSize + ingroup_id + j];
+          flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j];
       }
     }
-  }
 }
+} // namespace codepacker
 /** @} */
 }  // namespace raft::neighbors::ivf_flat::helpers
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 964292f6cb..1d724bada7 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -18,9 +18,14 @@
 
 #include "../ann_common.h"
 #include "../ivf_flat.cuh"
+#include <cstring>
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include "processing.cuh"
+#include "raft/core/host_mdarray.hpp"
+#include "raft/neighbors/ivf_flat_types.hpp"
+#include "raft/neighbors/ivf_flat_helpers.cuh"
+#include "raft/util/pow2_utils.cuh"
 #include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -73,6 +78,91 @@ void approx_knn_build_index(raft::resources const& handle,
     auto new_params               = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
     index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
       ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
+    
+    // raft::resource::sync_stream(handle);
+
+    // auto old_list = index->ivf_flat<T, int64_t>()->lists()[0];
+    // uint32_t n_rows   = old_list->size.load();
+    // uint32_t roundup = Pow2<raft::neighbors::ivf_flat::kIndexGroupSize>::roundUp(n_rows);
+
+    // RAFT_LOG_INFO("roundup %d, n_rows %d", roundup, n_rows);
+
+    // if (n_rows == 0) { return; }
+
+    // auto dim = index->ivf_flat<T, int64_t>()->dim();
+    // auto veclen = index -> ivf_flat<T, int64_t>()->veclen();
+    // RAFT_LOG_INFO("roundup %d, n_rows %d, veclen %d, dim %d", roundup, n_rows, veclen, dim);
+    // auto codes   = make_host_matrix<T>(roundup, dim);
+    // auto block = make_host_matrix<T>(roundup, dim);
+
+    // T* firstArray;
+    // cudaMemcpy(&firstArray, index->ivf_flat<T, int64_t>()->data_ptrs().data_handle(), sizeof(float*), cudaMemcpyDeviceToHost);  // Copy the pointer to the first array from device to host
+
+    // raft::print_device_vector("codes_gpu", firstArray, 1, std::cout);
+    // raft::update_host(codes.data_handle(), firstArray, (size_t)(roundup * dim), stream);
+    // raft::resource::sync_stream(handle);
+    // raft::neighbors::ivf_flat::helpers::pack_host_interleaved(
+    //     codes.data_handle(),
+    //     block.data_handle(),
+    //     n_rows,
+    //     dim,
+    //     veclen);
+    
+    // RAFT_LOG_INFO("veclen %d", veclen);
+    // raft::print_host_vector("codes", codes.data_handle(), roundup * dim, std::cout);
+    // raft::print_host_vector("block", block.data_handle(), roundup * dim, std::cout);
+    // // auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    // copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    // ivf_flat::helpers::pack_list_data(handle_, *index, codes.view(), label, 0);
+    // ivf_pq::helpers::erase_list(handle_, index, label);
+    // ivf_pq::helpers::extend_list_with_codes<IdxT>(
+    //   handle_, index, codes.view(), indices.view(), label);
+
+    // auto& new_list = index->lists()[label];
+    // ASSERT_NE(old_list.get(), new_list.get())
+    //   << "The old list should have been shared and retained after ivf_pq index has erased the "
+    //      "corresponding cluster.";
+    // auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
+    //                       new_list->data.extent(2) * new_list->data.extent(3);
+
+    // ASSERT_TRUE(old_list->data.size() >= list_data_size);
+    // ASSERT_TRUE(new_list->data.size() >= list_data_size);
+    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+    //                         new_list->data.data_handle(),
+    //                         list_data_size,
+    //                         Compare<uint8_t>{}));
+
+    // // Pack a few vectors back to the list.
+    // int row_offset = 9;
+    // int n_vec      = 3;
+    // ASSERT_TRUE(row_offset + n_vec < n_rows);
+    // size_t offset      = row_offset * index->pq_dim();
+    // auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
+    //   codes.data_handle() + offset, n_vec, index->pq_dim());
+    // ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
+    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+    //                         new_list->data.data_handle(),
+    //                         list_data_size,
+    //                         Compare<uint8_t>{}));
+
+    // Another test with the API that take list_data directly
+    // auto list_data  = index->lists()[label]->data.view();
+    // uint32_t n_take = 4;
+    // ASSERT_TRUE(row_offset + n_take < n_rows);
+    // auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
+    // ivf_pq::helpers::codepacker::unpack(
+    //   handle_, list_data, index->pq_bits(), row_offset, codes2.view());
+
+    // // Write it back
+    // ivf_pq::helpers::codepacker::pack(
+    //   handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data);
+    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+    //                         new_list->data.data_handle(),
+    //                         list_data_size,
+    //                         Compare<uint8_t>{}));
+  // }
+
   } else if (ivf_pq_pams) {
     neighbors::ivf_pq::index_params params;
     params.metric     = metric;

From 897338e227ac184061ab98b11670f377c52b0bbb Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 17 Jul 2023 09:50:43 -0700
Subject: [PATCH 04/22] refactor codepacker (does not build)

---
 .../all_cuda-118_arch-x86_64.yaml             |  60 ------
 .../all_cuda-120_arch-x86_64.yaml             |  56 ------
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  38 ----
 .../raft/neighbors/detail/ivf_flat_build.cuh  | 179 ++++++------------
 .../raft/neighbors/ivf_flat_codepacker.cuh    |  89 +++++++++
 .../raft/neighbors/ivf_flat_helpers.cuh       |  82 ++------
 .../raft/spatial/knn/detail/ann_quantized.cuh |  89 ---------
 cpp/test/neighbors/ann_ivf_flat.cuh           |  90 ++++++++-
 8 files changed, 252 insertions(+), 431 deletions(-)
 delete mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml
 delete mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml
 delete mode 100644 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
 create mode 100644 cpp/include/raft/neighbors/ivf_flat_codepacker.cuh

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
deleted file mode 100644
index 546728d2c6..0000000000
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-# This file is generated by `rapids-dependency-file-generator`.
-# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- breathe
-- c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
-- cmake>=3.23.1,!=3.25.0
-- cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
-- cuda-version=11.8
-- cudatoolkit
-- cupy>=12.0.0
-- cxx-compiler
-- cython>=0.29,<0.30
-- dask-core>=2023.5.1
-- dask-cuda==23.8.*
-- dask>=2023.5.1
-- distributed>=2023.5.1
-- doxygen>=1.8.20
-- gcc_linux-64=11.*
-- gmock>=1.13.0
-- graphviz
-- gtest>=1.13.0
-- ipython
-- joblib>=0.11
-- libcublas-dev=11.11.3.6
-- libcublas=11.11.3.6
-- libcurand-dev=10.3.0.86
-- libcurand=10.3.0.86
-- libcusolver-dev=11.4.1.48
-- libcusolver=11.4.1.48
-- libcusparse-dev=11.7.5.86
-- libcusparse=11.7.5.86
-- nccl>=2.9.9
-- ninja
-- numba>=0.57
-- numpy>=1.21
-- numpydoc
-- pydata-sphinx-theme
-- pytest
-- pytest-cov
-- recommonmark
-- rmm==23.8.*
-- scikit-build>=0.13.1
-- scikit-learn
-- scipy
-- sphinx-copybutton
-- sphinx-markdown-tables
-- sysroot_linux-64==2.17
-- ucx-proc=*=gpu
-- ucx-py==0.33.*
-- ucx>=1.13.0
-name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
deleted file mode 100644
index 0ae6154078..0000000000
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# This file is generated by `rapids-dependency-file-generator`.
-# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- breathe
-- c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
-- cmake>=3.23.1,!=3.25.0
-- cuda-cudart-dev
-- cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
-- cuda-version=12.0
-- cupy>=12.0.0
-- cxx-compiler
-- cython>=0.29,<0.30
-- dask-core>=2023.5.1
-- dask-cuda==23.8.*
-- dask>=2023.5.1
-- distributed>=2023.5.1
-- doxygen>=1.8.20
-- gcc_linux-64=11.*
-- gmock>=1.13.0
-- graphviz
-- gtest>=1.13.0
-- ipython
-- joblib>=0.11
-- libcublas-dev
-- libcurand-dev
-- libcusolver-dev
-- libcusparse-dev
-- nccl>=2.9.9
-- ninja
-- numba>=0.57
-- numpy>=1.21
-- numpydoc
-- pydata-sphinx-theme
-- pytest
-- pytest-cov
-- recommonmark
-- rmm==23.8.*
-- scikit-build>=0.13.1
-- scikit-learn
-- scipy
-- sphinx-copybutton
-- sphinx-markdown-tables
-- sysroot_linux-64==2.17
-- ucx-proc=*=gpu
-- ucx-py==0.33.*
-- ucx>=1.13.0
-name: all_cuda-120_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
deleted file mode 100644
index 74b966cc03..0000000000
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-# This file is generated by `rapids-dependency-file-generator`.
-# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-channels:
-- rapidsai
-- rapidsai-nightly
-- dask/label/dev
-- conda-forge
-- nvidia
-dependencies:
-- c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
-- cmake>=3.23.1,!=3.25.0
-- cuda-profiler-api=11.8.86
-- cuda-version=11.8
-- cudatoolkit
-- cxx-compiler
-- cython>=0.29,<0.30
-- faiss-proc=*=cuda
-- gcc_linux-64=11.*
-- glog>=0.6.0
-- h5py>=3.8.0
-- hnswlib=0.7.0
-- libcublas-dev=11.11.3.6
-- libcublas=11.11.3.6
-- libcurand-dev=10.3.0.86
-- libcurand=10.3.0.86
-- libcusolver-dev=11.4.1.48
-- libcusolver=11.4.1.48
-- libcusparse-dev=11.7.5.86
-- libcusparse=11.7.5.86
-- libfaiss>=1.7.1
-- nccl>=2.9.9
-- ninja
-- nlohmann_json>=3.11.2
-- scikit-build>=0.13.1
-- sysroot_linux-64==2.17
-name: bench_ann_cuda-118_arch-x86_64
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index fcca19be0e..4615ddba57 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -26,6 +26,7 @@
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
+#include <raft/neighbors/ivf_flat_codepacker.cuh>
 #include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_list_types.hpp>
@@ -418,132 +419,60 @@ inline void fill_refinement_index(raft::resources const& handle,
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
+template <typename T>
+__global__ void pack_interleaved_list_kernel(
+  const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen)
+{
+  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < n_rows) {
+    codepacker::pack_1_interleaved(
+      codes + tid * dim, list_data, dim, veclen, tid);
+    }
+}
 
+template <typename T>
+__global__ void unpack_interleaved_list_kernel(
+  const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen)
+{
+  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < n_rows) {
+    codepacker::unpack_1_interleaved(
+      codes + tid * dim, list_data, dim, veclen, tid);
+    }
+}
 
-// template <typename T, typename IdxT, uint32_t BlockSize>
-// __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32(
-//   T* out_codes,
-//   T* in_list_data,
-//   uint32_t n_rows,
-//   uint32_t dim,
-//   uint32_t veclen)
-// {
-//   const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
-//   if (i >= n_rows * dim) { return; }
-
-//   auto col = i % kIndexGroupSize * veclen;
-//   auto row = i / (kIndexGroupSize * veclen);
-
-//   auto vec = 
-
-//   // The data is written in interleaved groups of `index::kGroupSize` vectors
-//   using interleaved_group = Pow2<kIndexGroupSize>;
-//   auto group_offset       = interleaved_group::roundDown(row);
-//   auto ingroup_id         = interleaved_group::mod(row) * veclen;
-
-//   // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
-//   auto within_group_offset = Pow2<4>::quot(col);
-
-//   // Interleave dimensions of the source vector while recording it.
-//   // NB: such `veclen` is selected, that `dim % veclen == 0`
-//   out_codes[] = in_list_data[i];
-// }
-
-// /**
-//  * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
-//  *
-//  * @param[out] codes flat codes, [n_rows, dim]
-//  * @param[in] list_data the packed ivf::list data.
-//  * @param[in] row_offset how many rows in the list to skip.
-//  * @param[in] stream
-//  */
-// template<typename T, typename IdxT>
-// inline void unpack_list_data_float32(
-//   raft::resources const& handle,
-//   device_matrix_view<T, uint32_t, row_major> codes,
-//   device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
-//   uint32_t row_offset)
-// {
-//   auto stream = raft::resource::get_cuda_stream(handle);
-//   auto n_rows = codes.extent(0);
-//   if (n_rows == 0) { return; }
-
-//   auto dim = codes.extent(1);
-
-//   n_rows -= row_offset;
-//   constexpr uint32_t kBlockSize = 256;
-//   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
-//   dim3 threads(kBlockSize, 1, 1);
-//   auto kernel = pack_list_data_kernel_float32<T, IdxT, kBlockSize>;
-//   kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
-//   list_data.data_handle(),
-//   n_rows,
-//   dim,
-//   4);
-//   RAFT_CUDA_TRY(cudaPeekAtLastError());
-// }
-
-
-
-// template <typename T, typename IdxT, uint32_t BlockSize>
-// __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel_float32(
-//   T* list_data,
-//   T* codes,
-//   uint32_t n_rows,
-//   uint32_t dim,
-//   uint32_t veclen)
-// {
-//   const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x;
-//   if (i >= n_rows * dim) { return; }
-
-//   auto col = i % dim;
-//   auto row = i / n_rows;
-
-//   // The data is written in interleaved groups of `index::kGroupSize` vectors
-//   using interleaved_group = Pow2<kIndexGroupSize>;
-//   auto group_offset       = interleaved_group::roundDown(row);
-//   auto ingroup_id         = interleaved_group::mod(row) * veclen;
-
-//   // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4
-//   auto within_group_offset = Pow2<4>::quot(col);
-
-//   // Interleave dimensions of the source vector while recording it.
-//   // NB: such `veclen` is selected, that `dim % veclen == 0`
-//   list_data[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = codes[i];
-// }
-
-// /**
-//  * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset.
-//  *
-//  * @param[out] codes flat codes, [n_rows, dim]
-//  * @param[in] list_data the packed ivf::list data.
-//  * @param[in] row_offset how many rows in the list to skip.
-//  * @param[in] stream
-//  */
-// template<typename T, typename IdxT>
-// inline void pack_list_data_float32(
-//   raft::resources const& handle,
-//   device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
-//   device_matrix_view<const T, uint32_t, row_major> codes,
-//   uint32_t row_offset)
-// {
-//   auto stream = raft::resource::get_cuda_stream(handle);
-//   auto n_rows = codes.extent(0);
-//   if (n_rows == 0) { return; }
-
-//   auto dim = codes.extent(1);
-
-//   n_rows -= row_offset;
-//   constexpr uint32_t kBlockSize = 256;
-//   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
-//   dim3 threads(kBlockSize, 1, 1);
-//   auto kernel = pack_list_data_kernel_float32<T, IdxT, kBlockSize>;
-//   kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(),
-//   list_data.data_handle(),
-//   n_rows,
-//   dim,
-//   4);
-//   RAFT_CUDA_TRY(cudaPeekAtLastError());
-// }
+template <typename T>
+void pack_list_data(
+  raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
+  uint32_t veclen,
+  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
+{
+  uint32_t n_rows = codes.extent(0);
+  uint32_t dim = codes.extent(1);
+  static constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto stream = resource::get_cuda_stream(res);
+  pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename T>
+void unpack_list_data(
+  raft::resources const& res,
+  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data,
+  uint32_t veclen,
+  device_matrix_view<const T, uint32_t, row_major> codes)
+{
+  uint32_t n_rows = codes.extent(0);
+  uint32_t dim = codes.extent(1);
+  static constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto stream = resource::get_cuda_stream(res);
+  unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
 
 }  // namespace raft::neighbors::ivf_flat::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
new file mode 100644
index 0000000000..2a6dc55c10
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/neighbors/ivf_flat_types.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+namespace raft::neighbors::ivf_flat::codepacker {
+/**
+ * Write one flat code into a block by the given offset. The offset indicates the id of the record
+ * in the list. This function interleaves the code and is intended to later copy the interleaved
+ * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit
+ * the record (offset + 1).
+ *
+ * @tparam T
+ *
+ * @param[in] flat_code input flat code
+ * @param[out] block block of memory to write interleaved codes to
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
+template <typename T>
+__host__ __device__ void pack_1_interleaved(
+  const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset)
+{
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+
+  // Interleave dimensions of the source vector while recording it.
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  auto group_offset = interleaved_group::roundDown(offset);
+  auto ingroup_id   = interleaved_group::mod(offset) * veclen;
+
+  for (uint32_t l = 0; l < dim; l += veclen) {
+    for (uint32_t j = 0; j < veclen; j++) {
+      block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j];
+    }
+  }
+}
+
+/**
+ * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset
+ * indicates the id of the record. This function fetches one flat code from an interleaved code.
+ *
+ * @tparam T
+ *
+ * @param[in] block interleaved block. The block can be thought of as the whole inverted list in
+ * interleaved format.
+ * @param[out] flat_code output flat code
+ * @param[in] dim dimension of the flat code
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset fetch the flat code by the given offset
+ */
+template <typename T>
+__host__ __device__ void unpack_1_interleaved(
+  const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset)
+{
+  // The data is written in interleaved groups of `index::kGroupSize` vectors
+  using interleaved_group = Pow2<kIndexGroupSize>;
+
+  // NB: such `veclen` is selected, that `dim % veclen == 0`
+  auto group_offset = interleaved_group::roundDown(offset);
+  auto ingroup_id   = interleaved_group::mod(offset) * veclen;
+
+  for (uint32_t l = 0; l < dim; l += veclen) {
+    for (uint32_t j = 0; j < veclen; j++) {
+      flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j];
+    }
+  }
+}
+}
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index 85272d01d1..135467ad32 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -23,8 +23,6 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
 
-// #include <omp.h>
-
 namespace raft::neighbors::ivf_flat::helpers {
 /**
  * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index
@@ -32,72 +30,32 @@ namespace raft::neighbors::ivf_flat::helpers {
  */
 
 namespace codepacker {
-/**
- * Write one flat code into a block by the given offset. The offset indicates the id of the record in the list. This function interleaves the code and is intended to later copy the interleaved codes over to the IVF list on device.
- * NB: no memory allocation happens here; the block must fit the record (offset + 1).
- *
- * @tparam T
- *
- * @param[in] flat_code input flat code
- * @param[out] block block of memory to write interleaved codes to
- * @param[in] dim dimension of the flat code
- * @param[in] veclen size of interleaved data chunks
- * @param[in] offset how many records to skip before writing the data into the list
- */
-template <typename T>
-__host__ __device__ void pack_1_interleaved(
-        const T* flat_code,
-        T* block,
-        uint32_t dim,
-        uint32_t veclen,
-        uint32_t offset) {    
-  // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
 
-  // Interleave dimensions of the source vector while recording it.
-  // NB: such `veclen` is selected, that `dim % veclen == 0`
-    auto group_offset = interleaved_group::roundDown(offset);
-    auto ingroup_id = interleaved_group::mod(offset) * veclen;
 
-    for (uint32_t l = 0; l < dim; l += veclen) {
-      for (uint32_t j = 0; j < veclen; j++) {
-        block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j];
-      }
-    }
+template <typename T>
+inline void pack_full_list(
+  raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
+  uint32_t veclen,
+  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
+{
+  raft::neighbors::ivf_flat::detail::pack_list_data(res,
+  codes,
+  veclen,
+  list_data);
 }
 
-/**
- * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset indicates the id of the record. This function fetches one flat code from an interleaved code.
- *
- * @tparam T
- *
- * @param[in] block interleaved block. The block can be thought of as the whole inverted list in interleaved format.
- * @param[out] flat_code output flat code
- * @param[in] dim dimension of the flat code
- * @param[in] veclen size of interleaved data chunks
- * @param[in] offset fetch the flat code by the given offset
- */
 template <typename T>
-__host__ __device__ void unpack_1_interleaved(
-        const T* block,
-        T* flat_code,
-        uint32_t dim,
-        uint32_t veclen,
-        uint32_t offset) {
-
-  // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
-
-  // NB: such `veclen` is selected, that `dim % veclen == 0`
-  auto group_offset = interleaved_group::roundDown(offset);
-  auto ingroup_id = interleaved_group::mod(offset) * veclen;
-
-    for (uint32_t l = 0; l < dim; l += veclen) {
-      for (uint32_t j = 0; j < veclen; j++) {
-          flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j];
-      }
-    }
+inline void unpack_full_list(
+  raft::resources const& res,
+  device_mdspan<const T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data,
+  uint32_t veclen,
+  device_matrix_view<T, uint32_t, row_major> codes)
+{
+  raft::neighbors::ivf_flat::detail::unpack_list_data(res,
+    list_data, veclen, codes);
 }
 } // namespace codepacker
 /** @} */
 }  // namespace raft::neighbors::ivf_flat::helpers
+
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 1d724bada7..25339a6391 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -22,10 +22,6 @@
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include "processing.cuh"
-#include "raft/core/host_mdarray.hpp"
-#include "raft/neighbors/ivf_flat_types.hpp"
-#include "raft/neighbors/ivf_flat_helpers.cuh"
-#include "raft/util/pow2_utils.cuh"
 #include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
@@ -78,91 +74,6 @@ void approx_knn_build_index(raft::resources const& handle,
     auto new_params               = from_legacy_index_params(*ivf_ft_pams, metric, metricArg);
     index->ivf_flat<T, int64_t>() = std::make_unique<const ivf_flat::index<T, int64_t>>(
       ivf_flat::build(handle, new_params, index_array, int64_t(n), D));
-    
-    // raft::resource::sync_stream(handle);
-
-    // auto old_list = index->ivf_flat<T, int64_t>()->lists()[0];
-    // uint32_t n_rows   = old_list->size.load();
-    // uint32_t roundup = Pow2<raft::neighbors::ivf_flat::kIndexGroupSize>::roundUp(n_rows);
-
-    // RAFT_LOG_INFO("roundup %d, n_rows %d", roundup, n_rows);
-
-    // if (n_rows == 0) { return; }
-
-    // auto dim = index->ivf_flat<T, int64_t>()->dim();
-    // auto veclen = index -> ivf_flat<T, int64_t>()->veclen();
-    // RAFT_LOG_INFO("roundup %d, n_rows %d, veclen %d, dim %d", roundup, n_rows, veclen, dim);
-    // auto codes   = make_host_matrix<T>(roundup, dim);
-    // auto block = make_host_matrix<T>(roundup, dim);
-
-    // T* firstArray;
-    // cudaMemcpy(&firstArray, index->ivf_flat<T, int64_t>()->data_ptrs().data_handle(), sizeof(float*), cudaMemcpyDeviceToHost);  // Copy the pointer to the first array from device to host
-
-    // raft::print_device_vector("codes_gpu", firstArray, 1, std::cout);
-    // raft::update_host(codes.data_handle(), firstArray, (size_t)(roundup * dim), stream);
-    // raft::resource::sync_stream(handle);
-    // raft::neighbors::ivf_flat::helpers::pack_host_interleaved(
-    //     codes.data_handle(),
-    //     block.data_handle(),
-    //     n_rows,
-    //     dim,
-    //     veclen);
-    
-    // RAFT_LOG_INFO("veclen %d", veclen);
-    // raft::print_host_vector("codes", codes.data_handle(), roundup * dim, std::cout);
-    // raft::print_host_vector("block", block.data_handle(), roundup * dim, std::cout);
-    // // auto indices = make_device_vector<IdxT>(handle_, n_rows);
-    // copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
-
-    // ivf_flat::helpers::pack_list_data(handle_, *index, codes.view(), label, 0);
-    // ivf_pq::helpers::erase_list(handle_, index, label);
-    // ivf_pq::helpers::extend_list_with_codes<IdxT>(
-    //   handle_, index, codes.view(), indices.view(), label);
-
-    // auto& new_list = index->lists()[label];
-    // ASSERT_NE(old_list.get(), new_list.get())
-    //   << "The old list should have been shared and retained after ivf_pq index has erased the "
-    //      "corresponding cluster.";
-    // auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
-    //                       new_list->data.extent(2) * new_list->data.extent(3);
-
-    // ASSERT_TRUE(old_list->data.size() >= list_data_size);
-    // ASSERT_TRUE(new_list->data.size() >= list_data_size);
-    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
-    //                         new_list->data.data_handle(),
-    //                         list_data_size,
-    //                         Compare<uint8_t>{}));
-
-    // // Pack a few vectors back to the list.
-    // int row_offset = 9;
-    // int n_vec      = 3;
-    // ASSERT_TRUE(row_offset + n_vec < n_rows);
-    // size_t offset      = row_offset * index->pq_dim();
-    // auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
-    //   codes.data_handle() + offset, n_vec, index->pq_dim());
-    // ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
-    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
-    //                         new_list->data.data_handle(),
-    //                         list_data_size,
-    //                         Compare<uint8_t>{}));
-
-    // Another test with the API that take list_data directly
-    // auto list_data  = index->lists()[label]->data.view();
-    // uint32_t n_take = 4;
-    // ASSERT_TRUE(row_offset + n_take < n_rows);
-    // auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
-    // ivf_pq::helpers::codepacker::unpack(
-    //   handle_, list_data, index->pq_bits(), row_offset, codes2.view());
-
-    // // Write it back
-    // ivf_pq::helpers::codepacker::pack(
-    //   handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data);
-    // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
-    //                         new_list->data.data_handle(),
-    //                         list_data_size,
-    //                         Compare<uint8_t>{}));
-  // }
-
   } else if (ivf_pq_pams) {
     neighbors::ivf_pq::index_params params;
     params.metric     = metric;
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index a252b26600..b2fa4de90a 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -17,6 +17,11 @@
 
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
+#include "raft/core/device_mdarray.hpp"
+#include "raft/core/host_mdarray.hpp"
+#include "raft/linalg/map.cuh"
+#include "raft/util/cudart_utils.hpp"
+#include "raft/util/fast_int_div.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 
@@ -26,6 +31,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/ivf_flat.cuh>
+#include <raft/neighbors/ivf_flat_codepacker.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -36,6 +42,7 @@
 
 #include <gtest/gtest.h>
 
+#include <rmm/device_uvector.hpp>
 #include <thrust/sequence.h>
 
 #include <cstddef>
@@ -65,6 +72,15 @@ template <typename IdxT>
   return os;
 }
 
+template <typename DataT, typename IdxT>
+void flat_codes_from_indices(raft::resources const& handle, IdxT* indices, DataT* data, uint32_t n_rows, uint32_t dim, DataT* flat_codes) {
+  raft::linalg::map_offset(handle, raft::make_device_vector_view(flat_codes, n_rows * dim), [dim = util::FastIntDiv(dim), indices, data]__device__ (auto idx) {
+              auto row = idx / dim;
+              auto col = idx % dim;
+              return data[indices[row] * dim + col];
+            });
+}
+
 template <typename T, typename DataT, typename IdxT>
 class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
  public:
@@ -264,6 +280,79 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     }
   }
 
+  void testPacker() {
+    ivf_flat::index_params index_params;
+        index_params.n_lists          = ps.n_list;
+        index_params.metric           = ps.metric;
+        index_params.adaptive_centers = false;
+
+        index_params.add_data_on_build        = true;
+        index_params.kmeans_trainset_fraction = 0.5;
+        index_params.metric_arg               = 0;
+
+        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+
+        auto index = ivf_flat::build(handle_, index_params, database_view);
+
+        auto list_sizes = raft::make_host_vector<uint32_t>(index.n_lists());
+            update_host(list_sizes.data_handle(),
+            index.list_sizes().data_handle(),
+            index.n_lists(),
+            stream_);
+        auto data_ptrs = raft::make_host_vector<DataT*>(index.n_lists());
+            update_host(data_ptrs.data_handle(),
+            index.data_ptrs().data_handle(),
+            index.n_lists(),
+            stream_);
+          auto inds_ptrs = raft::make_host_vector<IdxT*>(index.n_lists());
+            update_host(inds_ptrs.data_handle(),
+            index.inds_ptrs().data_handle(),
+            index.n_lists(),
+            stream_);
+            resource::sync_stream(handle_);
+
+            auto list_device_spec = list_spec<uint32_t, T, uint32_t>{ps.dim, true};
+
+        for(uint32_t label = 0; label < index.n_lists(); label++) {
+            
+            rmm::device_uvector<T> interleaved_data(Pow2<kIndexGroupSize>::roundUp(list_sizes[label] * ps.dim), stream_);
+            list_device_spec.make_list_extents(list_sizes[label]);
+
+            // fetch the flat codes
+            rmm::device_uvector<T> flat_codes(list_sizes[label] * ps.dim, stream_);
+            
+            flat_codes_from_indices(handle_, inds_ptrs + label, data_ptrs + label, list_sizes[label], ps.dim, flat_codes.data_handle());
+
+            helpers::codepacker::pack_full_list(
+  handle_,
+  flat_codes.view(),
+  index.veclen(),
+  make_device_vector_view<T>(interleaved_data.data(), Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim));
+
+            ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(),
+                                        data_ptrs + label,
+                                        Pow2<kIndexGroupSize>::roundUp(list_sizes[label]) * ps.dim,
+                                        raft::Compare<float>(),
+                                        stream_));
+            
+  rmm::device_uvector<DataT> unpacked_flat_codes(list_sizes[label] * ps.dim, stream_);
+            
+            helpers::codepacker::unpack_full_list(
+  handle_,
+  raft::make_device_matrix_view(data_ptrs[label], Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim),
+  index.veclen(),
+  make_device_vector_view<T>(unpacked_flat_codes.data(), list_sizes[label], ps.dim));
+
+            ASSERT_TRUE(raft::devArrMatch(flat_codes.data(),
+                                        unpacked_flat_codes.data(),
+                                        list_sizes[label] * ps.dim,
+                                        raft::Compare<int>(),
+                                        stream_));
+
+          }
+  }
+
   void SetUp() override
   {
     database.resize(ps.num_db_vecs * ps.dim, stream_);
@@ -356,5 +445,4 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
    raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
    raft::distance::DistanceType::InnerProduct,
    false}};
-
 }  // namespace raft::neighbors::ivf_flat

From 2a2ee515a323c93a5ab922adf7e8176fa7a8593a Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 17 Jul 2023 09:57:29 -0700
Subject: [PATCH 05/22] Undo deletions

---
 .../all_cuda-118_arch-x86_64.yaml             | 60 +++++++++++++++++++
 .../all_cuda-120_arch-x86_64.yaml             | 57 ++++++++++++++++++
 .../bench_ann_cuda-118_arch-x86_64.yaml       | 38 ++++++++++++
 3 files changed, 155 insertions(+)
 create mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml
 create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml
 create mode 100644 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
new file mode 100644
index 0000000000..67dd01ada9
--- /dev/null
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -0,0 +1,60 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- breathe
+- c-compiler
+- clang-tools=16.0.1
+- clang=16.0.1
+- cmake>=3.23.1,!=3.25.0
+- cuda-profiler-api=11.8.86
+- cuda-python>=11.7.1,<12.0a0
+- cuda-version=11.8
+- cudatoolkit
+- cupy>=12.0.0
+- cxx-compiler
+- cython>=0.29,<0.30
+- dask-core>=2023.5.1
+- dask-cuda==23.8.*
+- dask>=2023.5.1
+- distributed>=2023.5.1
+- doxygen>=1.8.20
+- gcc_linux-64=11.*
+- gmock>=1.13.0
+- graphviz
+- gtest>=1.13.0
+- ipython
+- joblib>=0.11
+- libcublas-dev=11.11.3.6
+- libcublas=11.11.3.6
+- libcurand-dev=10.3.0.86
+- libcurand=10.3.0.86
+- libcusolver-dev=11.4.1.48
+- libcusolver=11.4.1.48
+- libcusparse-dev=11.7.5.86
+- libcusparse=11.7.5.86
+- nccl>=2.9.9
+- ninja
+- numba>=0.57
+- numpy>=1.21
+- numpydoc
+- pydata-sphinx-theme
+- pytest
+- pytest-cov
+- recommonmark
+- rmm==23.8.*
+- scikit-build>=0.13.1
+- scikit-learn
+- scipy
+- sphinx-copybutton
+- sphinx-markdown-tables
+- sysroot_linux-64==2.17
+- ucx-proc=*=gpu
+- ucx-py==0.33.*
+- ucx>=1.13.0
+name: all_cuda-118_arch-x86_64
\ No newline at end of file
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
new file mode 100644
index 0000000000..f1d56d4ff6
--- /dev/null
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -0,0 +1,57 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- breathe
+- c-compiler
+- clang-tools=16.0.1
+- clang=16.0.1
+- cmake>=3.23.1,!=3.25.0
+- cuda-cudart-dev
+- cuda-profiler-api
+- cuda-python>=12.0,<13.0a0
+- cuda-version=12.0
+- cupy>=12.0.0
+- cxx-compiler
+- cython>=0.29,<0.30
+- dask-core>=2023.5.1
+- dask-cuda==23.8.*
+- dask>=2023.5.1
+- distributed>=2023.5.1
+- doxygen>=1.8.20
+- gcc_linux-64=11.*
+- gmock>=1.13.0
+- graphviz
+- gtest>=1.13.0
+- ipython
+- joblib>=0.11
+- libcublas-dev
+- libcurand-dev
+- libcusolver-dev
+- libcusparse-dev
+- nccl>=2.9.9
+- ninja
+- numba>=0.57
+- numpy>=1.21
+- numpydoc
+- pydata-sphinx-theme
+- pytest
+- pytest-cov
+- recommonmark
+- rmm==23.8.*
+- scikit-build>=0.13.1
+- scikit-learn
+- scipy
+- sphinx-copybutton
+- sphinx-markdown-tables
+- sysroot_linux-64==2.17
+- ucx-proc=*=gpu
+- ucx-py==0.33.*
+- ucx>=1.13.0
+name: all_cuda-120_arch-x86_64
+
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
new file mode 100644
index 0000000000..0869be212f
--- /dev/null
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -0,0 +1,38 @@
+# This file is generated by `rapids-dependency-file-generator`.
+# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+channels:
+- rapidsai
+- rapidsai-nightly
+- dask/label/dev
+- conda-forge
+- nvidia
+dependencies:
+- c-compiler
+- clang-tools=16.0.1
+- clang=16.0.1
+- cmake>=3.23.1,!=3.25.0
+- cuda-profiler-api=11.8.86
+- cuda-version=11.8
+- cudatoolkit
+- cxx-compiler
+- cython>=0.29,<0.30
+- faiss-proc=*=cuda
+- gcc_linux-64=11.*
+- glog>=0.6.0
+- h5py>=3.8.0
+- hnswlib=0.7.0
+- libcublas-dev=11.11.3.6
+- libcublas=11.11.3.6
+- libcurand-dev=10.3.0.86
+- libcurand=10.3.0.86
+- libcusolver-dev=11.4.1.48
+- libcusolver=11.4.1.48
+- libcusparse-dev=11.7.5.86
+- libcusparse=11.7.5.86
+- libfaiss>=1.7.1
+- nccl>=2.9.9
+- ninja
+- nlohmann_json>=3.11.2
+- scikit-build>=0.13.1
+- sysroot_linux-64==2.17
+name: bench_ann_cuda-118_arch-x86_64
\ No newline at end of file

From 834dd2c17c3c7820a2edbfcc9d293ce1b1b14fc8 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 17 Jul 2023 10:00:13 -0700
Subject: [PATCH 06/22] undo yaml changes

---
 conda/environments/all_cuda-120_arch-x86_64.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index f1d56d4ff6..589fb4920b 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -53,5 +53,4 @@ dependencies:
 - ucx-proc=*=gpu
 - ucx-py==0.33.*
 - ucx>=1.13.0
-name: all_cuda-120_arch-x86_64
-
+name: all_cuda-120_arch-x86_64
\ No newline at end of file

From 60134298533a92b22c7db8e3fa79b720b25800c4 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 17 Jul 2023 10:07:25 -0700
Subject: [PATCH 07/22] style

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  |  24 +--
 .../raft/neighbors/ivf_flat_codepacker.cuh    |   2 +-
 .../raft/neighbors/ivf_flat_helpers.cuh       |  15 +-
 cpp/test/neighbors/ann_ivf_flat.cuh           | 157 +++++++++---------
 4 files changed, 100 insertions(+), 98 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 4615ddba57..edd2b07701 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -425,9 +425,8 @@ __global__ void pack_interleaved_list_kernel(
 {
   auto tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
-    codepacker::pack_1_interleaved(
-      codes + tid * dim, list_data, dim, veclen, tid);
-    }
+    codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
+  }
 }
 
 template <typename T>
@@ -436,9 +435,8 @@ __global__ void unpack_interleaved_list_kernel(
 {
   auto tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
-    codepacker::unpack_1_interleaved(
-      codes + tid * dim, list_data, dim, veclen, tid);
-    }
+    codepacker::unpack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
+  }
 }
 
 template <typename T>
@@ -448,13 +446,14 @@ void pack_list_data(
   uint32_t veclen,
   device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
 {
-  uint32_t n_rows = codes.extent(0);
-  uint32_t dim = codes.extent(1);
+  uint32_t n_rows                      = codes.extent(0);
+  uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
-  pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
+  pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
+    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -465,13 +464,14 @@ void unpack_list_data(
   uint32_t veclen,
   device_matrix_view<const T, uint32_t, row_major> codes)
 {
-  uint32_t n_rows = codes.extent(0);
-  uint32_t dim = codes.extent(1);
+  uint32_t n_rows                      = codes.extent(0);
+  uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
-  unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
+  unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
+    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
index 2a6dc55c10..9b12761daf 100644
--- a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
@@ -86,4 +86,4 @@ __host__ __device__ void unpack_1_interleaved(
     }
   }
 }
-}
\ No newline at end of file
+}  // namespace raft::neighbors::ivf_flat::codepacker
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index 135467ad32..7ed0b7ec8e 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -31,7 +31,6 @@ namespace raft::neighbors::ivf_flat::helpers {
 
 namespace codepacker {
 
-
 template <typename T>
 inline void pack_full_list(
   raft::resources const& res,
@@ -39,23 +38,19 @@ inline void pack_full_list(
   uint32_t veclen,
   device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
 {
-  raft::neighbors::ivf_flat::detail::pack_list_data(res,
-  codes,
-  veclen,
-  list_data);
+  raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data);
 }
 
 template <typename T>
 inline void unpack_full_list(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data,
+  device_mdspan<const T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major>
+    list_data,
   uint32_t veclen,
   device_matrix_view<T, uint32_t, row_major> codes)
 {
-  raft::neighbors::ivf_flat::detail::unpack_list_data(res,
-    list_data, veclen, codes);
+  raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes);
 }
-} // namespace codepacker
+}  // namespace codepacker
 /** @} */
 }  // namespace raft::neighbors::ivf_flat::helpers
-
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index b2fa4de90a..17952c4019 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -73,12 +73,20 @@ template <typename IdxT>
 }
 
 template <typename DataT, typename IdxT>
-void flat_codes_from_indices(raft::resources const& handle, IdxT* indices, DataT* data, uint32_t n_rows, uint32_t dim, DataT* flat_codes) {
-  raft::linalg::map_offset(handle, raft::make_device_vector_view(flat_codes, n_rows * dim), [dim = util::FastIntDiv(dim), indices, data]__device__ (auto idx) {
-              auto row = idx / dim;
-              auto col = idx % dim;
-              return data[indices[row] * dim + col];
-            });
+void flat_codes_from_indices(raft::resources const& handle,
+                             IdxT* indices,
+                             DataT* data,
+                             uint32_t n_rows,
+                             uint32_t dim,
+                             DataT* flat_codes)
+{
+  raft::linalg::map_offset(handle,
+                           raft::make_device_vector_view(flat_codes, n_rows * dim),
+                           [dim = util::FastIntDiv(dim), indices, data] __device__(auto idx) {
+                             auto row = idx / dim;
+                             auto col = idx % dim;
+                             return data[indices[row] * dim + col];
+                           });
 }
 
 template <typename T, typename DataT, typename IdxT>
@@ -280,77 +288,76 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     }
   }
 
-  void testPacker() {
+  void testPacker()
+  {
     ivf_flat::index_params index_params;
-        index_params.n_lists          = ps.n_list;
-        index_params.metric           = ps.metric;
-        index_params.adaptive_centers = false;
-
-        index_params.add_data_on_build        = true;
-        index_params.kmeans_trainset_fraction = 0.5;
-        index_params.metric_arg               = 0;
-
-        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
-          (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
-
-        auto index = ivf_flat::build(handle_, index_params, database_view);
-
-        auto list_sizes = raft::make_host_vector<uint32_t>(index.n_lists());
-            update_host(list_sizes.data_handle(),
-            index.list_sizes().data_handle(),
-            index.n_lists(),
-            stream_);
-        auto data_ptrs = raft::make_host_vector<DataT*>(index.n_lists());
-            update_host(data_ptrs.data_handle(),
-            index.data_ptrs().data_handle(),
-            index.n_lists(),
-            stream_);
-          auto inds_ptrs = raft::make_host_vector<IdxT*>(index.n_lists());
-            update_host(inds_ptrs.data_handle(),
-            index.inds_ptrs().data_handle(),
-            index.n_lists(),
-            stream_);
-            resource::sync_stream(handle_);
-
-            auto list_device_spec = list_spec<uint32_t, T, uint32_t>{ps.dim, true};
-
-        for(uint32_t label = 0; label < index.n_lists(); label++) {
-            
-            rmm::device_uvector<T> interleaved_data(Pow2<kIndexGroupSize>::roundUp(list_sizes[label] * ps.dim), stream_);
-            list_device_spec.make_list_extents(list_sizes[label]);
-
-            // fetch the flat codes
-            rmm::device_uvector<T> flat_codes(list_sizes[label] * ps.dim, stream_);
-            
-            flat_codes_from_indices(handle_, inds_ptrs + label, data_ptrs + label, list_sizes[label], ps.dim, flat_codes.data_handle());
-
-            helpers::codepacker::pack_full_list(
-  handle_,
-  flat_codes.view(),
-  index.veclen(),
-  make_device_vector_view<T>(interleaved_data.data(), Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim));
-
-            ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(),
-                                        data_ptrs + label,
-                                        Pow2<kIndexGroupSize>::roundUp(list_sizes[label]) * ps.dim,
-                                        raft::Compare<float>(),
-                                        stream_));
-            
-  rmm::device_uvector<DataT> unpacked_flat_codes(list_sizes[label] * ps.dim, stream_);
-            
-            helpers::codepacker::unpack_full_list(
-  handle_,
-  raft::make_device_matrix_view(data_ptrs[label], Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim),
-  index.veclen(),
-  make_device_vector_view<T>(unpacked_flat_codes.data(), list_sizes[label], ps.dim));
-
-            ASSERT_TRUE(raft::devArrMatch(flat_codes.data(),
-                                        unpacked_flat_codes.data(),
-                                        list_sizes[label] * ps.dim,
-                                        raft::Compare<int>(),
-                                        stream_));
+    index_params.n_lists          = ps.n_list;
+    index_params.metric           = ps.metric;
+    index_params.adaptive_centers = false;
+
+    index_params.add_data_on_build        = true;
+    index_params.kmeans_trainset_fraction = 0.5;
+    index_params.metric_arg               = 0;
+
+    auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+      (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
+
+    auto index = ivf_flat::build(handle_, index_params, database_view);
+
+    auto list_sizes = raft::make_host_vector<uint32_t>(index.n_lists());
+    update_host(
+      list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_);
+    auto data_ptrs = raft::make_host_vector<DataT*>(index.n_lists());
+    update_host(data_ptrs.data_handle(), index.data_ptrs().data_handle(), index.n_lists(), stream_);
+    auto inds_ptrs = raft::make_host_vector<IdxT*>(index.n_lists());
+    update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_);
+    resource::sync_stream(handle_);
 
-          }
+    auto list_device_spec = list_spec<uint32_t, T, uint32_t>{ps.dim, true};
+
+    for (uint32_t label = 0; label < index.n_lists(); label++) {
+      rmm::device_uvector<T> interleaved_data(
+        Pow2<kIndexGroupSize>::roundUp(list_sizes[label] * ps.dim), stream_);
+      list_device_spec.make_list_extents(list_sizes[label]);
+
+      // fetch the flat codes
+      rmm::device_uvector<T> flat_codes(list_sizes[label] * ps.dim, stream_);
+
+      flat_codes_from_indices(handle_,
+                              inds_ptrs + label,
+                              data_ptrs + label,
+                              list_sizes[label],
+                              ps.dim,
+                              flat_codes.data_handle());
+
+      helpers::codepacker::pack_full_list(
+        handle_,
+        flat_codes.view(),
+        index.veclen(),
+        make_device_vector_view<T>(
+          interleaved_data.data(), Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim));
+
+      ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(),
+                                    data_ptrs + label,
+                                    Pow2<kIndexGroupSize>::roundUp(list_sizes[label]) * ps.dim,
+                                    raft::Compare<float>(),
+                                    stream_));
+
+      rmm::device_uvector<DataT> unpacked_flat_codes(list_sizes[label] * ps.dim, stream_);
+
+      helpers::codepacker::unpack_full_list(
+        handle_,
+        raft::make_device_matrix_view(
+          data_ptrs[label], Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim),
+        index.veclen(),
+        make_device_vector_view<T>(unpacked_flat_codes.data(), list_sizes[label], ps.dim));
+
+      ASSERT_TRUE(raft::devArrMatch(flat_codes.data(),
+                                    unpacked_flat_codes.data(),
+                                    list_sizes[label] * ps.dim,
+                                    raft::Compare<int>(),
+                                    stream_));
+    }
   }
 
   void SetUp() override

From ab6345a7cd4c459dd792403cfce22fd5976ce4b9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 17 Jul 2023 17:29:27 -0700
Subject: [PATCH 08/22] Update tests, correct make_list_extents

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  | 12 ++--
 .../raft/neighbors/ivf_flat_helpers.cuh       | 12 ++--
 cpp/include/raft/neighbors/ivf_flat_types.hpp |  6 +-
 cpp/test/neighbors/ann_ivf_flat.cuh           | 65 ++++++++++---------
 .../ann_ivf_flat/test_float_int64_t.cu        |  2 +-
 5 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index edd2b07701..2c8b0cbe71 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -439,12 +439,12 @@ __global__ void unpack_interleaved_list_kernel(
   }
 }
 
-template <typename T>
+template <typename SizeT, typename T, typename IdxT>
 void pack_list_data(
   raft::resources const& res,
-  device_matrix_view<const T, uint32_t, row_major> codes,
+  device_matrix_view<const T, SizeT, row_major> codes,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
+  device_mdspan<T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data)
 {
   uint32_t n_rows                      = codes.extent(0);
   uint32_t dim                         = codes.extent(1);
@@ -457,12 +457,12 @@ void pack_list_data(
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-template <typename T>
+template <typename SizeT, typename T, typename IdxT>
 void unpack_list_data(
   raft::resources const& res,
-  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data,
+  device_mdspan<const T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data,
   uint32_t veclen,
-  device_matrix_view<const T, uint32_t, row_major> codes)
+  device_matrix_view<T, SizeT, row_major> codes)
 {
   uint32_t n_rows                      = codes.extent(0);
   uint32_t dim                         = codes.extent(1);
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index 7ed0b7ec8e..b90797d4c5 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -31,23 +31,23 @@ namespace raft::neighbors::ivf_flat::helpers {
 
 namespace codepacker {
 
-template <typename T>
+template <typename SizeT, typename T, typename IdxT>
 inline void pack_full_list(
   raft::resources const& res,
-  device_matrix_view<const T, uint32_t, row_major> codes,
+  device_matrix_view<const T, SizeT, row_major> codes,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major> list_data)
+  device_mdspan<T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data)
 {
   raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data);
 }
 
-template <typename T>
+template <typename SizeT, typename T, typename IdxT>
 inline void unpack_full_list(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, uint32_t>::list_extents, row_major>
+  device_mdspan<const T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major>
     list_data,
   uint32_t veclen,
-  device_matrix_view<T, uint32_t, row_major> codes)
+  device_matrix_view<T, SizeT, row_major> codes)
 {
   raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes);
 }
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 2e2e49cdbc..d427e99e3e 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -94,9 +94,9 @@ struct list_spec {
 
   SizeT align_max;
   SizeT align_min;
-  uint32_t dim;
+  SizeT dim;
 
-  constexpr list_spec(uint32_t dim, bool conservative_memory_allocation)
+  constexpr list_spec(SizeT dim, bool conservative_memory_allocation)
     : dim(dim),
       align_min(kIndexGroupSize),
       align_max(conservative_memory_allocation ? kIndexGroupSize : 1024)
@@ -113,7 +113,7 @@ struct list_spec {
   /** Determine the extents of an array enough to hold a given amount of data. */
   constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
   {
-    return make_extents<SizeT>(n_rows, dim);
+    return make_extents<SizeT>(div_rounding_up_safe<SizeT>(n_rows, kIndexGroupSize), dim);
   }
 };
 
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 17952c4019..eb49e3d2d9 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -20,6 +20,7 @@
 #include "raft/core/device_mdarray.hpp"
 #include "raft/core/host_mdarray.hpp"
 #include "raft/linalg/map.cuh"
+#include "raft/neighbors/ivf_flat_types.hpp"
 #include "raft/util/cudart_utils.hpp"
 #include "raft/util/fast_int_div.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
@@ -31,7 +32,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/ivf_flat.cuh>
-#include <raft/neighbors/ivf_flat_codepacker.cuh>
+#include <raft/neighbors/ivf_flat_helpers.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -73,18 +74,18 @@ template <typename IdxT>
 }
 
 template <typename DataT, typename IdxT>
-void flat_codes_from_indices(raft::resources const& handle,
+void flat_codes_from_list_indices(raft::resources const& handle,
                              IdxT* indices,
                              DataT* data,
-                             uint32_t n_rows,
-                             uint32_t dim,
+                             IdxT n_rows,
+                             IdxT dim,
                              DataT* flat_codes)
 {
   raft::linalg::map_offset(handle,
                            raft::make_device_vector_view(flat_codes, n_rows * dim),
-                           [dim = util::FastIntDiv(dim), indices, data] __device__(auto idx) {
-                             auto row = idx / dim;
-                             auto col = idx % dim;
+                           [dim, divisor = util::FastIntDiv(dim), indices, data] __device__(auto idx) {
+                             auto row = idx / divisor;
+                             auto col = idx % divisor;
                              return data[indices[row] * dim + col];
                            });
 }
@@ -291,7 +292,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   void testPacker()
   {
     ivf_flat::index_params index_params;
-    index_params.n_lists          = ps.n_list;
+    index_params.n_lists          = ps.nlist;
     index_params.metric           = ps.metric;
     index_params.adaptive_centers = false;
 
@@ -313,49 +314,51 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_);
     resource::sync_stream(handle_);
 
-    auto list_device_spec = list_spec<uint32_t, T, uint32_t>{ps.dim, true};
+    auto list_device_spec = list_spec<IdxT, T, IdxT>{ps.dim, false};
 
     for (uint32_t label = 0; label < index.n_lists(); label++) {
-      rmm::device_uvector<T> interleaved_data(
-        Pow2<kIndexGroupSize>::roundUp(list_sizes[label] * ps.dim), stream_);
-      list_device_spec.make_list_extents(list_sizes[label]);
+      uint32_t list_size = list_sizes.data_handle()[label];
+      T* list_data_ptr = data_ptrs.data_handle()[label];
+      IdxT* list_inds_ptr = inds_ptrs.data_handle()[label];
+      
+      auto exts = list_device_spec.make_list_extents(static_cast<IdxT>(list_size));
+      auto interleaved_data = make_device_mdarray<T>(handle_, exts);
 
       // fetch the flat codes
-      rmm::device_uvector<T> flat_codes(list_sizes[label] * ps.dim, stream_);
+      auto flat_codes = make_device_matrix<T, IdxT>(handle_, static_cast<IdxT>(list_size), ps.dim);
 
-      flat_codes_from_indices(handle_,
-                              inds_ptrs + label,
-                              data_ptrs + label,
-                              list_sizes[label],
+      flat_codes_from_list_indices(handle_,
+                              list_inds_ptr,
+                              list_data_ptr,
+                              static_cast<IdxT>(list_size),
                               ps.dim,
                               flat_codes.data_handle());
+      auto l = ivf::list(handle_, list_device_spec, static_cast<IdxT>(list_size));
 
       helpers::codepacker::pack_full_list(
         handle_,
-        flat_codes.view(),
+        make_device_matrix_view<const T, IdxT>(flat_codes.data_handle(), static_cast<IdxT>(list_size), ps.dim),
         index.veclen(),
-        make_device_vector_view<T>(
-          interleaved_data.data(), Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim));
+        interleaved_data.view());
 
-      ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(),
-                                    data_ptrs + label,
-                                    Pow2<kIndexGroupSize>::roundUp(list_sizes[label]) * ps.dim,
+      ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(),
+                                    list_data_ptr,
+                                    Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim,
                                     raft::Compare<float>(),
                                     stream_));
 
-      rmm::device_uvector<DataT> unpacked_flat_codes(list_sizes[label] * ps.dim, stream_);
+      auto unpacked_flat_codes = make_device_matrix<T, IdxT>(handle_, static_cast<IdxT>(list_size), ps.dim);
 
       helpers::codepacker::unpack_full_list(
         handle_,
-        raft::make_device_matrix_view(
-          data_ptrs[label], Pow2<kIndexGroupSize>::roundUp(list_sizes[label]), ps.dim),
+        make_const_mdspan(interleaved_data.view()),
         index.veclen(),
-        make_device_vector_view<T>(unpacked_flat_codes.data(), list_sizes[label], ps.dim));
+        unpacked_flat_codes.view());
 
-      ASSERT_TRUE(raft::devArrMatch(flat_codes.data(),
-                                    unpacked_flat_codes.data(),
-                                    list_sizes[label] * ps.dim,
-                                    raft::Compare<int>(),
+      ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
+                                    unpacked_flat_codes.data_handle(),
+                                    list_size * ps.dim,
+                                    raft::Compare<float>(),
                                     stream_));
     }
   }
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index f0988ca988..1b4a5f36c3 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -21,7 +21,7 @@
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); }
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); this->testPacker();}
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
 

From ed80d1ad0109f11b81dbb7f8f6720aca627f3e13 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 19 Jul 2023 15:21:18 -0700
Subject: [PATCH 09/22] More changes

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  | 34 ++++----
 .../raft/neighbors/ivf_flat_helpers.cuh       | 37 ++++++---
 cpp/include/raft/neighbors/ivf_flat_types.hpp |  7 +-
 cpp/test/neighbors/ann_ivf_flat.cuh           | 82 ++++++++++---------
 .../ann_ivf_flat/test_float_int64_t.cu        |  2 +-
 log.txt                                       |  2 +
 6 files changed, 96 insertions(+), 68 deletions(-)
 create mode 100644 log.txt

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 2c8b0cbe71..c475e12b96 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -423,7 +423,7 @@ template <typename T>
 __global__ void pack_interleaved_list_kernel(
   const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen)
 {
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
     codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
   }
@@ -433,45 +433,49 @@ template <typename T>
 __global__ void unpack_interleaved_list_kernel(
   const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen)
 {
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
-    codepacker::unpack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
+    codepacker::unpack_1_interleaved(list_data, codes + tid * dim, dim, veclen, tid);
   }
 }
 
-template <typename SizeT, typename T, typename IdxT>
+template <typename T>
 void pack_list_data(
   raft::resources const& res,
-  device_matrix_view<const T, SizeT, row_major> codes,
+  T* codes,
+  uint32_t n_rows,
+  uint32_t dim,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data)
+  T* list_data)
 {
-  uint32_t n_rows                      = codes.extent(0);
-  uint32_t dim                         = codes.extent(1);
+  // uint32_t n_rows                      = codes.extent(0);
+  // uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
+    list_data, codes, n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-template <typename SizeT, typename T, typename IdxT>
+template <typename T>
 void unpack_list_data(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data,
+  T* list_data,
+  uint32_t n_rows,
+  uint32_t dim,
   uint32_t veclen,
-  device_matrix_view<T, SizeT, row_major> codes)
+  T* codes)
 {
-  uint32_t n_rows                      = codes.extent(0);
-  uint32_t dim                         = codes.extent(1);
+  // uint32_t n_rows                      = codes.extent(0);
+  // uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
+    codes, list_data, n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index b90797d4c5..e757be9b8a 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -31,25 +31,38 @@ namespace raft::neighbors::ivf_flat::helpers {
 
 namespace codepacker {
 
-template <typename SizeT, typename T, typename IdxT>
-inline void pack_full_list(
-  raft::resources const& res,
-  device_matrix_view<const T, SizeT, row_major> codes,
+template <typename T>
+void pack_full_list(
+  raft::resources const& handle,
+  T* codes,
+  uint32_t n_rows,
+  uint32_t dim,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major> list_data)
+  T* list_data)
 {
-  raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data);
+  raft::neighbors::ivf_flat::detail::pack_list_data(handle, codes, n_rows, dim, veclen, list_data);
 }
 
-template <typename SizeT, typename T, typename IdxT>
-inline void unpack_full_list(
+// template <typename T, typename IdxT>
+// void unpack_full_list(
+//   raft::resources const& res,
+//   device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
+//     list_data,
+//   uint32_t veclen,
+//   device_matrix_view<T, uint32_t, row_major> codes)
+// {
+//   raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes);
+// }
+template <typename T, typename IdxT>
+void unpack_full_list(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<SizeT, T, IdxT>::list_extents, row_major>
-    list_data,
+  T* list_data,
+  uint32_t n_rows,
+  uint32_t dim,
   uint32_t veclen,
-  device_matrix_view<T, SizeT, row_major> codes)
+  T* codes)
 {
-  raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes);
+  raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, n_rows, dim, veclen, codes);
 }
 }  // namespace codepacker
 /** @} */
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index d427e99e3e..6032da8556 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -94,9 +94,9 @@ struct list_spec {
 
   SizeT align_max;
   SizeT align_min;
-  SizeT dim;
+  uint32_t dim;
 
-  constexpr list_spec(SizeT dim, bool conservative_memory_allocation)
+  constexpr list_spec(uint32_t dim, bool conservative_memory_allocation)
     : dim(dim),
       align_min(kIndexGroupSize),
       align_max(conservative_memory_allocation ? kIndexGroupSize : 1024)
@@ -113,7 +113,8 @@ struct list_spec {
   /** Determine the extents of an array enough to hold a given amount of data. */
   constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
   {
-    return make_extents<SizeT>(div_rounding_up_safe<SizeT>(n_rows, kIndexGroupSize), dim);
+    // return make_extents<SizeT>(round_up_safe(n_rows, this->align_min), dim);
+    return make_extents<SizeT>(n_rows, dim);
   }
 };
 
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index eb49e3d2d9..b6bcb01d91 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -19,6 +19,8 @@
 #include "ann_utils.cuh"
 #include "raft/core/device_mdarray.hpp"
 #include "raft/core/host_mdarray.hpp"
+#include "raft/core/mdspan.hpp"
+#include "raft/core/mdspan_types.hpp"
 #include "raft/linalg/map.cuh"
 #include "raft/neighbors/ivf_flat_types.hpp"
 #include "raft/util/cudart_utils.hpp"
@@ -31,8 +33,10 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_flat.cuh>
 #include <raft/neighbors/ivf_flat_helpers.cuh>
+#include <raft/neighbors/detail/ivf_flat_build.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -73,23 +77,6 @@ template <typename IdxT>
   return os;
 }
 
-template <typename DataT, typename IdxT>
-void flat_codes_from_list_indices(raft::resources const& handle,
-                             IdxT* indices,
-                             DataT* data,
-                             IdxT n_rows,
-                             IdxT dim,
-                             DataT* flat_codes)
-{
-  raft::linalg::map_offset(handle,
-                           raft::make_device_vector_view(flat_codes, n_rows * dim),
-                           [dim, divisor = util::FastIntDiv(dim), indices, data] __device__(auto idx) {
-                             auto row = idx / divisor;
-                             auto col = idx % divisor;
-                             return data[indices[row] * dim + col];
-                           });
-}
-
 template <typename T, typename DataT, typename IdxT>
 class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
  public:
@@ -151,6 +138,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                database.data(),
                                ps.num_db_vecs,
                                ps.dim);
+        
+        RAFT_LOG_INFO("Index build successfully");
 
         resource::sync_stream(handle_);
         approx_knn_search(handle_,
@@ -160,6 +149,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                           ps.k,
                           search_queries.data(),
                           ps.num_queries);
+        
+        RAFT_LOG_INFO("search successful");
 
         update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
         update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
@@ -305,6 +296,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
     auto index = ivf_flat::build(handle_, index_params, database_view);
 
+    raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout);
+    RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs);
+
     auto list_sizes = raft::make_host_vector<uint32_t>(index.n_lists());
     update_host(
       list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_);
@@ -314,32 +308,44 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_);
     resource::sync_stream(handle_);
 
-    auto list_device_spec = list_spec<IdxT, T, IdxT>{ps.dim, false};
+    auto list_device_spec = list_spec<uint32_t, T, IdxT>{static_cast<uint32_t>(ps.dim), false};
 
     for (uint32_t label = 0; label < index.n_lists(); label++) {
       uint32_t list_size = list_sizes.data_handle()[label];
       T* list_data_ptr = data_ptrs.data_handle()[label];
       IdxT* list_inds_ptr = inds_ptrs.data_handle()[label];
       
-      auto exts = list_device_spec.make_list_extents(static_cast<IdxT>(list_size));
-      auto interleaved_data = make_device_mdarray<T>(handle_, exts);
+      auto exts = list_device_spec.make_list_extents(Pow2<kIndexGroupSize>::roundUp(list_size));
+      auto interleaved_data = make_device_mdarray<T, uint32_t, row_major>(handle_, exts);
 
-      // fetch the flat codes
-      auto flat_codes = make_device_matrix<T, IdxT>(handle_, static_cast<IdxT>(list_size), ps.dim);
-
-      flat_codes_from_list_indices(handle_,
-                              list_inds_ptr,
-                              list_data_ptr,
-                              static_cast<IdxT>(list_size),
-                              ps.dim,
-                              flat_codes.data_handle());
-      auto l = ivf::list(handle_, list_device_spec, static_cast<IdxT>(list_size));
+      RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0));
+      RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1));
 
-      helpers::codepacker::pack_full_list(
+      // fetch the flat codes
+      auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
+      //  auto flat_codes_view = raft::make_device_matrix_view<const T, uint32_t>(
+      // (const T*)flat_codes.data_handle(), list_size, ps.dim);
+
+      // flat_codes_from_list_indices<DataT, IdxT>(handle_,
+      //                         list_inds_ptr,
+      //                         list_data_ptr,
+      //                         list_size,
+      //                         static_cast<uint32_t>(ps.dim),
+      //                         flat_codes.data_handle());
+      matrix::gather(handle_,
+                     make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), static_cast<uint32_t>(ps.dim)),
+                     make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds_ptr, list_size),
+                     flat_codes.view());
+
+      // auto interleaved_codes_view = device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>(interleaved_data.data_handle(), exts);
+      // auto interleaved_codes_view = raft::make_device_matrix_view(interleaved_data.data_handle(), list_size, (uint32_t)ps.dim);
+      detail::pack_list_data<T>(
         handle_,
-        make_device_matrix_view<const T, IdxT>(flat_codes.data_handle(), static_cast<IdxT>(list_size), ps.dim),
+        flat_codes.data_handle(),
+        list_size,
+        (uint32_t)ps.dim,
         index.veclen(),
-        interleaved_data.view());
+        interleaved_data.data_handle());
 
       ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(),
                                     list_data_ptr,
@@ -347,13 +353,15 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                     raft::Compare<float>(),
                                     stream_));
 
-      auto unpacked_flat_codes = make_device_matrix<T, IdxT>(handle_, static_cast<IdxT>(list_size), ps.dim);
+      auto unpacked_flat_codes = make_device_matrix<T, uint32_t>(handle_, static_cast<uint32_t>(list_size), ps.dim);
 
-      helpers::codepacker::unpack_full_list(
+      detail::unpack_list_data<T>(
         handle_,
-        make_const_mdspan(interleaved_data.view()),
+        interleaved_data.data_handle(),
+        list_size,
+        (uint32_t)ps.dim,
         index.veclen(),
-        unpacked_flat_codes.view());
+        unpacked_flat_codes.data_handle());
 
       ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
                                     unpacked_flat_codes.data_handle(),
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index 1b4a5f36c3..fc33621bf8 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -21,7 +21,7 @@
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); this->testPacker();}
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat();}
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
 
diff --git a/log.txt b/log.txt
new file mode 100644
index 0000000000..733021b7ab
--- /dev/null
+++ b/log.txt
@@ -0,0 +1,2 @@
+========= COMPUTE-SANITIZER
+========= Target application doesn't exist or is not a valid executable

From 7412272c17a245f9a4f00ba40b18764ab31aeabd Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 20 Jul 2023 09:04:05 -0700
Subject: [PATCH 10/22] debugging

---
 cpp/test/neighbors/ann_ivf_flat.cuh                 | 13 +++++++++++++
 .../neighbors/ann_ivf_flat/test_float_int64_t.cu    |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index b6bcb01d91..eaf0858c66 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -19,6 +19,7 @@
 #include "ann_utils.cuh"
 #include "raft/core/device_mdarray.hpp"
 #include "raft/core/host_mdarray.hpp"
+#include "raft/core/logger-macros.hpp"
 #include "raft/core/mdspan.hpp"
 #include "raft/core/mdspan_types.hpp"
 #include "raft/linalg/map.cuh"
@@ -332,6 +333,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
       //                         list_size,
       //                         static_cast<uint32_t>(ps.dim),
       //                         flat_codes.data_handle());
+
+      RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.num_db_vecs));
+      RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.dim));
       matrix::gather(handle_,
                      make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), static_cast<uint32_t>(ps.dim)),
                      make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds_ptr, list_size),
@@ -346,6 +350,15 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
         (uint32_t)ps.dim,
         index.veclen(),
         interleaved_data.data_handle());
+      
+      raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout);
+      raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout);
+      raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
+      raft::print_device_vector("list_data", list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
+      auto inds = make_host_vector<IdxT>(handle_, list_size);
+      raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_);
+      resource::sync_stream(handle_);
+      raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout);
 
       ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(),
                                     list_data_ptr,
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index fc33621bf8..553c10e0f4 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -21,7 +21,7 @@
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat();}
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testPacker();}
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
 

From 700ea822e5a39f0a45fcb2cd1d615bfa3d56a7b1 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 20 Jul 2023 17:19:15 -0700
Subject: [PATCH 11/22] Working build

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  | 32 +++++++-------
 .../raft/neighbors/ivf_flat_helpers.cuh       | 33 +++++---------
 cpp/test/neighbors/ann_ivf_flat.cuh           | 43 ++++++-------------
 3 files changed, 39 insertions(+), 69 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index c475e12b96..708f9ae549 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -439,43 +439,41 @@ __global__ void unpack_interleaved_list_kernel(
   }
 }
 
-template <typename T>
+template <typename T, typename IdxT>
 void pack_list_data(
-  raft::resources const& res,
-  T* codes,
-  uint32_t n_rows,
-  uint32_t dim,
+raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
   uint32_t veclen,
-  T* list_data)
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
+    list_data)
 {
-  // uint32_t n_rows                      = codes.extent(0);
-  // uint32_t dim                         = codes.extent(1);
+  uint32_t n_rows                      = codes.extent(0);
+  uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    list_data, codes, n_rows, dim, veclen);
+    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-template <typename T>
+template <typename T, typename IdxT>
 void unpack_list_data(
   raft::resources const& res,
-  T* list_data,
-  uint32_t n_rows,
-  uint32_t dim,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
+    list_data,
   uint32_t veclen,
-  T* codes)
+  device_matrix_view<T, uint32_t, row_major> codes)
 {
-  // uint32_t n_rows                      = codes.extent(0);
-  // uint32_t dim                         = codes.extent(1);
+  uint32_t n_rows                      = codes.extent(0);
+  uint32_t dim                         = codes.extent(1);
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    codes, list_data, n_rows, dim, veclen);
+    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index e757be9b8a..3f9d9325c4 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <raft/core/resource/cuda_stream.hpp>
 #include <raft/neighbors/detail/ivf_flat_build.cuh>
 #include <raft/neighbors/ivf_flat_types.hpp>
 
@@ -31,38 +30,26 @@ namespace raft::neighbors::ivf_flat::helpers {
 
 namespace codepacker {
 
-template <typename T>
+template <typename T, typename IdxT>
 void pack_full_list(
-  raft::resources const& handle,
-  T* codes,
-  uint32_t n_rows,
-  uint32_t dim,
+  raft::resources const& res,
+  device_matrix_view<const T, uint32_t, row_major> codes,
   uint32_t veclen,
-  T* list_data)
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
+    list_data)
 {
-  raft::neighbors::ivf_flat::detail::pack_list_data(handle, codes, n_rows, dim, veclen, list_data);
+  raft::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, list_data);
 }
 
-// template <typename T, typename IdxT>
-// void unpack_full_list(
-//   raft::resources const& res,
-//   device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
-//     list_data,
-//   uint32_t veclen,
-//   device_matrix_view<T, uint32_t, row_major> codes)
-// {
-//   raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes);
-// }
 template <typename T, typename IdxT>
 void unpack_full_list(
   raft::resources const& res,
-  T* list_data,
-  uint32_t n_rows,
-  uint32_t dim,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
+    list_data,
   uint32_t veclen,
-  T* codes)
+  device_matrix_view<T, uint32_t, row_major> codes)
 {
-  raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, n_rows, dim, veclen, codes);
+  raft::neighbors::ivf_flat::detail::unpack_list_data<T, IdxT>(res, list_data, veclen, codes);
 }
 }  // namespace codepacker
 /** @} */
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index eaf0858c66..dccbedee98 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -22,10 +22,6 @@
 #include "raft/core/logger-macros.hpp"
 #include "raft/core/mdspan.hpp"
 #include "raft/core/mdspan_types.hpp"
-#include "raft/linalg/map.cuh"
-#include "raft/neighbors/ivf_flat_types.hpp"
-#include "raft/util/cudart_utils.hpp"
-#include "raft/util/fast_int_div.cuh"
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 
@@ -37,7 +33,6 @@
 #include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_flat.cuh>
 #include <raft/neighbors/ivf_flat_helpers.cuh>
-#include <raft/neighbors/detail/ivf_flat_build.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/ann.cuh>
 #include <raft/spatial/knn/knn.cuh>
@@ -297,6 +292,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
     auto index = ivf_flat::build(handle_, index_params, database_view);
 
+    // index<DataT, IdxT> index_2 = ivf_flat::extend(handle_, half_of_data_view, no_opt, idx);
+
     raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout);
     RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs);
 
@@ -309,30 +306,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_);
     resource::sync_stream(handle_);
 
-    auto list_device_spec = list_spec<uint32_t, T, IdxT>{static_cast<uint32_t>(ps.dim), false};
+    auto list_device_spec = list_spec<uint32_t, DataT, IdxT>{static_cast<uint32_t>(ps.dim), false};
 
     for (uint32_t label = 0; label < index.n_lists(); label++) {
       uint32_t list_size = list_sizes.data_handle()[label];
-      T* list_data_ptr = data_ptrs.data_handle()[label];
+      DataT* list_data_ptr = data_ptrs.data_handle()[label];
       IdxT* list_inds_ptr = inds_ptrs.data_handle()[label];
       
       auto exts = list_device_spec.make_list_extents(Pow2<kIndexGroupSize>::roundUp(list_size));
-      auto interleaved_data = make_device_mdarray<T, uint32_t, row_major>(handle_, exts);
+      auto interleaved_data = make_device_mdarray<DataT, uint32_t, row_major>(handle_, exts);
 
       RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0));
       RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1));
 
       // fetch the flat codes
       auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
-      //  auto flat_codes_view = raft::make_device_matrix_view<const T, uint32_t>(
-      // (const T*)flat_codes.data_handle(), list_size, ps.dim);
-
-      // flat_codes_from_list_indices<DataT, IdxT>(handle_,
-      //                         list_inds_ptr,
-      //                         list_data_ptr,
-      //                         list_size,
-      //                         static_cast<uint32_t>(ps.dim),
-      //                         flat_codes.data_handle());
 
       RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.num_db_vecs));
       RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.dim));
@@ -342,14 +330,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                      flat_codes.view());
 
       // auto interleaved_codes_view = device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>(interleaved_data.data_handle(), exts);
-      // auto interleaved_codes_view = raft::make_device_matrix_view(interleaved_data.data_handle(), list_size, (uint32_t)ps.dim);
-      detail::pack_list_data<T>(
+      helpers::codepacker::pack_full_list<DataT, IdxT>(
         handle_,
-        flat_codes.data_handle(),
-        list_size,
-        (uint32_t)ps.dim,
+        make_const_mdspan(flat_codes.view()),
         index.veclen(),
-        interleaved_data.data_handle());
+        interleaved_data.view());
+
+      resource::sync_stream(handle_);
       
       raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout);
       raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout);
@@ -366,15 +353,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                     raft::Compare<float>(),
                                     stream_));
 
-      auto unpacked_flat_codes = make_device_matrix<T, uint32_t>(handle_, static_cast<uint32_t>(list_size), ps.dim);
+      auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
 
-      detail::unpack_list_data<T>(
+      helpers::codepacker::unpack_full_list<DataT, IdxT>(
         handle_,
-        interleaved_data.data_handle(),
-        list_size,
-        (uint32_t)ps.dim,
+        interleaved_data.view(),
         index.veclen(),
-        unpacked_flat_codes.data_handle());
+        unpacked_flat_codes.view());
 
       ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
                                     unpacked_flat_codes.data_handle(),

From 9d742efe8296eaac1173c80506af853ccc4da1b5 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 21 Jul 2023 15:25:24 -0700
Subject: [PATCH 12/22] rename codepacking api

---
 cpp/include/raft/neighbors/detail/ivf_flat_build.cuh        | 6 +++---
 .../{ivf_flat_codepacker.cuh => ivf_flat_codepacker.hpp}    | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename cpp/include/raft/neighbors/{ivf_flat_codepacker.cuh => ivf_flat_codepacker.hpp} (97%)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 708f9ae549..f2959c5829 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -26,7 +26,7 @@
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
-#include <raft/neighbors/ivf_flat_codepacker.cuh>
+#include <raft/neighbors/ivf_flat_codepacker.hpp>
 #include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_list_types.hpp>
@@ -425,7 +425,7 @@ __global__ void pack_interleaved_list_kernel(
 {
   uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
-    codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid);
+    codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, tid);
   }
 }
 
@@ -435,7 +435,7 @@ __global__ void unpack_interleaved_list_kernel(
 {
   uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < n_rows) {
-    codepacker::unpack_1_interleaved(list_data, codes + tid * dim, dim, veclen, tid);
+    codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, tid);
   }
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
similarity index 97%
rename from cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
rename to cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
index 9b12761daf..430ca7d995 100644
--- a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
@@ -38,7 +38,7 @@ namespace raft::neighbors::ivf_flat::codepacker {
  * @param[in] offset how many records to skip before writing the data into the list
  */
 template <typename T>
-__host__ __device__ void pack_1_interleaved(
+_RAFT_HOST_DEVICE void pack_1(
   const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset)
 {
   // The data is written in interleaved groups of `index::kGroupSize` vectors
@@ -70,7 +70,7 @@ __host__ __device__ void pack_1_interleaved(
  * @param[in] offset fetch the flat code by the given offset
  */
 template <typename T>
-__host__ __device__ void unpack_1_interleaved(
+_RAFT_HOST_DEVICE void unpack_1(
   const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset)
 {
   // The data is written in interleaved groups of `index::kGroupSize` vectors

From d1ef8a16b5602dcb4f1e7df7a76c5e058cf57a48 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 27 Jul 2023 14:20:27 -0700
Subject: [PATCH 13/22] Updated gtest

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  |  10 +
 cpp/test/neighbors/ann_ivf_flat.cuh           | 255 +++++++++++-------
 2 files changed, 163 insertions(+), 102 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index f2959c5829..6da3de9e6c 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "raft/core/logger-macros.hpp"
 #include <raft/cluster/kmeans_balanced.cuh>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
@@ -152,9 +153,15 @@ __global__ void build_index_kernel(const LabelT* labels,
   // NB: such `veclen` is selected, that `dim % veclen == 0`
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
+      if (list_id == 0) {
+        printf("l %u, j %u, dst_index %u, src_value %f\n", l, j, l * kIndexGroupSize + ingroup_id + j, (float)source_vecs[l + j]);
+      }
       list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j];
     }
   }
+  if (list_id == 0) {
+    printf("list_id %u, inlist_id %u, group_offset %u, ingroup_id %u\n", list_id, inlist_id, group_offset, ingroup_id);
+  }
 }
 
 /** See raft::neighbors::ivf_flat::extend docs */
@@ -248,6 +255,7 @@ void extend(raft::resources const& handle,
   // Kernel to insert the new vectors
   const dim3 block_dim(256);
   const dim3 grid_dim(raft::ceildiv<IdxT>(n_rows, block_dim.x));
+
   build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(new_labels.data_handle(),
                                                          new_vectors,
                                                          new_indices,
@@ -449,6 +457,7 @@ raft::resources const& res,
 {
   uint32_t n_rows                      = codes.extent(0);
   uint32_t dim                         = codes.extent(1);
+  if (n_rows == 0 || dim == 0) return;
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
@@ -468,6 +477,7 @@ void unpack_list_data(
 {
   uint32_t n_rows                      = codes.extent(0);
   uint32_t dim                         = codes.extent(1);
+  if (n_rows == 0 || dim == 0) return;
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index dccbedee98..2702deef07 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -22,6 +22,10 @@
 #include "raft/core/logger-macros.hpp"
 #include "raft/core/mdspan.hpp"
 #include "raft/core/mdspan_types.hpp"
+#include "raft/linalg/map.cuh"
+#include "raft/neighbors/detail/ivf_pq_build.cuh"
+#include "raft/neighbors/ivf_flat_types.hpp"
+#include "raft/neighbors/ivf_list.hpp"
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 
@@ -84,6 +88,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   {
   }
 
+  void construct_pad_mask(raft::device_matrix_view<bool> mask) {
+  uint32_t list_size = mask.extent(0);
+  uint32_t dim = mask.extent(1);
+  using interleaved_group = Pow2<kIndexGroupSize>;
+  raft::linalg::map_offset(handle_, make_device_vector_view(mask.data_handle(), list_size * dim), [list_size, dim = util::FastIntDiv(dim)]__device__(uint32_t i) {
+        uint32_t row = i / dim;
+        uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+        if (row < max_group_offset) {
+          return true;
+        }
+        uint32_t ingroup_id = interleaved_group::mod(row);
+        return ingroup_id < (list_size - max_group_offset);
+      });
+}
+
  protected:
   void testIVFFlat()
   {
@@ -283,91 +302,123 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     index_params.metric           = ps.metric;
     index_params.adaptive_centers = false;
 
-    index_params.add_data_on_build        = true;
-    index_params.kmeans_trainset_fraction = 0.5;
+    index_params.add_data_on_build        = false;
+    index_params.kmeans_trainset_fraction = 1.0;
     index_params.metric_arg               = 0;
 
     auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
       (const DataT*)database.data(), ps.num_db_vecs, ps.dim);
 
-    auto index = ivf_flat::build(handle_, index_params, database_view);
+    auto idx = ivf_flat::build(handle_, index_params, database_view);
 
-    // index<DataT, IdxT> index_2 = ivf_flat::extend(handle_, half_of_data_view, no_opt, idx);
+    const std::optional<raft::device_vector_view<const IdxT, IdxT>> no_opt = std::nullopt;
+    index<DataT, IdxT> extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx);
 
-    raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout);
-    RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs);
-
-    auto list_sizes = raft::make_host_vector<uint32_t>(index.n_lists());
+    auto list_sizes = raft::make_host_vector<uint32_t>(idx.n_lists());
     update_host(
-      list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_);
-    auto data_ptrs = raft::make_host_vector<DataT*>(index.n_lists());
-    update_host(data_ptrs.data_handle(), index.data_ptrs().data_handle(), index.n_lists(), stream_);
-    auto inds_ptrs = raft::make_host_vector<IdxT*>(index.n_lists());
-    update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_);
+      list_sizes.data_handle(), idx.list_sizes().data_handle(), idx.n_lists(), stream_);
+    // auto data_ptrs = raft::make_host_vector<DataT*>(idx.n_lists());
+    // update_host(data_ptrs.data_handle(), idx.data_ptrs().data_handle(), idx.n_lists(), stream_);
+    // auto inds_ptrs = raft::make_host_vector<IdxT*>(idx.n_lists());
+    // update_host(inds_ptrs.data_handle(), idx.inds_ptrs().data_handle(), idx.n_lists(), stream_);
     resource::sync_stream(handle_);
+    auto& lists = idx.lists();
 
+    // conservative memory allocation for codepacking
     auto list_device_spec = list_spec<uint32_t, DataT, IdxT>{static_cast<uint32_t>(ps.dim), false};
 
-    for (uint32_t label = 0; label < index.n_lists(); label++) {
+    for (uint32_t label = 0; label < idx.n_lists(); label++) {
       uint32_t list_size = list_sizes.data_handle()[label];
-      DataT* list_data_ptr = data_ptrs.data_handle()[label];
-      IdxT* list_inds_ptr = inds_ptrs.data_handle()[label];
-      
-      auto exts = list_device_spec.make_list_extents(Pow2<kIndexGroupSize>::roundUp(list_size));
-      auto interleaved_data = make_device_mdarray<DataT, uint32_t, row_major>(handle_, exts);
 
-      RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0));
-      RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1));
+      ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0);
+    }
+
+    idx.recompute_internal_state(handle_);
+
+    using interleaved_group = Pow2<kIndexGroupSize>;
+
+    for (uint32_t label = 0; label < idx.n_lists(); label++) {
+      uint32_t list_size = list_sizes.data_handle()[label];
+      uint32_t padded_list_size = interleaved_group::roundUp(list_size);
+      uint32_t n_elems = padded_list_size * static_cast<uint32_t>(ps.dim);
+      auto list_data = lists[label]->data;
+      auto list_inds = lists[label]->indices;
 
       // fetch the flat codes
       auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
 
-      RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.num_db_vecs));
-      RAFT_LOG_INFO("static_cast %u", static_cast<uint32_t>(ps.dim));
       matrix::gather(handle_,
                      make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), static_cast<uint32_t>(ps.dim)),
-                     make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds_ptr, list_size),
+                     make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds.data_handle(), list_size),
                      flat_codes.view());
 
-      // auto interleaved_codes_view = device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>(interleaved_data.data_handle(), exts);
       helpers::codepacker::pack_full_list<DataT, IdxT>(
         handle_,
         make_const_mdspan(flat_codes.view()),
-        index.veclen(),
-        interleaved_data.view());
+        idx.veclen(),
+        list_data.view());
 
-      resource::sync_stream(handle_);
+      {
+        auto mask = make_device_matrix<bool>(handle_,list_size, static_cast<uint32_t>(ps.dim));
+
+        construct_pad_mask(mask.view());
+
+        auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+
+        thrust::transform(
+          list_data.data_handle(), list_data.data_handle() + n_elems,
+          mask.data_handle(),
+          packed_list_data.data_handle(),
+          thrust::multiplies<DataT>()
+        );
+      auto extend_data = extend_index.lists()[label]->data;
+      auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+      thrust::transform(
+        extend_data.data_handle(), extend_data.data_handle() + n_elems,
+        mask.data_handle(),
+        extend_data_filtered.data_handle(),
+        thrust::multiplies<float>()
+    );
+
+      ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(),
+                                    extend_data_filtered.data_handle(),
+                                    n_elems,
+                                    raft::Compare<DataT>(),
+                                    stream_));
+      }
       
-      raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout);
-      raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout);
-      raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
-      raft::print_device_vector("list_data", list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
-      auto inds = make_host_vector<IdxT>(handle_, list_size);
-      raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_);
-      resource::sync_stream(handle_);
-      raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout);
+      // raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout);
+      // raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout);
+      // raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
+      // auto database_data_host = raft::make_host_vector<DataT>(handle_, (uint32_t)ps.num_db_vecs).data_handle();
+      // auto list_data_host = raft::make_host_vector<DataT>(handle_, Pow2<kIndexGroupSize>::roundUp(list_size)).data_handle();
+      // raft::update_host(database_data_host, database.data(), ps.num_db_vecs, stream_);
+      // raft::update_host(list_data_host, list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size), stream_);
+      // raft::resource::sync_stream(handle_);
 
-      ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(),
-                                    list_data_ptr,
-                                    Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim,
-                                    raft::Compare<float>(),
-                                    stream_));
+      
+      // raft::print_device_vector("list_data", list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
+      // auto inds = make_host_vector<IdxT>(handle_, list_size);
+      // raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_);
+      // resource::sync_stream(handle_);
+      // raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout);
 
       auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
 
       helpers::codepacker::unpack_full_list<DataT, IdxT>(
         handle_,
-        interleaved_data.view(),
-        index.veclen(),
+        list_data.view(),
+        idx.veclen(),
         unpacked_flat_codes.view());
 
       ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
                                     unpacked_flat_codes.data_handle(),
                                     list_size * ps.dim,
-                                    raft::Compare<float>(),
+                                    raft::Compare<DataT>(),
                                     stream_));
     }
-  }
+    }
+
 
   void SetUp() override
   {
@@ -402,63 +453,63 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
 const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
   // test various dims (aligned and not aligned to vector sizes)
-  {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true},
+  {100, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}};
+  // {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+  // {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false},
+  // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true},
 
   // test dims that do not fit into kernel shared memory limits
-  {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-  {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-
-  // various random combinations
-  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false},
-  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false},
-  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
-  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
-  {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false},
-
-  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
-  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false},
-  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true},
-  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true},
-  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false},
-  {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
-
-  {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false},
-
-  // test splitting the big query batches  (> max gridDim.y) into smaller batches
-  {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false},
-  {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false},
-  {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true},
-
-  // test radix_sort for getting the cluster selection
-  {1000,
-   10000,
-   16,
-   10,
-   raft::matrix::detail::select::warpsort::kMaxCapacity * 2,
-   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-   raft::distance::DistanceType::L2Expanded,
-   false},
-  {1000,
-   10000,
-   16,
-   10,
-   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-   raft::distance::DistanceType::InnerProduct,
-   false}};
+  // {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  // {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  // {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  // {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+  // {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+
+  // // various random combinations
+  // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false},
+  // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false},
+  // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
+  // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
+  // {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false},
+
+  // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
+  // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false},
+  // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true},
+  // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true},
+  // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false},
+  // {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
+
+  // {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false},
+
+  // // test splitting the big query batches  (> max gridDim.y) into smaller batches
+  // {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false},
+  // {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false},
+  // {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true},
+
+  // // test radix_sort for getting the cluster selection
+  // {1000,
+  //  10000,
+  //  16,
+  //  10,
+  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 2,
+  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+  //  raft::distance::DistanceType::L2Expanded,
+  //  false},
+  // {1000,
+  //  10000,
+  //  16,
+  //  10,
+  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+  //  raft::distance::DistanceType::InnerProduct,
+  //  false}};
 }  // namespace raft::neighbors::ivf_flat

From 4ee99e3ef16ccac3d08212fe165620c2422f4de5 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 27 Jul 2023 15:21:24 -0700
Subject: [PATCH 14/22] updates

---
 cpp/include/raft/neighbors/detail/ivf_flat_build.cuh | 9 ---------
 cpp/test/neighbors/ann_ivf_flat.cuh                  | 7 +------
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 6da3de9e6c..afb7c461e5 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "raft/core/logger-macros.hpp"
 #include <raft/cluster/kmeans_balanced.cuh>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
@@ -33,7 +32,6 @@
 #include <raft/neighbors/ivf_list_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/stats/histogram.cuh>
-#include <raft/util/fast_int_div.cuh>
 #include <raft/util/pow2_utils.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -153,15 +151,9 @@ __global__ void build_index_kernel(const LabelT* labels,
   // NB: such `veclen` is selected, that `dim % veclen == 0`
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
-      if (list_id == 0) {
-        printf("l %u, j %u, dst_index %u, src_value %f\n", l, j, l * kIndexGroupSize + ingroup_id + j, (float)source_vecs[l + j]);
-      }
       list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j];
     }
   }
-  if (list_id == 0) {
-    printf("list_id %u, inlist_id %u, group_offset %u, ingroup_id %u\n", list_id, inlist_id, group_offset, ingroup_id);
-  }
 }
 
 /** See raft::neighbors::ivf_flat::extend docs */
@@ -255,7 +247,6 @@ void extend(raft::resources const& handle,
   // Kernel to insert the new vectors
   const dim3 block_dim(256);
   const dim3 grid_dim(raft::ceildiv<IdxT>(n_rows, block_dim.x));
-
   build_index_kernel<<<grid_dim, block_dim, 0, stream>>>(new_labels.data_handle(),
                                                          new_vectors,
                                                          new_indices,
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 2702deef07..289cd5960a 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -23,7 +23,6 @@
 #include "raft/core/mdspan.hpp"
 #include "raft/core/mdspan_types.hpp"
 #include "raft/linalg/map.cuh"
-#include "raft/neighbors/detail/ivf_pq_build.cuh"
 #include "raft/neighbors/ivf_flat_types.hpp"
 #include "raft/neighbors/ivf_list.hpp"
 #include <raft/core/resource/cuda_stream.hpp>
@@ -154,8 +153,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                ps.num_db_vecs,
                                ps.dim);
         
-        RAFT_LOG_INFO("Index build successfully");
-
         resource::sync_stream(handle_);
         approx_knn_search(handle_,
                           distances_ivfflat_dev.data(),
@@ -165,8 +162,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                           search_queries.data(),
                           ps.num_queries);
         
-        RAFT_LOG_INFO("search successful");
-
         update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
         update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
         resource::sync_stream(handle_);
@@ -453,7 +448,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
 const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
   // test various dims (aligned and not aligned to vector sizes)
-  {100, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}};
+  {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}};
   // {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
   // {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
   // {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},

From 22f4f801deea6d4f47c6ef9f4f734e3de3afcea0 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 28 Jul 2023 09:20:42 -0700
Subject: [PATCH 15/22] update testing

---
 cpp/include/raft/neighbors/ivf_list.hpp       |   6 +
 cpp/include/raft/neighbors/ivf_list_types.hpp |   9 +
 cpp/test/neighbors/ann_ivf_flat.cuh           | 290 ++++++++++--------
 3 files changed, 180 insertions(+), 125 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_list.hpp b/cpp/include/raft/neighbors/ivf_list.hpp
index ad06a3ee71..de7d84da14 100644
--- a/cpp/include/raft/neighbors/ivf_list.hpp
+++ b/cpp/include/raft/neighbors/ivf_list.hpp
@@ -67,6 +67,12 @@ list<SpecT, SizeT, SpecExtraArgs...>::list(raft::resources const& res,
                  indices.data_handle(),
                  indices.size(),
                  ivf::kInvalidRecord<index_type>);
+  
+  // Fill the data buffer with a pre-defined marker for easier debugging
+  thrust::fill_n(resource::get_thrust_policy(res),
+                 data.data_handle(),
+                 data.size(),
+                 ivf::kPadElem<value_type>);
 }
 
 /**
diff --git a/cpp/include/raft/neighbors/ivf_list_types.hpp b/cpp/include/raft/neighbors/ivf_list_types.hpp
index 6317825201..9cbb36acda 100644
--- a/cpp/include/raft/neighbors/ivf_list_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_list_types.hpp
@@ -34,6 +34,15 @@ template <typename IdxT>
 constexpr static IdxT kInvalidRecord =
   (std::is_signed_v<IdxT> ? IdxT{0} : std::numeric_limits<IdxT>::max()) - 1;
 
+/**
+ * Default value filled in the `data` array.
+ * One may encounter it trying to access a record within a list that is outside of the
+ * `size` bound or whenever the list is allocated but not filled-in yet.
+ */
+template <typename T>
+constexpr static T kPadElem =
+  (std::is_signed_v<T> ? T{0} : std::numeric_limits<T>::max()) - T{1};
+
 /** The data for a single IVF list. */
 template <template <typename, typename...> typename SpecT,
           typename SizeT,
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 289cd5960a..3c93c07cfc 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -25,6 +25,9 @@
 #include "raft/linalg/map.cuh"
 #include "raft/neighbors/ivf_flat_types.hpp"
 #include "raft/neighbors/ivf_list.hpp"
+#include "raft/util/cudart_utils.hpp"
+#include "raft/util/fast_int_div.cuh"
+#include "thrust/functional.h"
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 
@@ -87,22 +90,22 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   {
   }
 
-  void construct_pad_mask(raft::device_matrix_view<bool> mask) {
-  uint32_t list_size = mask.extent(0);
-  uint32_t dim = mask.extent(1);
-  using interleaved_group = Pow2<kIndexGroupSize>;
-  raft::linalg::map_offset(handle_, make_device_vector_view(mask.data_handle(), list_size * dim), [list_size, dim = util::FastIntDiv(dim)]__device__(uint32_t i) {
-        uint32_t row = i / dim;
-        uint32_t max_group_offset = interleaved_group::roundDown(list_size);
-        if (row < max_group_offset) {
-          return true;
-        }
-        uint32_t ingroup_id = interleaved_group::mod(row);
-        return ingroup_id < (list_size - max_group_offset);
-      });
-}
-
- protected:
+  // void construct_pad_mask(raft::resources const& handle, uint32_t list_size, uint32_t dim, raft::device_vector_view<bool> mask) {
+  //   using interleaved_group = Pow2<kIndexGroupSize>;
+  //   uint32_t padded_list_size = interleaved_group::roundUp(list_size);
+  //   linalg::map_offset(handle, mask, [=] __device__ (auto i) {
+  //       uint32_t row = i / dim;
+  //       uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+  //       if (row < max_group_offset) {
+  //         return true;
+  //       }
+  //       uint32_t ingroup_id = interleaved_group::mod(row);
+  //       return ingroup_id < (padded_list_size - max_group_offset);
+  //       return true;
+  //     });
+// }
+
+//  protected:
   void testIVFFlat()
   {
     size_t queries_size = ps.num_queries * ps.k;
@@ -293,9 +296,11 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   void testPacker()
   {
     ivf_flat::index_params index_params;
+    ivf_flat::search_params search_params;
     index_params.n_lists          = ps.nlist;
     index_params.metric           = ps.metric;
     index_params.adaptive_centers = false;
+    search_params.n_probes = ps.nprobe;
 
     index_params.add_data_on_build        = false;
     index_params.kmeans_trainset_fraction = 1.0;
@@ -311,16 +316,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
     auto list_sizes = raft::make_host_vector<uint32_t>(idx.n_lists());
     update_host(
-      list_sizes.data_handle(), idx.list_sizes().data_handle(), idx.n_lists(), stream_);
-    // auto data_ptrs = raft::make_host_vector<DataT*>(idx.n_lists());
-    // update_host(data_ptrs.data_handle(), idx.data_ptrs().data_handle(), idx.n_lists(), stream_);
-    // auto inds_ptrs = raft::make_host_vector<IdxT*>(idx.n_lists());
-    // update_host(inds_ptrs.data_handle(), idx.inds_ptrs().data_handle(), idx.n_lists(), stream_);
+      list_sizes.data_handle(), extend_index.list_sizes().data_handle(), extend_index.n_lists(), stream_);
     resource::sync_stream(handle_);
+
     auto& lists = idx.lists();
 
     // conservative memory allocation for codepacking
-    auto list_device_spec = list_spec<uint32_t, DataT, IdxT>{static_cast<uint32_t>(ps.dim), false};
+    auto list_device_spec = list_spec<uint32_t, DataT, IdxT>{idx.dim(), false};
 
     for (uint32_t label = 0; label < idx.n_lists(); label++) {
       uint32_t list_size = list_sizes.data_handle()[label];
@@ -334,19 +336,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
     for (uint32_t label = 0; label < idx.n_lists(); label++) {
       uint32_t list_size = list_sizes.data_handle()[label];
-      uint32_t padded_list_size = interleaved_group::roundUp(list_size);
-      uint32_t n_elems = padded_list_size * static_cast<uint32_t>(ps.dim);
+
+      if (list_size > 0) {
+        uint32_t padded_list_size = interleaved_group::roundUp(list_size);
+      uint32_t n_elems = padded_list_size * idx.dim();
       auto list_data = lists[label]->data;
-      auto list_inds = lists[label]->indices;
+      auto list_inds = extend_index.lists()[label]->indices;
 
       // fetch the flat codes
-      auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
+      auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
 
       matrix::gather(handle_,
-                     make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), static_cast<uint32_t>(ps.dim)),
+                     make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), idx.dim()),
                      make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds.data_handle(), list_size),
                      flat_codes.view());
-
+      
       helpers::codepacker::pack_full_list<DataT, IdxT>(
         handle_,
         make_const_mdspan(flat_codes.view()),
@@ -354,51 +358,45 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
         list_data.view());
 
       {
-        auto mask = make_device_matrix<bool>(handle_,list_size, static_cast<uint32_t>(ps.dim));
-
-        construct_pad_mask(mask.view());
-
-        auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
-
-        thrust::transform(
-          list_data.data_handle(), list_data.data_handle() + n_elems,
-          mask.data_handle(),
-          packed_list_data.data_handle(),
-          thrust::multiplies<DataT>()
-        );
-      auto extend_data = extend_index.lists()[label]->data;
-      auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
-      thrust::transform(
-        extend_data.data_handle(), extend_data.data_handle() + n_elems,
-        mask.data_handle(),
-        extend_data_filtered.data_handle(),
-        thrust::multiplies<float>()
-    );
-
-      ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(),
-                                    extend_data_filtered.data_handle(),
+    //     auto mask = make_device_vector<bool>(handle_, n_elems);
+
+    // linalg::map_offset(handle_, mask.view(), [dim = idx.dim(), list_size, padded_list_size, veclen = idx.veclen()] __device__ (auto i) {
+    //     uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+    //     if (i < max_group_offset * dim) {
+    //       return true;
+    //     }
+    //     uint32_t surplus = (i - max_group_offset * dim);
+    //     uint32_t ingroup_id = interleaved_group::mod( surplus / veclen);
+    //     return ingroup_id < (list_size - max_group_offset);
+    //   });
+
+    //   // ensure that the correct number of indices are masked out
+    //   ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_), mask.data_handle(), mask.data_handle() + n_elems, 0) == list_size * ps.dim);
+
+    //     auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+
+    //     linalg::map_offset(handle_, packed_list_data.view(), [mask = mask.data_handle(), list_data= list_data.data_handle()] __device__ (uint32_t i) {
+    //       if (mask[i]) return list_data[i];
+    //       return DataT{0};
+    //     });
+
+    //   auto extend_data = extend_index.lists()[label]->data;
+    //   auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+    // linalg::map_offset(handle_, extend_data_filtered.view(), [mask = mask.data_handle(), extend_data = extend_data.data_handle()] __device__ (uint32_t i) {
+    //       if (mask[i]) return extend_data[i];
+    //       return DataT{0};
+    //     });
+
+      ASSERT_TRUE(raft::devArrMatch(list_data.data_handle(),
+                                    extend_index.lists()[label]->data.data_handle(),
                                     n_elems,
                                     raft::Compare<DataT>(),
                                     stream_));
       }
-      
-      // raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout);
-      // raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout);
-      // raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
-      // auto database_data_host = raft::make_host_vector<DataT>(handle_, (uint32_t)ps.num_db_vecs).data_handle();
-      // auto list_data_host = raft::make_host_vector<DataT>(handle_, Pow2<kIndexGroupSize>::roundUp(list_size)).data_handle();
-      // raft::update_host(database_data_host, database.data(), ps.num_db_vecs, stream_);
-      // raft::update_host(list_data_host, list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size), stream_);
-      // raft::resource::sync_stream(handle_);
+      raft::print_device_vector("list_data", list_data.data_handle(), n_elems, std::cout);
+      raft::print_device_vector("exte_data", extend_index.lists()[label]->data.data_handle(), n_elems, std::cout);
 
-      
-      // raft::print_device_vector("list_data", list_data_ptr, Pow2<kIndexGroupSize>::roundUp(list_size) * ps.dim, std::cout);
-      // auto inds = make_host_vector<IdxT>(handle_, list_size);
-      // raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_);
-      // resource::sync_stream(handle_);
-      // raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout);
-
-      auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, static_cast<uint32_t>(ps.dim));
+      auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
 
       helpers::codepacker::unpack_full_list<DataT, IdxT>(
         handle_,
@@ -411,7 +409,49 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                     list_size * ps.dim,
                                     raft::Compare<DataT>(),
                                     stream_));
+      }
     }
+    // auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+    //       search_queries.data(), ps.num_queries, ps.dim);
+    //     auto indices_out_1 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
+    //     auto dists_out = raft::make_device_matrix<T, IdxT>(handle_, ps.num_queries, ps.k);
+    //     auto indices_out_2 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
+    
+    // raft::print_device_vector("idx_center_norms", idx.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
+    // raft::print_device_vector("extend_center_norms", extend_index.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
+
+        // Precompute the centers vector norms for L2Expanded distance
+    // idx.allocate_center_norms(handle_);
+    // if (idx.center_norms().has_value()) {
+    //   raft::linalg::rowNorm(idx.center_norms()->data_handle(),
+    //                         idx.centers().data_handle(),
+    //                         idx.dim(),
+    //                         idx.n_lists(),
+    //                         raft::linalg::L2Norm,
+    //                         true,
+    //                         stream_);
+    //   RAFT_LOG_TRACE_VEC(idx->center_norms()->data_handle(), std::min<uint32_t>(idx.dim(), 20));
+    // }
+
+    // raft::print_device_vector("idx_center_norms", idx.center_norms()->data_handle(), idx.n_lists(), std::cout);
+    // raft::print_device_vector("ext_center_norms", extend_index.center_norms()->data_handle(), idx.n_lists(), std::cout);
+    //     ivf_flat::search(handle_,
+    //                      search_params,
+    //                      idx,
+    //                      search_queries_view,
+    //                      indices_out_1.view(),
+    //                      dists_out.view());
+    //     ivf_flat::search(handle_,
+    //                      search_params,
+    //                      extend_index,
+    //                      search_queries_view,
+    //                      indices_out_2.view(),
+    //                      dists_out.view());
+    //     ASSERT_TRUE(raft::devArrMatch(indices_out_1.data_handle(),
+    //                                 indices_out_2.data_handle(),
+    //                                 ps.num_queries * ps.k,
+    //                                 raft::Compare<IdxT>(),
+    //                                 stream_));
     }
 
 
@@ -448,63 +488,63 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
 const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
   // test various dims (aligned and not aligned to vector sizes)
-  {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}};
-  // {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-  // {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false},
-  // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true},
+  {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false},
+  {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true},
 
   // test dims that do not fit into kernel shared memory limits
-  // {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  // {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  // {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
-  // {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-  // {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
-
-  // // various random combinations
-  // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false},
-  // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false},
-  // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
-  // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
-  // {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false},
-
-  // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
-  // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
-  // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false},
-  // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true},
-  // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true},
-  // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false},
-  // {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
-
-  // {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false},
-
-  // // test splitting the big query batches  (> max gridDim.y) into smaller batches
-  // {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false},
-  // {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false},
-  // {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true},
-
-  // // test radix_sort for getting the cluster selection
-  // {1000,
-  //  10000,
-  //  16,
-  //  10,
-  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 2,
-  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-  //  raft::distance::DistanceType::L2Expanded,
-  //  false},
-  // {1000,
-  //  10000,
-  //  16,
-  //  10,
-  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-  //  raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
-  //  raft::distance::DistanceType::InnerProduct,
-  //  false}};
+  {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false},
+  {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+  {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true},
+
+  // various random combinations
+  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false},
+  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false},
+  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
+  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true},
+  {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false},
+
+  {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false},
+  {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true},
+  {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true},
+  {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false},
+  {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true},
+
+  {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false},
+
+  // test splitting the big query batches  (> max gridDim.y) into smaller batches
+  {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false},
+  {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false},
+  {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true},
+
+  // test radix_sort for getting the cluster selection
+  {1000,
+   10000,
+   16,
+   10,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 2,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+   raft::distance::DistanceType::L2Expanded,
+   false},
+  {1000,
+   10000,
+   16,
+   10,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+   raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
+   raft::distance::DistanceType::InnerProduct,
+   false}};
 }  // namespace raft::neighbors::ivf_flat

From c95d1e04754d9825cb9fa6ecd07a23f85a839929 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 28 Jul 2023 13:06:47 -0700
Subject: [PATCH 16/22] updates

---
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 cpp/include/raft/neighbors/ivf_list.hpp       |  8 ++--
 cpp/include/raft/neighbors/ivf_list_types.hpp | 15 ++++---
 cpp/test/neighbors/ann_ivf_flat.cuh           | 42 +++++++++----------
 4 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 67dd01ada9..546728d2c6 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -57,4 +57,4 @@ dependencies:
 - ucx-proc=*=gpu
 - ucx-py==0.33.*
 - ucx>=1.13.0
-name: all_cuda-118_arch-x86_64
\ No newline at end of file
+name: all_cuda-118_arch-x86_64
diff --git a/cpp/include/raft/neighbors/ivf_list.hpp b/cpp/include/raft/neighbors/ivf_list.hpp
index de7d84da14..6bdd0578ed 100644
--- a/cpp/include/raft/neighbors/ivf_list.hpp
+++ b/cpp/include/raft/neighbors/ivf_list.hpp
@@ -69,10 +69,10 @@ list<SpecT, SizeT, SpecExtraArgs...>::list(raft::resources const& res,
                  ivf::kInvalidRecord<index_type>);
   
   // Fill the data buffer with a pre-defined marker for easier debugging
-  thrust::fill_n(resource::get_thrust_policy(res),
-                 data.data_handle(),
-                 data.size(),
-                 ivf::kPadElem<value_type>);
+  // thrust::fill_n(resource::get_thrust_policy(res),
+  //                data.data_handle(),
+  //                data.size(),
+  //                ivf::kPadElem<value_type>);
 }
 
 /**
diff --git a/cpp/include/raft/neighbors/ivf_list_types.hpp b/cpp/include/raft/neighbors/ivf_list_types.hpp
index 9cbb36acda..f9858c3d61 100644
--- a/cpp/include/raft/neighbors/ivf_list_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_list_types.hpp
@@ -34,14 +34,13 @@ template <typename IdxT>
 constexpr static IdxT kInvalidRecord =
   (std::is_signed_v<IdxT> ? IdxT{0} : std::numeric_limits<IdxT>::max()) - 1;
 
-/**
- * Default value filled in the `data` array.
- * One may encounter it trying to access a record within a list that is outside of the
- * `size` bound or whenever the list is allocated but not filled-in yet.
- */
-template <typename T>
-constexpr static T kPadElem =
-  (std::is_signed_v<T> ? T{0} : std::numeric_limits<T>::max()) - T{1};
+// /**
+//  * Default value filled in the `data` array.
+//  * One may encounter it trying to access a record within a list that is outside of the
+//  * `size` bound or whenever the list is allocated but not filled-in yet.
+//  */
+// template <typename T>
+// constexpr static T kPadElem = T{0};
 
 /** The data for a single IVF list. */
 template <template <typename, typename...> typename SpecT,
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 3c93c07cfc..4cff91bad4 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -393,8 +393,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                     raft::Compare<DataT>(),
                                     stream_));
       }
-      raft::print_device_vector("list_data", list_data.data_handle(), n_elems, std::cout);
-      raft::print_device_vector("exte_data", extend_index.lists()[label]->data.data_handle(), n_elems, std::cout);
+      // raft::print_device_vector("list_data", list_data.data_handle(), n_elems, std::cout);
+      // raft::print_device_vector("exte_data", extend_index.lists()[label]->data.data_handle(), n_elems, std::cout);
 
       auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
 
@@ -416,13 +416,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     //     auto indices_out_1 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
     //     auto dists_out = raft::make_device_matrix<T, IdxT>(handle_, ps.num_queries, ps.k);
     //     auto indices_out_2 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
-    
+    // 
     // raft::print_device_vector("idx_center_norms", idx.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
     // raft::print_device_vector("extend_center_norms", extend_index.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
 
         // Precompute the centers vector norms for L2Expanded distance
     // idx.allocate_center_norms(handle_);
-    // if (idx.center_norms().has_value()) {
+    // // if (idx.center_norms().has_value()) {
     //   raft::linalg::rowNorm(idx.center_norms()->data_handle(),
     //                         idx.centers().data_handle(),
     //                         idx.dim(),
@@ -435,23 +435,23 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
     // raft::print_device_vector("idx_center_norms", idx.center_norms()->data_handle(), idx.n_lists(), std::cout);
     // raft::print_device_vector("ext_center_norms", extend_index.center_norms()->data_handle(), idx.n_lists(), std::cout);
-    //     ivf_flat::search(handle_,
-    //                      search_params,
-    //                      idx,
-    //                      search_queries_view,
-    //                      indices_out_1.view(),
-    //                      dists_out.view());
-    //     ivf_flat::search(handle_,
-    //                      search_params,
-    //                      extend_index,
-    //                      search_queries_view,
-    //                      indices_out_2.view(),
-    //                      dists_out.view());
-    //     ASSERT_TRUE(raft::devArrMatch(indices_out_1.data_handle(),
-    //                                 indices_out_2.data_handle(),
-    //                                 ps.num_queries * ps.k,
-    //                                 raft::Compare<IdxT>(),
-    //                                 stream_));
+        // ivf_flat::search(handle_,
+        //                  search_params,
+        //                  idx,
+        //                  search_queries_view,
+        //                  indices_out_1.view(),
+        //                  dists_out.view());
+        // ivf_flat::search(handle_,
+        //                  search_params,
+        //                  extend_index,
+        //                  search_queries_view,
+        //                  indices_out_2.view(),
+        //                  dists_out.view());
+        // ASSERT_TRUE(raft::devArrMatch(indices_out_1.data_handle(),
+        //                             indices_out_2.data_handle(),
+        //                             ps.num_queries * ps.k,
+        //                             raft::Compare<IdxT>(),
+        //                             stream_));
     }
 
 

From da78c664190c4fec2d407525b93d4abd595a387b Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 31 Jul 2023 10:26:06 -0700
Subject: [PATCH 17/22] Update testing, pow2

---
 .../all_cuda-120_arch-x86_64.yaml             |   2 +-
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   2 +-
 .../raft/neighbors/detail/ivf_flat_build.cuh  |  54 +++--
 .../raft/neighbors/ivf_flat_codepacker.hpp    |  38 ++-
 .../raft/neighbors/ivf_flat_helpers.cuh       |  70 +++++-
 cpp/include/raft/neighbors/ivf_list.hpp       |   6 -
 cpp/include/raft/neighbors/ivf_list_types.hpp |   8 -
 cpp/test/neighbors/ann_ivf_flat.cuh           | 229 ++++++++----------
 .../ann_ivf_flat/test_float_int64_t.cu        |   6 +-
 9 files changed, 232 insertions(+), 183 deletions(-)

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 589fb4920b..0ae6154078 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -53,4 +53,4 @@ dependencies:
 - ucx-proc=*=gpu
 - ucx-py==0.33.*
 - ucx>=1.13.0
-name: all_cuda-120_arch-x86_64
\ No newline at end of file
+name: all_cuda-120_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 22c6a78438..a982febeed 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -36,4 +36,4 @@ dependencies:
 - nlohmann_json>=3.11.2
 - scikit-build>=0.13.1
 - sysroot_linux-64==2.17
-name: bench_ann_cuda-118_arch-x86_64
\ No newline at end of file
+name: bench_ann_cuda-118_arch-x86_64
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index afb7c461e5..9cde1143e0 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -420,61 +420,73 @@ inline void fill_refinement_index(raft::resources const& handle,
 
 template <typename T>
 __global__ void pack_interleaved_list_kernel(
-  const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen)
+  const T* codes,
+  T* list_data,
+  uint32_t n_rows,
+  uint32_t dim,
+  uint32_t veclen,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
-  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < n_rows) {
-    codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, tid);
-  }
+  uint32_t tid          = blockIdx.x * blockDim.x + threadIdx.x;
+  const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                            ? std::get<uint32_t>(offset_or_indices) + tid
+                            : std::get<const uint32_t*>(offset_or_indices)[tid];
+  if (tid < n_rows) { codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, dst_ix); }
 }
 
 template <typename T>
 __global__ void unpack_interleaved_list_kernel(
-  const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen)
+  const T* list_data,
+  T* codes,
+  uint32_t n_rows,
+  uint32_t dim,
+  uint32_t veclen,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
-  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < n_rows) {
-    codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, tid);
-  }
+  uint32_t tid          = blockIdx.x * blockDim.x + threadIdx.x;
+  const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                            ? std::get<uint32_t>(offset_or_indices) + tid
+                            : std::get<const uint32_t*>(offset_or_indices)[tid];
+  if (tid < n_rows) { codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, src_ix); }
 }
 
 template <typename T, typename IdxT>
 void pack_list_data(
-raft::resources const& res,
+  raft::resources const& res,
   device_matrix_view<const T, uint32_t, row_major> codes,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
-    list_data)
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
 {
-  uint32_t n_rows                      = codes.extent(0);
-  uint32_t dim                         = codes.extent(1);
+  uint32_t n_rows = codes.extent(0);
+  uint32_t dim    = codes.extent(1);
   if (n_rows == 0 || dim == 0) return;
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   pack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen);
+    codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen, offset_or_indices);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 template <typename T, typename IdxT>
 void unpack_list_data(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
-    list_data,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
   uint32_t veclen,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
   device_matrix_view<T, uint32_t, row_major> codes)
 {
-  uint32_t n_rows                      = codes.extent(0);
-  uint32_t dim                         = codes.extent(1);
+  uint32_t n_rows = codes.extent(0);
+  uint32_t dim    = codes.extent(1);
   if (n_rows == 0 || dim == 0) return;
   static constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto stream = resource::get_cuda_stream(res);
   unpack_interleaved_list_kernel<<<blocks, threads, 0, stream>>>(
-    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen);
+    list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen, offset_or_indices);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
index 430ca7d995..4594332fdf 100644
--- a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
@@ -20,9 +20,35 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/neighbors/ivf_flat_types.hpp>
+
+#ifdef _RAFT_HAS_CUDA
 #include <raft/util/pow2_utils.cuh>
+#else
+#include <raft/util/integer_utils.hpp>
+#endif
 
 namespace raft::neighbors::ivf_flat::codepacker {
+
+template <typename T>
+_RAFT_HOST_DEVICE inline auto roundDown(T x)
+{
+#if defined(_RAFT_HAS_CUDA)
+  return Pow2<kIndexGroupSize>::roundDown(x);
+#else
+  return raft::round_down_safe(x, kIndexGroupSize);
+#endif
+}
+
+template <typename T>
+_RAFT_HOST_DEVICE inline auto mod(T x)
+{
+#if defined(_RAFT_HAS_CUDA)
+  return Pow2<kIndexGroupSize>::mod(x);
+#else
+  return x % kIndexGroupSize;
+#endif
+}
+
 /**
  * Write one flat code into a block by the given offset. The offset indicates the id of the record
  * in the list. This function interleaves the code and is intended to later copy the interleaved
@@ -42,12 +68,12 @@ _RAFT_HOST_DEVICE void pack_1(
   const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset)
 {
   // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
+  // using interleaved_group = Pow2<kIndexGroupSize>;
 
   // Interleave dimensions of the source vector while recording it.
   // NB: such `veclen` is selected, that `dim % veclen == 0`
-  auto group_offset = interleaved_group::roundDown(offset);
-  auto ingroup_id   = interleaved_group::mod(offset) * veclen;
+  auto group_offset = roundDown(offset);
+  auto ingroup_id   = mod(offset) * veclen;
 
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
@@ -74,11 +100,11 @@ _RAFT_HOST_DEVICE void unpack_1(
   const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset)
 {
   // The data is written in interleaved groups of `index::kGroupSize` vectors
-  using interleaved_group = Pow2<kIndexGroupSize>;
+  // using interleaved_group = Pow2<kIndexGroupSize>;
 
   // NB: such `veclen` is selected, that `dim % veclen == 0`
-  auto group_offset = interleaved_group::roundDown(offset);
-  auto ingroup_id   = interleaved_group::mod(offset) * veclen;
+  auto group_offset = roundDown(offset);
+  auto ingroup_id   = mod(offset) * veclen;
 
   for (uint32_t l = 0; l < dim; l += veclen) {
     for (uint32_t j = 0; j < veclen; j++) {
diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
index 3f9d9325c4..096e8051c3 100644
--- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh
@@ -30,26 +30,80 @@ namespace raft::neighbors::ivf_flat::helpers {
 
 namespace codepacker {
 
+/**
+ * Write flat codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data  = index.lists()[label]->data.view();
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<T>(res, n_vec, index.dim());
+ *   ... prepare n_vecs to pack into the list in codes ...
+ *   // write codes into the list starting from the 42nd position
+ *   ivf_pq::helpers::codepacker::pack(
+ *       res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data);
+ * @endcode
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[in] codes flat codes [n_vec, dim]
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset how many records to skip before writing the data into the list
+ * @param[inout] list_data block to write into
+ */
 template <typename T, typename IdxT>
-void pack_full_list(
+void pack(
   raft::resources const& res,
   device_matrix_view<const T, uint32_t, row_major> codes,
   uint32_t veclen,
-  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
-    list_data)
+  uint32_t offset,
+  device_mdspan<T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data)
 {
-  raft::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, list_data);
+  raft::neighbors::ivf_flat::detail::pack_list_data<T, IdxT>(res, codes, veclen, offset, list_data);
 }
 
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<T>(res, n_take, index.dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_pq::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view());
+ * @endcode
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] veclen size of interleaved data chunks
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[inout] codes
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be <= the list size.
+ */
 template <typename T, typename IdxT>
-void unpack_full_list(
+void unpack(
   raft::resources const& res,
-  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major>
-    list_data,
+  device_mdspan<const T, typename list_spec<uint32_t, T, IdxT>::list_extents, row_major> list_data,
   uint32_t veclen,
+  uint32_t offset,
   device_matrix_view<T, uint32_t, row_major> codes)
 {
-  raft::neighbors::ivf_flat::detail::unpack_list_data<T, IdxT>(res, list_data, veclen, codes);
+  raft::neighbors::ivf_flat::detail::unpack_list_data<T, IdxT>(
+    res, list_data, veclen, offset, codes);
 }
 }  // namespace codepacker
 /** @} */
diff --git a/cpp/include/raft/neighbors/ivf_list.hpp b/cpp/include/raft/neighbors/ivf_list.hpp
index 6bdd0578ed..ad06a3ee71 100644
--- a/cpp/include/raft/neighbors/ivf_list.hpp
+++ b/cpp/include/raft/neighbors/ivf_list.hpp
@@ -67,12 +67,6 @@ list<SpecT, SizeT, SpecExtraArgs...>::list(raft::resources const& res,
                  indices.data_handle(),
                  indices.size(),
                  ivf::kInvalidRecord<index_type>);
-  
-  // Fill the data buffer with a pre-defined marker for easier debugging
-  // thrust::fill_n(resource::get_thrust_policy(res),
-  //                data.data_handle(),
-  //                data.size(),
-  //                ivf::kPadElem<value_type>);
 }
 
 /**
diff --git a/cpp/include/raft/neighbors/ivf_list_types.hpp b/cpp/include/raft/neighbors/ivf_list_types.hpp
index f9858c3d61..6317825201 100644
--- a/cpp/include/raft/neighbors/ivf_list_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_list_types.hpp
@@ -34,14 +34,6 @@ template <typename IdxT>
 constexpr static IdxT kInvalidRecord =
   (std::is_signed_v<IdxT> ? IdxT{0} : std::numeric_limits<IdxT>::max()) - 1;
 
-// /**
-//  * Default value filled in the `data` array.
-//  * One may encounter it trying to access a record within a list that is outside of the
-//  * `size` bound or whenever the list is allocated but not filled-in yet.
-//  */
-// template <typename T>
-// constexpr static T kPadElem = T{0};
-
 /** The data for a single IVF list. */
 template <template <typename, typename...> typename SpecT,
           typename SizeT,
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 4cff91bad4..7b9cf75c3f 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -17,19 +17,18 @@
 
 #include "../test_utils.cuh"
 #include "ann_utils.cuh"
-#include "raft/core/device_mdarray.hpp"
-#include "raft/core/host_mdarray.hpp"
-#include "raft/core/logger-macros.hpp"
-#include "raft/core/mdspan.hpp"
-#include "raft/core/mdspan_types.hpp"
-#include "raft/linalg/map.cuh"
-#include "raft/neighbors/ivf_flat_types.hpp"
-#include "raft/neighbors/ivf_list.hpp"
-#include "raft/util/cudart_utils.hpp"
-#include "raft/util/fast_int_div.cuh"
-#include "thrust/functional.h"
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+#include <raft/neighbors/ivf_list.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <raft/util/fast_int_div.cuh>
+#include <thrust/functional.h>
 
 #include <raft_internal/neighbors/naive_knn.cuh>
 
@@ -90,7 +89,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   {
   }
 
-  // void construct_pad_mask(raft::resources const& handle, uint32_t list_size, uint32_t dim, raft::device_vector_view<bool> mask) {
+  // void construct_pad_mask(raft::resources const& handle, uint32_t list_size, uint32_t dim,
+  // raft::device_vector_view<bool> mask) {
   //   using interleaved_group = Pow2<kIndexGroupSize>;
   //   uint32_t padded_list_size = interleaved_group::roundUp(list_size);
   //   linalg::map_offset(handle, mask, [=] __device__ (auto i) {
@@ -103,9 +103,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   //       return ingroup_id < (padded_list_size - max_group_offset);
   //       return true;
   //     });
-// }
+  // }
 
-//  protected:
+  //  protected:
   void testIVFFlat()
   {
     size_t queries_size = ps.num_queries * ps.k;
@@ -155,7 +155,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                                database.data(),
                                ps.num_db_vecs,
                                ps.dim);
-        
+
         resource::sync_stream(handle_);
         approx_knn_search(handle_,
                           distances_ivfflat_dev.data(),
@@ -164,7 +164,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
                           ps.k,
                           search_queries.data(),
                           ps.num_queries);
-        
+
         update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_);
         update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_);
         resource::sync_stream(handle_);
@@ -300,7 +300,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     index_params.n_lists          = ps.nlist;
     index_params.metric           = ps.metric;
     index_params.adaptive_centers = false;
-    search_params.n_probes = ps.nprobe;
+    search_params.n_probes        = ps.nprobe;
 
     index_params.add_data_on_build        = false;
     index_params.kmeans_trainset_fraction = 1.0;
@@ -315,8 +315,10 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
     index<DataT, IdxT> extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx);
 
     auto list_sizes = raft::make_host_vector<uint32_t>(idx.n_lists());
-    update_host(
-      list_sizes.data_handle(), extend_index.list_sizes().data_handle(), extend_index.n_lists(), stream_);
+    update_host(list_sizes.data_handle(),
+                extend_index.list_sizes().data_handle(),
+                extend_index.n_lists(),
+                stream_);
     resource::sync_stream(handle_);
 
     auto& lists = idx.lists();
@@ -339,121 +341,86 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
 
       if (list_size > 0) {
         uint32_t padded_list_size = interleaved_group::roundUp(list_size);
-      uint32_t n_elems = padded_list_size * idx.dim();
-      auto list_data = lists[label]->data;
-      auto list_inds = extend_index.lists()[label]->indices;
-
-      // fetch the flat codes
-      auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
-
-      matrix::gather(handle_,
-                     make_device_matrix_view<const DataT, uint32_t>((const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), idx.dim()),
-                     make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds.data_handle(), list_size),
-                     flat_codes.view());
-      
-      helpers::codepacker::pack_full_list<DataT, IdxT>(
-        handle_,
-        make_const_mdspan(flat_codes.view()),
-        idx.veclen(),
-        list_data.view());
+        uint32_t n_elems          = padded_list_size * idx.dim();
+        auto list_data            = lists[label]->data;
+        auto list_inds            = extend_index.lists()[label]->indices;
+
+        // fetch the flat codes
+        auto flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
+
+        matrix::gather(
+          handle_,
+          make_device_matrix_view<const DataT, uint32_t>(
+            (const DataT*)database.data(), static_cast<uint32_t>(ps.num_db_vecs), idx.dim()),
+          make_device_vector_view<const IdxT, uint32_t>((const IdxT*)list_inds.data_handle(),
+                                                        list_size),
+          flat_codes.view());
+
+        helpers::codepacker::pack<DataT, IdxT>(
+          handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view());
+
+        {
+          auto mask = make_device_vector<bool>(handle_, n_elems);
+
+          linalg::map_offset(
+            handle_,
+            mask.view(),
+            [dim = idx.dim(), list_size, padded_list_size, veclen = idx.veclen()] __device__(
+              auto i) {
+              uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+              if (i < max_group_offset * dim) { return true; }
+              uint32_t surplus    = (i - max_group_offset * dim);
+              uint32_t ingroup_id = interleaved_group::mod(surplus / veclen);
+              return ingroup_id < (list_size - max_group_offset);
+            });
+
+          // ensure that the correct number of indices are masked out
+          ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_),
+                                     mask.data_handle(),
+                                     mask.data_handle() + n_elems,
+                                     0) == list_size * ps.dim);
+
+          auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+
+          linalg::map_offset(handle_,
+                             packed_list_data.view(),
+                             [mask      = mask.data_handle(),
+                              list_data = list_data.data_handle()] __device__(uint32_t i) {
+                               if (mask[i]) return list_data[i];
+                               return DataT{0};
+                             });
+
+          auto extend_data          = extend_index.lists()[label]->data;
+          auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
+          linalg::map_offset(handle_,
+                             extend_data_filtered.view(),
+                             [mask        = mask.data_handle(),
+                              extend_data = extend_data.data_handle()] __device__(uint32_t i) {
+                               if (mask[i]) return extend_data[i];
+                               return DataT{0};
+                             });
+
+          ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(),
+                                        extend_data_filtered.data_handle(),
+                                        n_elems,
+                                        raft::Compare<DataT>(),
+                                        stream_));
+        }
 
-      {
-    //     auto mask = make_device_vector<bool>(handle_, n_elems);
-
-    // linalg::map_offset(handle_, mask.view(), [dim = idx.dim(), list_size, padded_list_size, veclen = idx.veclen()] __device__ (auto i) {
-    //     uint32_t max_group_offset = interleaved_group::roundDown(list_size);
-    //     if (i < max_group_offset * dim) {
-    //       return true;
-    //     }
-    //     uint32_t surplus = (i - max_group_offset * dim);
-    //     uint32_t ingroup_id = interleaved_group::mod( surplus / veclen);
-    //     return ingroup_id < (list_size - max_group_offset);
-    //   });
-
-    //   // ensure that the correct number of indices are masked out
-    //   ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_), mask.data_handle(), mask.data_handle() + n_elems, 0) == list_size * ps.dim);
-
-    //     auto packed_list_data = make_device_vector<DataT, uint32_t>(handle_, n_elems);
-
-    //     linalg::map_offset(handle_, packed_list_data.view(), [mask = mask.data_handle(), list_data= list_data.data_handle()] __device__ (uint32_t i) {
-    //       if (mask[i]) return list_data[i];
-    //       return DataT{0};
-    //     });
-
-    //   auto extend_data = extend_index.lists()[label]->data;
-    //   auto extend_data_filtered = make_device_vector<DataT, uint32_t>(handle_, n_elems);
-    // linalg::map_offset(handle_, extend_data_filtered.view(), [mask = mask.data_handle(), extend_data = extend_data.data_handle()] __device__ (uint32_t i) {
-    //       if (mask[i]) return extend_data[i];
-    //       return DataT{0};
-    //     });
-
-      ASSERT_TRUE(raft::devArrMatch(list_data.data_handle(),
-                                    extend_index.lists()[label]->data.data_handle(),
-                                    n_elems,
-                                    raft::Compare<DataT>(),
-                                    stream_));
-      }
-      // raft::print_device_vector("list_data", list_data.data_handle(), n_elems, std::cout);
-      // raft::print_device_vector("exte_data", extend_index.lists()[label]->data.data_handle(), n_elems, std::cout);
-
-      auto unpacked_flat_codes = make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
-
-      helpers::codepacker::unpack_full_list<DataT, IdxT>(
-        handle_,
-        list_data.view(),
-        idx.veclen(),
-        unpacked_flat_codes.view());
-
-      ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
-                                    unpacked_flat_codes.data_handle(),
-                                    list_size * ps.dim,
-                                    raft::Compare<DataT>(),
-                                    stream_));
+        auto unpacked_flat_codes =
+          make_device_matrix<DataT, uint32_t>(handle_, list_size, idx.dim());
+
+        helpers::codepacker::unpack<DataT, IdxT>(
+          handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view());
+
+        ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(),
+                                      unpacked_flat_codes.data_handle(),
+                                      list_size * ps.dim,
+                                      raft::Compare<DataT>(),
+                                      stream_));
       }
     }
-    // auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
-    //       search_queries.data(), ps.num_queries, ps.dim);
-    //     auto indices_out_1 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
-    //     auto dists_out = raft::make_device_matrix<T, IdxT>(handle_, ps.num_queries, ps.k);
-    //     auto indices_out_2 = raft::make_device_matrix<IdxT, IdxT>(handle_, ps.num_queries, ps.k);
-    // 
-    // raft::print_device_vector("idx_center_norms", idx.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
-    // raft::print_device_vector("extend_center_norms", extend_index.centers().data_handle(), idx.n_lists() * ps.dim, std::cout);
-
-        // Precompute the centers vector norms for L2Expanded distance
-    // idx.allocate_center_norms(handle_);
-    // // if (idx.center_norms().has_value()) {
-    //   raft::linalg::rowNorm(idx.center_norms()->data_handle(),
-    //                         idx.centers().data_handle(),
-    //                         idx.dim(),
-    //                         idx.n_lists(),
-    //                         raft::linalg::L2Norm,
-    //                         true,
-    //                         stream_);
-    //   RAFT_LOG_TRACE_VEC(idx->center_norms()->data_handle(), std::min<uint32_t>(idx.dim(), 20));
-    // }
-
-    // raft::print_device_vector("idx_center_norms", idx.center_norms()->data_handle(), idx.n_lists(), std::cout);
-    // raft::print_device_vector("ext_center_norms", extend_index.center_norms()->data_handle(), idx.n_lists(), std::cout);
-        // ivf_flat::search(handle_,
-        //                  search_params,
-        //                  idx,
-        //                  search_queries_view,
-        //                  indices_out_1.view(),
-        //                  dists_out.view());
-        // ivf_flat::search(handle_,
-        //                  search_params,
-        //                  extend_index,
-        //                  search_queries_view,
-        //                  indices_out_2.view(),
-        //                  dists_out.view());
-        // ASSERT_TRUE(raft::devArrMatch(indices_out_1.data_handle(),
-        //                             indices_out_2.data_handle(),
-        //                             ps.num_queries * ps.k,
-        //                             raft::Compare<IdxT>(),
-        //                             stream_));
-    }
-
+  }
 
   void SetUp() override
   {
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index 553c10e0f4..3bfea283e5 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -21,7 +21,11 @@
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
-TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testPacker();}
+TEST_P(AnnIVFFlatTestF, AnnIVFFlat)
+{
+  this->testIVFFlat();
+  this->testPacker();
+}
 
 INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));
 

From 15db0c6a6256e290a1fc83342bb8793fdb191047 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 31 Jul 2023 11:07:43 -0700
Subject: [PATCH 18/22] remove unneccessary changes

---
 .../raft/spatial/knn/detail/ann_quantized.cuh   |  1 -
 cpp/test/neighbors/ann_ivf_flat.cuh             | 17 -----------------
 2 files changed, 18 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 25339a6391..964292f6cb 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -18,7 +18,6 @@
 
 #include "../ann_common.h"
 #include "../ivf_flat.cuh"
-#include <cstring>
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include "processing.cuh"
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 7b9cf75c3f..e95608d2a4 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -89,23 +89,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
   {
   }
 
-  // void construct_pad_mask(raft::resources const& handle, uint32_t list_size, uint32_t dim,
-  // raft::device_vector_view<bool> mask) {
-  //   using interleaved_group = Pow2<kIndexGroupSize>;
-  //   uint32_t padded_list_size = interleaved_group::roundUp(list_size);
-  //   linalg::map_offset(handle, mask, [=] __device__ (auto i) {
-  //       uint32_t row = i / dim;
-  //       uint32_t max_group_offset = interleaved_group::roundDown(list_size);
-  //       if (row < max_group_offset) {
-  //         return true;
-  //       }
-  //       uint32_t ingroup_id = interleaved_group::mod(row);
-  //       return ingroup_id < (padded_list_size - max_group_offset);
-  //       return true;
-  //     });
-  // }
-
-  //  protected:
   void testIVFFlat()
   {
     size_t queries_size = ps.num_queries * ps.k;

From 154dc6d2615935497303361a2d80f720c956d571 Mon Sep 17 00:00:00 2001
From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com>
Date: Mon, 31 Jul 2023 11:10:21 -0700
Subject: [PATCH 19/22] Delete log.txt

---
 log.txt | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 log.txt

diff --git a/log.txt b/log.txt
deleted file mode 100644
index 733021b7ab..0000000000
--- a/log.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-========= COMPUTE-SANITIZER
-========= Target application doesn't exist or is not a valid executable

From 47d6421849438dbd5e3d261d3279cd5efdb47f90 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 31 Jul 2023 11:11:27 -0700
Subject: [PATCH 20/22] updates

---
 cpp/test/neighbors/ann_ivf_flat.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index e95608d2a4..b07e70d0d6 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -497,4 +497,5 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
    raft::matrix::detail::select::warpsort::kMaxCapacity * 4,
    raft::distance::DistanceType::InnerProduct,
    false}};
+
 }  // namespace raft::neighbors::ivf_flat

From e2e13089271bb3e91be9cc51896da28ab3c43111 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 31 Jul 2023 11:54:46 -0700
Subject: [PATCH 21/22] ore cleanup

---
 cpp/include/raft/neighbors/ivf_flat_types.hpp | 1 -
 cpp/test/neighbors/ann_ivf_flat.cuh           | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 6032da8556..2e2e49cdbc 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -113,7 +113,6 @@ struct list_spec {
   /** Determine the extents of an array enough to hold a given amount of data. */
   constexpr auto make_list_extents(SizeT n_rows) const -> list_extents
   {
-    // return make_extents<SizeT>(round_up_safe(n_rows, this->align_min), dim);
     return make_extents<SizeT>(n_rows, dim);
   }
 };
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index b07e70d0d6..2252290eb2 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -348,12 +348,12 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
           linalg::map_offset(
             handle_,
             mask.view(),
-            [dim = idx.dim(), list_size, padded_list_size, veclen = idx.veclen()] __device__(
+            [dim = idx.dim(), list_size, padded_list_size, chunk_size = util::FastIntDiv(idx.veclen())] __device__(
               auto i) {
               uint32_t max_group_offset = interleaved_group::roundDown(list_size);
               if (i < max_group_offset * dim) { return true; }
               uint32_t surplus    = (i - max_group_offset * dim);
-              uint32_t ingroup_id = interleaved_group::mod(surplus / veclen);
+              uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size);
               return ingroup_id < (list_size - max_group_offset);
             });
 

From 41a49b2e22198297798de834fca596e19f16ddae Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 31 Jul 2023 12:13:53 -0700
Subject: [PATCH 22/22] style

---
 cpp/test/neighbors/ann_ivf_flat.cuh | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index 2252290eb2..d72d73680a 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -345,17 +345,18 @@ class AnnIVFFlatTest : public ::testing::TestWithParam<AnnIvfFlatInputs<IdxT>> {
         {
           auto mask = make_device_vector<bool>(handle_, n_elems);
 
-          linalg::map_offset(
-            handle_,
-            mask.view(),
-            [dim = idx.dim(), list_size, padded_list_size, chunk_size = util::FastIntDiv(idx.veclen())] __device__(
-              auto i) {
-              uint32_t max_group_offset = interleaved_group::roundDown(list_size);
-              if (i < max_group_offset * dim) { return true; }
-              uint32_t surplus    = (i - max_group_offset * dim);
-              uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size);
-              return ingroup_id < (list_size - max_group_offset);
-            });
+          linalg::map_offset(handle_,
+                             mask.view(),
+                             [dim = idx.dim(),
+                              list_size,
+                              padded_list_size,
+                              chunk_size = util::FastIntDiv(idx.veclen())] __device__(auto i) {
+                               uint32_t max_group_offset = interleaved_group::roundDown(list_size);
+                               if (i < max_group_offset * dim) { return true; }
+                               uint32_t surplus    = (i - max_group_offset * dim);
+                               uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size);
+                               return ingroup_id < (list_size - max_group_offset);
+                             });
 
           // ensure that the correct number of indices are masked out
           ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_),