diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 630b8788f8..d6e4ecb676 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -62,7 +62,7 @@ repos: entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] - exclude: .*/thirdparty/.* + exclude: .*/thirdparty/.*|.*FindAVX.cmake.* # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. additional_dependencies: diff --git a/build.sh b/build.sh index 9468d2cab0..3758dc26c4 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# raft build script +# raft build scripts # This script is used to build the component(s) in this repo from # source, and can be called with various options to customize the @@ -15,11 +15,11 @@ NUMARGS=$# ARGS=$* # NOTE: ensure all dir changes are relative to the location of this -# script, and that this script resides in the repo dir! +# scripts, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests bench template clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn -h" -HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench=] +VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn -h" +HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] where is: clean - remove all existing build artifacts and configuration (start over) libraft - build the raft C++ code only. Also builds the C-wrapper library @@ -28,7 +28,8 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool= is: @@ -39,7 +40,8 @@ HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool==3.23.1,!=3.25.0 +- cuda-profiler-api=11.8.86 +- cudatoolkit=11.8 +- cxx-compiler +- cython>=0.29,<0.30 +- faiss-proc=*=cuda +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1 +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/build_libraft_nn_bench.sh b/conda/recipes/libraft/build_libraft_nn_bench.sh new file mode 100644 index 0000000000..dc6250f0f4 --- /dev/null +++ b/conda/recipes/libraft/build_libraft_nn_bench.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +./build.sh tests bench-ann --allgpuarch --no-nvtx +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/libraft/build_libraft_tests.sh b/conda/recipes/libraft/build_libraft_tests.sh index aa2c1b3e89..cc28f93fb8 100644 --- a/conda/recipes/libraft/build_libraft_tests.sh +++ b/conda/recipes/libraft/build_libraft_tests.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh tests bench --allgpuarch --no-nvtx +./build.sh tests bench-prims --allgpuarch --no-nvtx cmake --install cpp/build --component testing diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index e1079f4db8..2a66f213a7 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -19,6 +19,18 @@ nccl_version: gtest_version: - "=1.10.0" +glog_version: + - ">=0.6.0" + +faiss_version: + - ">=1.7.1" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" + # The CTK libraries below are missing from the conda-forge::cudatoolkit # package. 
The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index f911166a9a..7859807777 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -186,3 +186,47 @@ outputs: home: https://rapids.ai/ license: Apache-2.0 summary: libraft template + - name: libraft-ann-bench + version: {{ version }} + script: build_libraft_nn_bench.sh + build: + script_env: *script_env + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + - {{ compiler('cuda') }} + requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cuda') }} {{ cuda_version }} + - {{ compiler('cxx') }} + - cmake {{ cmake_version }} + - ninja + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-profiler-api {{ cuda_profiler_api_host_version }} + - libcublas {{ libcublas_host_version }} + - libcublas-dev {{ libcublas_host_version }} + - libcurand {{ libcurand_host_version }} + - libcurand-dev {{ libcurand_host_version }} + - libcusolver {{ libcusolver_host_version }} + - libcusolver-dev {{ libcusolver_host_version }} + - libcusparse {{ libcusparse_host_version }} + - libcusparse-dev {{ libcusparse_host_version }} + - glog {{ glog_version }} + - nlohmann_json {{ nlohmann_json_version }} + - libfaiss>=1.7.1 + - faiss-proc=*=cuda + run: + - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - glog {{ glog_version }} + - faiss-proc=*=cuda + - libfaiss {{ faiss_version }} + - h5py {{ h5py_version }} + about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: libraft ann bench diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 
c1704552ec..7bb458c44a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -46,7 +46,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_SHARED_LIBS "Build raft shared libraries" ON) option(BUILD_TESTS "Build raft unit-tests" ON) -option(BUILD_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_PRIMS_BENCH "Build raft C++ benchmark tests" OFF) +option(BUILD_ANN_BENCH "Build raft ann benchmarks" OFF) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF @@ -58,14 +59,20 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) set(RAFT_COMPILE_LIBRARY_DEFAULT OFF) -if(BUILD_TESTS OR BUILD_BENCH) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH +) set(RAFT_COMPILE_LIBRARY_DEFAULT ON) endif() option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations" ${RAFT_COMPILE_LIBRARY_DEFAULT} ) -if(BUILD_TESTS OR BUILD_BENCH) +if(BUILD_TESTS + OR BUILD_PRIMS_BENCH + OR BUILD_ANN_BENCH +) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs # to have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value in first run and subsequent runs. 
@@ -78,7 +85,7 @@ include(CMakeDependentOption) message(VERBOSE "RAFT: Building optional components: ${raft_FIND_COMPONENTS}") message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}") -message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_BENCH}") +message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_PRIMS_BENCH}") message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") message(VERBOSE "RAFT: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS}) message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}") @@ -159,7 +166,7 @@ if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) endif() -if(BUILD_BENCH) +if(BUILD_PRIMS_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench() endif() @@ -647,7 +654,7 @@ raft_export( # ################################################################################################## # * shared test/bench headers ------------------------------------------------ -if(BUILD_TESTS OR BUILD_BENCH) +if(BUILD_TESTS OR BUILD_PRIMS_BENCH) include(internal/CMakeLists.txt) endif() @@ -661,6 +668,13 @@ endif() # ################################################################################################## # * build benchmark executable ----------------------------------------------- -if(BUILD_BENCH) - include(bench/CMakeLists.txt) +if(BUILD_PRIMS_BENCH) + include(bench/prims/CMakeLists.txt) +endif() + +# ################################################################################################## +# * build ann benchmark executable ----------------------------------------------- + +if(BUILD_ANN_BENCH) + include(bench/ann/CMakeLists.txt) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt new file mode 100644 index 0000000000..6267be518e --- /dev/null +++ b/cpp/bench/ann/CMakeLists.txt @@ -0,0 +1,160 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA 
CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) + +find_package(Threads REQUIRED) + +set(RAFT_ANN_BENCH_USE_FAISS OFF) +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN + OR RAFT_ANN_BENCH_USE_FAISS_IVF_PQ + OR RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT +) + set(RAFT_ANN_BENCH_USE_FAISS ON) +endif() + +set(RAFT_ANN_BENCH_USE_RAFT OFF) +if(RAFT_ANN_BENCH_USE_RAFT_BFKNN + OR RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +) + set(RAFT_ANN_BENCH_USE_RAFT ON) +endif() + 
+if(RAFT_ANN_BENCH_USE_HNSWLIB) + include(cmake/thirdparty/get_hnswlib.cmake) +endif() + +option(RAFT_ANN_BENCH_USE_MULTIGPU "Use multi-gpus (where possible) in benchmarks" OFF) + +include(cmake/thirdparty/get_nlohmann_json.cmake) + +if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_ggnn.cmake) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS) + include(cmake/thirdparty/get_faiss.cmake) +endif() + +function(ConfigureAnnBench) + + set(oneValueArgs NAME) + set(multiValueArgs PATH LINKS CXXFLAGS INCLUDES) + + cmake_parse_arguments( + ConfigureAnnBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} + ) + + set(BENCH_NAME ${ConfigureAnnBench_NAME}_ANN_BENCH) + + add_executable( + ${BENCH_NAME} ${ConfigureAnnBench_PATH} bench/ann/src/common/conf.cpp + bench/ann/src/common/util.cpp + ) + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + nlohmann_json::nlohmann_json + $<$:NCCL::NCCL> + ${ConfigureAnnBench_LINKS} + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + set(${ConfigureAnnBench_CXXFLAGS} ${RAFT_CXX_FLAGS} ${ConfigureAnnBench_CXXFLAGS}) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${ConfigureAnnBench_CXXFLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + if(RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}) + target_compile_definitions( + ${BENCH_NAME} + PUBLIC + RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME}=RAFT_ANN_BENCH_USE_${ConfigureAnnBench_NAME} + ) + endif() + + target_include_directories( + ${BENCH_NAME} + PUBLIC "$" + PRIVATE ${ConfigureAnnBench_INCLUDES} + ) + + install( + TARGETS ${BENCH_NAME} + COMPONENT ann_bench + DESTINATION bin/ann + EXCLUDE_FROM_ALL + ) +endfunction() + +if(RAFT_ANN_BENCH_USE_HNSWLIB) + ConfigureAnnBench( + NAME HNSWLIB PATH 
bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT) + ConfigureAnnBench( + NAME + RAFT_IVF_PQ + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_ivf_pq.cu> + $<$:bench/ann/src/raft/raft_ivf_flat.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS) + ConfigureAnnBench( + NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) +endif() + +if(RAFT_ANN_BENCH_USE_GGNN) + include(cmake/thirdparty/get_glog.cmake) + ConfigureAnnBench( + NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src/include LINKS glog::glog + ) +endif() diff --git a/cpp/bench/ann/README.md b/cpp/bench/ann/README.md new file mode 100644 index 0000000000..1a8af2e448 --- /dev/null +++ b/cpp/bench/ann/README.md @@ -0,0 +1,3 @@ +# RAFT CUDA ANN Benchmarks + +Please see the [ANN Benchmarks](https://docs.rapids.ai/api/raft/stable/cuda_ann_benchmarks.html) section of the RAFT documentation for instructions on building and using the ANN benchmarks. 
\ No newline at end of file diff --git a/cpp/bench/ann/conf/bigann-100M.json b/cpp/bench/ann/conf/bigann-100M.json new file mode 100644 index 0000000000..5f16f3378d --- /dev/null +++ b/cpp/bench/ann/conf/bigann-100M.json @@ -0,0 +1,174 @@ +{ + "dataset" : { + "name" : "bigann-100M", + "base_file" : "data/bigann-1B/base.1B.u8bin", + "subset_size" : 100000000, + "query_file" : "data/bigann-1B/query.public.10K.u8bin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name": "raft_ivf_pq.dimpq64-cluster5K-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "niter": 25, + "nlist": 5000, + "pq_dim": 64, + "ratio": 10 + }, + "file": "index/bigann-100M/raft_ivf_pq/dimpq64-cluster5K", + "search_params": [ + { + "numProbes": 20, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 30, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 40, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "numProbes": 1000, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/bigann-100M/raft_ivf_pq/dimpq64-cluster5K-float-float" + }, + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, 
"numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/bigann-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/bigann-100M/hnswlib/M36" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : 
"index/bigann-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, "nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/bigann-100M/ivf_flat/nlist100K" + }, + + + + ] +} diff --git a/cpp/bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json new file mode 100644 index 0000000000..b3a945d50e --- /dev/null +++ b/cpp/bench/ann/conf/deep-100M.json @@ -0,0 +1,223 @@ +{ + "dataset" : { + "name" : "deep-100M", + "base_file" : "data/deep-1B/base.1B.fbin", + "subset_size" : 100000000, + "query_file" : "data/deep-1B/query.public.10K.fbin", + "distance" : "euclidean" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, 
"numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/deep-100M/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/deep-100M/hnswlib/M36" + }, + { + "name" : "faiss_ivf_flat.nlist50K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":50000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist50K" + }, + { + "name" : "faiss_ivf_flat.nlist100K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":100000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : 
"result/deep-100M/faiss_ivf_flat/nlist100K" + }, + { + "name" : "faiss_ivf_flat.nlist200K", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":200000}, + "file" : "index/deep-100M/faiss_ivf_flat/nlist200K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_flat/nlist200K" + }, + + + { + "name" : "faiss_ivf_pq.M48-nlist16K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist16K", + "search_params" : [ + {"nprobe":10}, + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist16K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist50K" + }, + { + "name" : "faiss_ivf_pq.M48-nlist100K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":100000, "M":48}, + "file" : "index/deep-100M/faiss_ivf_pq/M48-nlist100K", + "search_params" : [ + {"nprobe":20}, + {"nprobe":30}, + {"nprobe":40}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/deep-100M/faiss_ivf_pq/M48-nlist100K" + }, + + + { + "name" : "ivf_flat.nlist100K", + "algo" : "ivf_flat", + "build_param": { + "nlist" : 100000, + "niter" : 25, + "ratio" : 5 + }, + "file" : "index/deep-100M/ivf_flat/nlist100K", + "search_params" : [ + {"max_batch":10000, "max_k":10, "nprobe":20}, + {"max_batch":10000, "max_k":10, 
"nprobe":30}, + {"max_batch":10000, "max_k":10, "nprobe":40}, + {"max_batch":10000, "max_k":10, "nprobe":50}, + {"max_batch":10000, "max_k":10, "nprobe":100}, + {"max_batch":10000, "max_k":10, "nprobe":200}, + {"max_batch":10000, "max_k":10, "nprobe":500}, + {"max_batch":10000, "max_k":10, "nprobe":1000} + ], + "search_result_file" : "result/deep-100M/ivf_flat/nlist100K" + }, + + + ] +} diff --git a/cpp/bench/ann/conf/deep-1B.json b/cpp/bench/ann/conf/deep-1B.json new file mode 100644 index 0000000000..50d1b87602 --- /dev/null +++ b/cpp/bench/ann/conf/deep-1B.json @@ -0,0 +1,38 @@ +{ + "dataset" : { + "name" : "deep-1B", + "base_file" : "data/deep-1B/base.1B.fbin", + "query_file" : "data/deep-1B/query.public.10K.fbin", + // although distance should be "euclidean", faiss becomes much slower for that + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 10000, + "k" : 10, + "run_count" : 2 + }, + + "index" : [ + { + "name" : "faiss_ivf_pq.M48-nlist50K", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":50000, "M":48}, + "file" : "index/deep-1B/faiss_ivf_pq/M48-nlist50K", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/deep-1B/faiss_ivf_pq/M48-nlist50K" + }, + + + ] +} diff --git a/cpp/bench/ann/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json new file mode 100644 index 0000000000..d210aca654 --- /dev/null +++ b/cpp/bench/ann/conf/glove-100-inner.json @@ -0,0 +1,797 @@ +{ + "dataset" : { + "name" : "glove-100-inner", + "base_file" : "data/glove-100-inner/base.fbin", + "query_file" : "data/glove-100-inner/query.fbin", + "distance" : "inner_product" + }, + + "search_basic_param" : { + "batch_size" : 1, + "k" : 10, + "run_count" : 3 + }, + + "index" : [ + { + "name" : "hnswlib.M4", + "algo" : "hnswlib", + "build_param": {"M":4, "efConstruction":500, 
"numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M4", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M4" + }, + + { + "name" : "hnswlib.M8", + "algo" : "hnswlib", + "build_param": {"M":8, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M8", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M8" + }, + + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M12" + }, + + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M16" + }, + + 
{ + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M24" + }, + + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M36" + }, + + { + "name" : "hnswlib.M48", + "algo" : "hnswlib", + "build_param": {"M":48, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M48", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M48" + }, + + { + "name" : "hnswlib.M64", + "algo" : "hnswlib", + "build_param": {"M":64, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M64", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, 
+ {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M64" + }, + + { + "name" : "hnswlib.M96", + "algo" : "hnswlib", + "build_param": {"M":96, "efConstruction":500, "numThreads":4}, + "file" : "index/glove-100-inner/hnswlib/M96", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/glove-100-inner/hnswlib/M96" + }, + + { + "name" : "faiss_ivf_flat.nlist1024", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":1024}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist1024" + }, + + { + "name" : "faiss_ivf_flat.nlist2048", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":2048}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist2048" + }, + + { + "name" : "faiss_ivf_flat.nlist4096", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":4096}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist4096" + }, + + { + "name" : "faiss_ivf_flat.nlist8192", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":8192}, + "file" : 
"index/glove-100-inner/faiss_ivf_flat/nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist8192" + }, + + { + "name" : "faiss_ivf_flat.nlist16384", + "algo" : "faiss_gpu_ivf_flat", + "build_param": {"nlist":16384}, + "file" : "index/glove-100-inner/faiss_ivf_flat/nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_flat/nlist16384" + }, + + + + { + "name" : "faiss_ivf_pq.M2-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : 
"result/glove-100-inner/faiss_ivf_pq/M2-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M2-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":2}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M2-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M2-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist4096", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist4096", + "search_params" : [ + {"nprobe":1}, + 
{"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M4-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":4}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M4-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M4-nlist16384" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist1024", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":1024, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist1024", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist1024" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist2048", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":2048, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist2048", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist2048" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist4096", + "algo" : 
"faiss_gpu_ivf_pq", + "build_param": {"nlist":4096, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist4096", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist4096" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist8192", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":8192, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist8192", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist8192" + }, + + { + "name" : "faiss_ivf_pq.M20-nlist16384", + "algo" : "faiss_gpu_ivf_pq", + "build_param": {"nlist":16384, "M":20}, + "file" : "index/glove-100-inner/faiss_ivf_pq/M20-nlist16384", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_pq/M20-nlist16384" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist1024-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist2048-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + 
{"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist4096-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-fp16" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-fp16", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"fp16"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-fp16", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-fp16" + }, + + + { + "name" : "faiss_ivf_sq.nlist1024-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":1024, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist1024-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist1024-int8" + }, + + { + "name" : 
"faiss_ivf_sq.nlist2048-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":2048, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist2048-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist2048-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist4096-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":4096, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist4096-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist4096-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist8192-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":8192, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist8192-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist8192-int8" + }, + + { + "name" : "faiss_ivf_sq.nlist16384-int8", + "algo" : "faiss_gpu_ivf_sq", + "build_param": {"nlist":16384, "quantizer_type":"int8"}, + "file" : "index/glove-100-inner/faiss_ivf_sq/nlist16384-int8", + "search_params" : [ + {"nprobe":1}, + {"nprobe":5}, + {"nprobe":10}, + {"nprobe":50}, + {"nprobe":100}, + {"nprobe":200}, + {"nprobe":500}, + {"nprobe":1000}, + {"nprobe":2000} + ], + "search_result_file" : "result/glove-100-inner/faiss_ivf_sq/nlist16384-int8" + }, + + { + "name" : "faiss_flat", + "algo" : "faiss_gpu_flat", + "build_param": {}, + "file" : "index/glove-100-inner/faiss_flat/flat", + "search_params" : [{}], + "search_result_file" : 
"result/glove-100-inner/faiss_flat/flat" + }, + + { + "name" : "ggnn.kbuild96-segment64-refine2-k10", + "algo" : "ggnn", + "build_param": { + "k_build": 96, + "segment_size": 64, + "refine_iterations": 2, + "dataset_size": 1183514, + "k": 10 + }, + "file" : "index/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10", + "search_params" : [ + {"tau":0.001, "block_dim":64, "sorted_size":32}, + {"tau":0.005, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":64, "sorted_size":32}, + {"tau":0.02, "block_dim":64, "sorted_size":32}, + {"tau":0.03, "block_dim":64, "sorted_size":32}, + {"tau":0.04, "block_dim":64, "sorted_size":32}, + {"tau":0.05, "block_dim":64, "sorted_size":32}, + {"tau":0.06, "block_dim":64, "sorted_size":32}, + {"tau":0.09, "block_dim":64, "sorted_size":32}, + {"tau":0.12, "block_dim":64, "sorted_size":32}, + {"tau":0.18, "block_dim":64, "sorted_size":32}, + {"tau":0.21, "block_dim":64, "sorted_size":32}, + {"tau":0.24, "block_dim":64, "sorted_size":32}, + {"tau":0.27, "block_dim":64, "sorted_size":32}, + {"tau":0.3, "block_dim":64, "sorted_size":32}, + {"tau":0.4, "block_dim":64, "sorted_size":32}, + {"tau":0.01, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.02, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.03, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.04, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.05, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.06, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.09, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.12, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.18, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.21, 
"block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.24, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.27, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.3, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.4, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32}, + {"tau":0.5, "block_dim":128, "max_iterations":2000, "cache_size":1024, "sorted_size":32} + + ], + "search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10" + }, + + + ] + +} diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json b/cpp/bench/ann/conf/sift-128-euclidean.json new file mode 100644 index 0000000000..476c363ecd --- /dev/null +++ b/cpp/bench/ann/conf/sift-128-euclidean.json @@ -0,0 +1,1321 @@ +{ + "dataset": { + "name": "sift-128-euclidean", + "base_file": "/home/cjnolet/workspace/ann_data/sift-128-euclidean/base.fbin", + "query_file": "/home/cjnolet/workspace/ann_data/sift-128-euclidean/query.fbin", + "distance": "euclidean" + }, + "search_basic_param": { + "batch_size": 5000, + "k": 10, + "run_count": 3 + }, + "index": [ + { + "name" : "hnswlib.M12", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M12", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M12" + }, + { + "name" : "hnswlib.M16", + "algo" : "hnswlib", + "build_param": {"M":16, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M16", + "search_params" : [ + {"ef":10, "numThreads":1}, 
+ {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M16" + }, + { + "name" : "hnswlib.M24", + "algo" : "hnswlib", + "build_param": {"M":24, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M24", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M24" + }, + { + "name" : "hnswlib.M36", + "algo" : "hnswlib", + "build_param": {"M":36, "efConstruction":500, "numThreads":32}, + "file" : "index/sift-128-euclidean/hnswlib/M36", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1}, + {"ef":60, "numThreads":1}, + {"ef":80, "numThreads":1}, + {"ef":120, "numThreads":1}, + {"ef":200, "numThreads":1}, + {"ef":400, "numThreads":1}, + {"ef":600, "numThreads":1}, + {"ef":800, "numThreads":1} + ], + "search_result_file" : "result/sift-128-euclidean/hnswlib/M36" + }, + + + + + { + "name": "raft_bfknn", + "algo": "raft_bfknn", + "build_param": {}, + "file": "index/sift-128-euclidean/raft_bfknn/bfknn", + "search_params": [ + { + "probe": 1 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_bfknn/bfknn" + }, + { + "name": "faiss_ivf_flat.nlist1024", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 1024 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + 
"nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist1024" + }, + { + "name": "faiss_ivf_flat.nlist2048", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 2048 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist2048", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist2048" + }, + { + "name": "faiss_ivf_flat.nlist4096", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 4096 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist4096", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist4096" + }, + { + "name": "faiss_ivf_flat.nlist8192", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 8192 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist8192", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist8192" + }, + { + "name": "faiss_ivf_flat.nlist16384", + "algo": "faiss_gpu_ivf_flat", + "build_param": { + "nlist": 16384 + }, + "file": "index/sift-128-euclidean/faiss_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + 
"nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_flat/nlist16384" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": true + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024" + }, + { + "name": "faiss_ivf_pq.M64-nlist1024.noprecomp", + "algo": "faiss_gpu_ivf_pq", + "build_param": { + "nlist": 1024, + "M": 64, + "useFloat16": true, + "usePrecomputed": false + }, + "file": "index/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024.noprecomp", + "search_params": [ + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_pq/M64-nlist1024" + }, + { + "name": "faiss_ivf_sq.nlist1024-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-fp16" + }, + { + "name": "faiss_ivf_sq.nlist2048-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + 
"nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-fp16" + }, + { + "name": "faiss_ivf_sq.nlist4096-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-fp16" + }, + { + "name": "faiss_ivf_sq.nlist8192-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-fp16" + }, + { + "name": "faiss_ivf_sq.nlist16384-fp16", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "fp16" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-fp16" + }, + { + "name": "faiss_ivf_sq.nlist1024-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 1024, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8", + 
"search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist1024-int8" + }, + { + "name": "faiss_ivf_sq.nlist2048-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 2048, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist2048-int8" + }, + { + "name": "faiss_ivf_sq.nlist4096-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 4096, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist4096-int8" + }, + { + "name": "faiss_ivf_sq.nlist8192-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 8192, + "quantizer_type": "int8" + }, + "file": "index/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist8192-int8" + }, + { + "name": "faiss_ivf_sq.nlist16384-int8", + "algo": "faiss_gpu_ivf_sq", + "build_param": { + "nlist": 16384, + "quantizer_type": "int8" + }, + "file": 
"index/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/faiss_ivf_sq/nlist16384-int8" + }, + { + "name": "faiss_flat", + "algo": "faiss_gpu_flat", + "build_param": {}, + "file": "index/sift-128-euclidean/faiss_flat/flat", + "search_params": [ + {} + ], + "search_result_file": "result/sift-128-euclidean/faiss_flat/flat" + }, + + { + "name": "raft_ivf_pq.dimpq128-cluster1024", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 1, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1, + 
"internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 5, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-float" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-half" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 
128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq64-cluster1024-float-half", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 64, + "ratio": 1, + "niter": 25 + }, + "file": 
"index/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "half" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq64-cluster1024-float-half" + }, + { + "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 32, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq32-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 16, + "ratio": 1, + "niter": 25 + }, + "file": 
"index/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "fp8" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq16-cluster1024-float-fp8" + }, + { + "name": "raft_ivf_pq.dimpq128-cluster1024-half-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 128, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "half", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq128-cluster1024-half-float" + }, + { + "name": "raft_ivf_pq.dimpq512-cluster1024-float-float", + "algo": "raft_ivf_pq", + "build_param": { + "nlist": 1024, + "pq_dim": 512, + "ratio": 1, + "niter": 25 + }, + "file": 
"index/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float", + "search_params": [ + { + "k": 10, + "numProbes": 10, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 50, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 100, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 200, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 500, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + }, + { + "k": 10, + "numProbes": 1024, + "internalDistanceDtype": "float", + "smemLutDtype": "float" + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_pq/dimpq512-cluster1024-float-float" + }, + { + "name": "raft_ivf_flat.nlist1024", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 1024, + "ratio": 1, + "niter": 25 + }, + "file": "index/sift-128-euclidean/raft_ivf_flat/nlist1024", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist1024" + }, + { + "name": "raft_ivf_flat.nlist16384", + "algo": "raft_ivf_flat", + "build_param": { + "nlist": 16384, + "ratio": 2, + "niter": 20 + }, + "file": "index/sift-128-euclidean/raft_ivf_flat/nlist16384", + "search_params": [ + { + "nprobe": 1 + }, + { + "nprobe": 5 + }, + { + "nprobe": 10 + }, + { + "nprobe": 50 + }, + { + "nprobe": 100 + }, + { + "nprobe": 200 + }, + { + "nprobe": 500 + }, + { + "nprobe": 1000 + }, + { + "nprobe": 2000 + } + ], + "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist16384" + } + ] +} diff --git a/cpp/bench/ann/scripts/eval.pl b/cpp/bench/ann/scripts/eval.pl new file mode 100755 index 0000000000..81c5563d79 --- /dev/null +++ 
b/cpp/bench/ann/scripts/eval.pl @@ -0,0 +1,430 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); +use File::Find; +use Getopt::Std; + +my $QPS = 'QPS'; +my $AVG_LATENCY = 'avg_latency(ms)'; +my $P99_LATENCY = 'p99_latency(ms)'; +my $P999_LATENCY = 'p999_latency(ms)'; +my @CONDITIONS = ([$QPS, 2000], ['recall', 0.9], ['recall', 0.95]); + + +my $USAGE = << 'END'; +usage: [-f] [-l avg|p99|p999] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -l: output search latency rather than QPS. Available options: + "avg" for average latency; + "p99" for 99th percentile latency; + "p999" for 99.9th percentile latency. + -o: also write result to a csv file +END + + +my %opt; +getopts('fl:o:', \%opt) + or die $USAGE; +my $force_calc_recall = exists $opt{f} ? 
1 : 0; +my $csv_file; +$csv_file = $opt{o} if exists $opt{o}; +my $metric = $QPS; +if (exists $opt{l}) { + my $option = $opt{l}; + if ($option eq 'avg') { + $metric = $AVG_LATENCY; + } + elsif ($option eq 'p99') { + $metric = $P99_LATENCY; + } + elsif ($option eq 'p999') { + $metric = $P999_LATENCY; + } + else { + die + "[error] illegal value for '-l': '$option'. Must be 'avg', 'p99' or 'p999'\n"; + } +} + +@ARGV >= 2 + or die $USAGE; + + +my $truth_file = shift @ARGV; +my ($k, $dataset, $distance, $results) = get_all_results($metric, @ARGV); +if (!defined $k) { + print STDERR "no result file found\n"; + exit -1; +} +print STDERR "dataset = $dataset, distance = $distance, k = $k\n\n"; +calc_missing_recall($results, $truth_file, $force_calc_recall); + +my @results = sort { + $a->{name} cmp $b->{name} + or $a->{recall} <=> $b->{recall} + or $b->{qps} <=> $a->{qps} +} @$results; +printf("%-60s %6s %16s %s\n", '', 'Recall', $metric, 'search_param'); +for my $result (@results) { + my $fmt = ($metric eq $QPS) ? 
'%16.1f' : '%16.3f'; + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; # the unit of latency is ms + printf("%-60s %6.4f ${fmt} %s\n", + $result->{name}, $result->{recall}, $qps, $result->{search_param}); +} +if (defined $csv_file) { + open my $fh, '>', $csv_file; + print {$fh} ",Recall,${metric},search_param\n"; + for my $result (@results) { + my $qps = $result->{qps}; + $qps *= 1000 if $metric ne $QPS; + printf {$fh} ( + "%s,%.4f,%.3f,%s\n", $result->{name}, $result->{recall}, + $qps, $result->{search_param} + ); + } +} +print "\n"; +calc_and_print_estimation($results, $metric, \@CONDITIONS); + + + + +sub read_result { + my ($fname) = @_; + open my $fh, '<', $fname; + my %attr; + while (<$fh>) { + chomp; + next if /^\s*$/; + my $pos = index($_, ':'); + $pos != -1 + or die "[error] no ':' is found: '$_'\n"; + my $key = substr($_, 0, $pos); + my $val = substr($_, $pos + 1); + $key =~ s/^\s+|\s+$//g; + $val =~ s/^\s+|\s+$//g; + + # old version benchmark compatible + if ($key eq 'search_time') { + $key = 'average_search_time'; + $val *= $attr{batch_size}; + } + $attr{$key} = $val; + } + return \%attr; +} + +sub overwrite_recall_to_result { + my ($fname, $recall) = @_; + open my $fh_in, '<', $fname; + $recall = sprintf("%f", $recall); + my $out; + while (<$fh_in>) { + s/^recall: .*/recall: $recall/; + $out .= $_; + } + close $fh_in; + + open my $fh_out, '>', $fname; + print {$fh_out} $out; +} + +sub append_recall_to_result { + my ($fname, $recall) = @_; + open my $fh, '>>', $fname; + printf {$fh} ("recall: %f\n", $recall); +} + +sub get_all_results { + my ($metric) = shift @_; + + my %fname; + my $wanted = sub { + if (-f && /\.txt$/) { + $fname{$File::Find::name} = 1; + } + }; + find($wanted, @_); + + my $k; + my $dataset; + my $distance; + my @results; + for my $f (sort keys %fname) { + print STDERR "reading $f ...\n"; + my $attr = read_result($f); + if (!defined $k) { + $k = $attr->{k}; + $dataset = $attr->{dataset}; + $distance = $attr->{distance}; + } 
+ else { + $attr->{k} eq $k + or die "[error] k should be $k, but is $attr->{k} in $f\n"; + $attr->{dataset} eq $dataset + or die + "[error] dataset should be $dataset, but is $attr->{dataset} in $f\n"; + $attr->{distance} eq $distance + or die + "[error] distance should be $distance, but is $attr->{distance} in $f\n"; + } + + my $batch_size = $attr->{batch_size}; + $batch_size =~ s/000000$/M/; + $batch_size =~ s/000$/K/; + my $search_param = $attr->{search_param}; + $search_param =~ s/^{//; + $search_param =~ s/}$//; + $search_param =~ s/,/ /g; + $search_param =~ s/"//g; + + my $qps; + if ($metric eq $QPS) { + $qps = $attr->{batch_size} / $attr->{average_search_time}; + } + elsif ($metric eq $AVG_LATENCY) { + $qps = $attr->{average_search_time}; + } + elsif ($metric eq $P99_LATENCY) { + exists $attr->{p99_search_time} + or die "[error] p99_search_time is not found\n"; + $qps = $attr->{p99_search_time}; + } + elsif ($metric eq $P999_LATENCY) { + exists $attr->{p999_search_time} + or die "[error] p999_search_time is not found\n"; + $qps = $attr->{p999_search_time}; + } + else { + die "[error] unknown latency type: '$metric'\n"; + } + my $result = { + file => $f, + name => "$attr->{name}-batch${batch_size}", + search_param => $search_param, + qps => $qps, + }; + + if (exists $attr->{recall}) { + $result->{recall} = $attr->{recall}; + } + push @results, $result; + } + return $k, $dataset, $distance, \@results; +} + +sub read_ibin { + my ($fname) = @_; + + open my $fh, '<:raw', $fname; + my $raw; + + read($fh, $raw, 8); + my ($nrows, $dim) = unpack('LL', $raw); + + my $expected_size = 8 + $nrows * $dim * 4; + my $size = (stat($fh))[7]; + $size == $expected_size + or die( + "[error] expected size is $expected_size, but actual size is $size\n"); + + read($fh, $raw, $nrows * $dim * 4) == $nrows * $dim * 4 + or die "[error] read $fname failed\n"; + my @data = unpack('l' x ($nrows * $dim), $raw); + return \@data, $nrows, $dim; +} + +sub pick_k_neighbors { + my ($neighbors, 
$nrows, $ncols, $k) = @_; + + my @res; + for my $i (0 .. $nrows - 1) { + my %neighbor_set; + for my $j (0 .. $k - 1) { + $neighbor_set{$neighbors->[$i * $ncols + $j]} = 1; + } + push @res, \%neighbor_set; + } + return \@res; +} + + +sub calc_recall { + my ($truth_k_neighbors, $result_neighbors, $nrows, $k) = @_; + + my $recall = 0; + for my $i (0 .. $nrows - 1) { + my $tp = 0; + for my $j (0 .. $k - 1) { + my $neighbor = $result_neighbors->[$i * $k + $j]; + ++$tp if exists $truth_k_neighbors->[$i]{$neighbor}; + } + $recall += $tp; + } + return $recall / $k / $nrows; +} + +sub calc_missing_recall { + my ($results, $truth_file, $force_calc_recall) = @_; + + my $need_calc_recall = grep { !exists $_->{recall} } @$results; + return unless $need_calc_recall || $force_calc_recall; + + my ($truth_neighbors, $nrows, $truth_k) = read_ibin($truth_file); + $truth_k >= $k + or die "[error] ground truth k ($truth_k) < k($k)\n"; + my $truth_k_neighbors = + pick_k_neighbors($truth_neighbors, $nrows, $truth_k, $k); + + for my $result (@$results) { + next if exists $result->{recall} && !$force_calc_recall; + + my $result_bin_file = $result->{file}; + $result_bin_file =~ s/txt$/ibin/; + print STDERR "calculating recall for $result_bin_file ...\n"; + my ($result_neighbors, $result_nrows, $result_k) = + read_ibin($result_bin_file); + $result_k == $k + or die + "[error] k should be $k, but is $result_k in $result_bin_file\n"; + $result_nrows == $nrows + or die + "[error] #row should be $nrows, but is $result_nrows in $result_bin_file\n"; + + my $recall = + calc_recall($truth_k_neighbors, $result_neighbors, $nrows, $k); + if (exists $result->{recall}) { + my $new_value = sprintf("%f", $recall); + if ($result->{recall} ne $new_value) { + print "update recall: $result->{recall} -> $new_value\n"; + overwrite_recall_to_result($result->{file}, $recall); + } + } + else { + append_recall_to_result($result->{file}, $recall); + } + $result->{recall} = $recall; + } +} + + +sub estimate { + my 
($results, $condition, $value) = @_; + my %point_of; + for my $result (@$results) { + my $point; + if ($condition eq 'recall') { + $point = [$result->{recall}, $result->{qps}]; + } + else { + $point = [$result->{qps}, $result->{recall}]; + } + push @{$point_of{$result->{name}}}, $point; + } + + my @names = sort keys %point_of; + my @result; + for my $name (@names) { + my @points = sort { $a->[0] <=> $b->[0] } @{$point_of{$name}}; + if ($value < $points[0][0] || $value > $points[$#points][0]) { + push @result, -1; + next; + } + elsif ($value == $points[0][0]) { + push @result, $points[0][1]; + next; + } + + for my $i (1 .. $#points) { + if ($points[$i][0] >= $value) { + push @result, + linear_interpolation($value, @{$points[$i - 1]}, + @{$points[$i]}); + last; + } + } + } + return \@names, \@result; +} + +sub linear_interpolation { + my ($x, $x1, $y1, $x2, $y2) = @_; + return $y1 + ($x - $x1) * ($y2 - $y1) / ($x2 - $x1); +} + +sub merge { + my ($all, $new, $scale) = @_; + @$all == @$new + or die "[error] length is not equal\n"; + for my $i (0 .. @$all - 1) { + push @{$all->[$i]}, $new->[$i] * $scale; + } +} + +sub calc_and_print_estimation { + my ($results, $metric, $conditions) = @_; + + my @conditions = grep { + my $target = $_->[0]; + if ($target eq 'recall' || $target eq $metric) { + 1; + } + else { + $target eq $QPS + || $target eq $AVG_LATENCY + || $target eq $P99_LATENCY + || $target eq $P999_LATENCY + or die "[error] unknown condition: '$target'\n"; + 0; + } + } @$conditions; + + my @headers = map { + my $header; + if ($_->[0] eq 'recall') { + $header = $metric . '@recall' . $_->[1]; + } + elsif ($_->[0] eq $metric) { + $header = 'recall@' . $metric . $_->[1]; + } + $header; + } @conditions; + + my $scale = ($metric eq $QPS) ? 
1 : 1000; + my $estimations; + for my $condition (@conditions) { + my ($names, $estimate) = estimate($results, @$condition); + if (!defined $estimations) { + @$estimations = map { [$_] } @$names; + } + merge($estimations, $estimate, $scale); + } + + my $fmt = "%-60s" . (" %16s" x @headers) . "\n"; + printf($fmt, '', @headers); + $fmt =~ s/16s/16.4f/g; + for (@$estimations) { + printf($fmt, @$_); + } +} diff --git a/cpp/bench/ann/scripts/fbin_to_f16bin.py b/cpp/bench/ann/scripts/fbin_to_f16bin.py new file mode 100755 index 0000000000..4ea8988d87 --- /dev/null +++ b/cpp/bench/ann/scripts/fbin_to_f16bin.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import sys +import numpy as np + + +def read_fbin(fname): + shape = np.fromfile(fname, dtype=np.uint32, count=2) + if float(shape[0]) * shape[1] * 4 > 2000000000: + data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( + shape + ) + else: + data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + return data + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if len(sys.argv) != 3: + print( + "usage: %s input.fbin output.f16bin" % (sys.argv[0]), + file=sys.stderr, + ) + sys.exit(-1) + +data = read_fbin(sys.argv[1]).astype(np.float16) +write_bin(sys.argv[2], data) diff --git a/cpp/bench/ann/scripts/hdf5_to_fbin.py b/cpp/bench/ann/scripts/hdf5_to_fbin.py new file mode 100755 index 0000000000..cfeb184ea8 --- /dev/null +++ b/cpp/bench/ann/scripts/hdf5_to_fbin.py @@ -0,0 +1,85 @@ +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +import sys +import numpy as np +import h5py + + +def normalize(x): + norm = np.linalg.norm(x, axis=1) + return (x.T / norm).T + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if __name__ == "__main__": + if len(sys.argv) != 2 and len(sys.argv) != 3: + print( + "usage: %s [-n] <input>.hdf5\n" % (sys.argv[0]), + " -n: normalize base/query set\n", + "outputs: <input>.base.fbin\n", + " <input>.query.fbin\n", + " <input>.groundtruth.neighbors.ibin\n", + " <input>.groundtruth.distances.fbin", + file=sys.stderr, + ) + sys.exit(-1) + + need_normalize = False + if len(sys.argv) == 3: + assert sys.argv[1] == "-n" + need_normalize = True + fname_prefix = sys.argv[-1] + assert fname_prefix.endswith(".hdf5") + fname_prefix = fname_prefix[:-5] + + hdf5 = h5py.File(sys.argv[-1], "r") + assert ( + hdf5.attrs["distance"] == "angular" + or hdf5.attrs["distance"] == "euclidean" + ) + assert hdf5["train"].dtype == np.float32 + assert hdf5["test"].dtype == np.float32 + assert hdf5["neighbors"].dtype == np.int32 + assert hdf5["distances"].dtype == np.float32 + + base = hdf5["train"][:] + query = hdf5["test"][:] + if need_normalize: + base = normalize(base) + query = normalize(query) + elif hdf5.attrs["distance"] == "angular": + print( + "warning: input has angular distance, specify -n to normalize base/query set!\n" + ) + + output_fname = fname_prefix + ".base.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, base) + + output_fname = fname_prefix + ".query.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, query) + + output_fname = fname_prefix + ".groundtruth.neighbors.ibin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["neighbors"][:]) + + output_fname = fname_prefix + ".groundtruth.distances.fbin" + print("writing", output_fname, "...") + write_bin(output_fname, hdf5["distances"][:]) diff --git a/cpp/bench/ann/scripts/split_groundtruth.pl 
b/cpp/bench/ann/scripts/split_groundtruth.pl new file mode 100755 index 0000000000..b0a59f806c --- /dev/null +++ b/cpp/bench/ann/scripts/split_groundtruth.pl @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# ============================================================================= +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +use warnings; +use strict; +use autodie qw(open close); + + +@ARGV == 2 + or die "usage: $0 input output_prefix\n"; + +open my $fh, '<:raw', $ARGV[0]; + +my $raw; +read($fh, $raw, 8); +my ($nrows, $dim) = unpack('LL', $raw); + +my $expected_size = 8 + $nrows * $dim * (4 + 4); +my $size = (stat($fh))[7]; +$size == $expected_size + or die("error: expected size is $expected_size, but actual size is $size\n"); + + +open my $fh_out1, '>:raw', "$ARGV[1].neighbors.ibin"; +open my $fh_out2, '>:raw', "$ARGV[1].distances.fbin"; + +print {$fh_out1} $raw; +print {$fh_out2} $raw; + +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out1} $raw; +read($fh, $raw, $nrows * $dim * 4); +print {$fh_out2} $raw; diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp new file mode 100644 index 0000000000..8f73896e07 --- /dev/null +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -0,0 +1,88 @@ + + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include + +#include + +namespace raft::bench::ann { + +enum class Metric { + kInnerProduct, + kEuclidean, +}; + +enum class MemoryType { + Host, + HostMmap, + Device, +}; + +struct AlgoProperty { + MemoryType dataset_memory_type; + // neighbors/distances should have same memory type as queries + MemoryType query_memory_type; + bool need_dataset_when_search; +}; + +template +class ANN { + public: + struct AnnSearchParam { + virtual ~AnnSearchParam() = default; + }; + + ANN(Metric metric, int dim) : metric_(metric), dim_(dim) {} + virtual ~ANN() = default; + + virtual void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) = 0; + + virtual void set_search_param(const AnnSearchParam& param) = 0; + // TODO: this assumes that an algorithm can always return k results. + // This is not always possible. + virtual void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const = 0; + + virtual void save(const std::string& file) const = 0; + virtual void load(const std::string& file) = 0; + + virtual AlgoProperty get_property() const = 0; + + // Some algorithms don't save the building dataset in their indices. + // So they should be given the access to that dataset during searching. + // The advantage of this way is that index has smaller size + // and many indices can share one dataset. + // + // AlgoProperty::need_dataset_when_search of such algorithm should be true, + // and set_search_dataset() should save the passed-in pointer somewhere. 
+ // The client code should call set_search_dataset() before searching, + // and should not release dataset before searching is finished. + virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){}; + + protected: + Metric metric_; + int dim_; +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp new file mode 100644 index 0000000000..b4d8fbeee3 --- /dev/null +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifdef NVTX +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "benchmark_util.hpp" +#include "conf.h" +#include "dataset.h" +#include "util.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::string; +using std::to_string; +using std::unordered_set; +using std::vector; + +namespace raft::bench::ann { + +inline bool check_file_exist(const std::vector& files) +{ + bool ret = true; + std::unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) == processed.end() && !file_exists(file)) { + log_error("file '%s' doesn't exist or is not a regular file", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +inline bool check_file_not_exist(const std::vector& files, bool force_overwrite) +{ + bool ret = true; + for (const auto& file : files) { + if (file_exists(file)) { + if (force_overwrite) { + log_warn("'%s' already exists, will overwrite it", file.c_str()); + } else { + log_error("'%s' already exists, use '-f' to force overwriting", file.c_str()); + ret = false; + } + } + } + return ret; +} + +inline bool check_no_duplicate_file(const std::vector& files) +{ + bool ret = true; + std::unordered_set processed; + for (const auto& file : files) { + if (processed.find(file) != processed.end()) { + log_error("'%s' occurs more than once as output file, would be overwritten", file.c_str()); + ret = false; + } + processed.insert(file); + } + return ret; +} + +inline bool mkdir(const std::vector& dirs) +{ + std::unordered_set processed; + for (const auto& dir : dirs) { + if (processed.find(dir) == processed.end() && !dir_exists(dir)) { + if (create_dir(dir)) { + log_info("mkdir '%s'", dir.c_str()); + } else { + log_error("fail to create output directory '%s'", dir.c_str()); + // won't create any other dir when problem occurs + return false; + } + } + processed.insert(dir); + } + return true; +} + 
+inline bool check(const std::vector& indices, + bool build_mode, + bool force_overwrite) +{ + std::vector files_should_exist; + std::vector dirs_should_exist; + std::vector output_files; + for (const auto& index : indices) { + if (build_mode) { + output_files.push_back(index.file); + output_files.push_back(index.file + ".txt"); + + auto pos = index.file.rfind('/'); + if (pos != std::string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } + } else { + files_should_exist.push_back(index.file); + files_should_exist.push_back(index.file + ".txt"); + + output_files.push_back(index.search_result_file + ".0.ibin"); + output_files.push_back(index.search_result_file + ".0.txt"); + + auto pos = index.search_result_file.rfind('/'); + if (pos != std::string::npos) { + dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); + } + } + } + + bool ret = true; + if (!check_file_exist(files_should_exist)) { ret = false; } + if (!check_file_not_exist(output_files, force_overwrite)) { ret = false; } + if (!check_no_duplicate_file(output_files)) { ret = false; } + if (ret && !mkdir(dirs_should_exist)) { ret = false; } + return ret; +} + +inline void write_build_info(const std::string& file_prefix, + const std::string& dataset, + const std::string& distance, + const std::string& name, + const std::string& algo, + const std::string& build_param, + float build_time) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "build_time: " << build_time << endl; + ofs.close(); + if (!ofs) { throw std::runtime_error("can't write to build info file: " + file_prefix + ".txt"); } +} + +template +void build(const Dataset* dataset, const std::vector& indices) +{ + cudaStream_t stream; + 
RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + log_info( + "base set from dataset '%s', #vector = %zu", dataset->name().c_str(), dataset->base_set_size()); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + + log_info("building index '%s'", index.name.c_str()); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#ifdef NVTX + nvtxRangePush("build"); +#endif + Timer timer; + algo->build(base_set_ptr, dataset->base_set_size(), stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + log_info("built index in %.2f seconds", elapsed_ms / 1000.0f); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + algo->save(index.file); + write_build_info(index.file, + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + elapsed_ms / 1000.0f); + log_info("saved index to %s", index.file.c_str()); + } + + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); +} + +inline void write_search_result(const std::string& file_prefix, + const std::string& dataset, + const std::string& distance, + const std::string& name, + const std::string& algo, + const std::string& 
build_param, + const std::string& search_param, + int batch_size, + int run_count, + int k, + float search_time_average, + float search_time_p99, + float search_time_p999, + const int* neighbors, + size_t query_set_size) +{ + std::ofstream ofs(file_prefix + ".txt"); + if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } + ofs << "dataset: " << dataset << "\n" + << "distance: " << distance << "\n" + << "\n" + << "name: " << name << "\n" + << "algo: " << algo << "\n" + << "build_param: " << build_param << "\n" + << "search_param: " << search_param << "\n" + << "\n" + << "batch_size: " << batch_size << "\n" + << "run_count: " << run_count << "\n" + << "k: " << k << "\n" + << "average_search_time: " << search_time_average << endl; + if (search_time_p99 != std::numeric_limits::max()) { + ofs << "p99_search_time: " << search_time_p99 << endl; + } + if (search_time_p999 != std::numeric_limits::max()) { + ofs << "p999_search_time: " << search_time_p999 << endl; + } + ofs.close(); + if (!ofs) { + throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); + } + + BinFile neighbors_file(file_prefix + ".ibin", "w"); + neighbors_file.write(neighbors, query_set_size, k); +} + +template +inline void search(const Dataset* dataset, const std::vector& indices) +{ + if (indices.empty()) { return; } + cudaStream_t stream; + RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + + log_info("loading query set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->query_set_size()); + const T* query_set = dataset->query_set(); + // query set is usually much smaller than base set, so load it eagerly + const T* d_query_set = dataset->query_set_on_gpu(); + size_t query_set_size = dataset->query_set_size(); + + // currently all indices has same batch_size, k and run_count + const int batch_size = indices[0].batch_size; + const int k = indices[0].k; + const int run_count = indices[0].run_count; + log_info( + 
"basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); + if (query_set_size % batch_size != 0) { + log_warn("query set size (%zu) % batch size (%d) != 0, the size of last batch is %zu", + query_set_size, + batch_size, + query_set_size % batch_size); + } + const size_t num_batches = (query_set_size - 1) / batch_size + 1; + std::size_t* neighbors = new std::size_t[query_set_size * k]; + int* neighbors_buf = new int[query_set_size * k]; + float* distances = new float[query_set_size * k]; + std::vector search_times; + search_times.reserve(num_batches); + std::size_t* d_neighbors; + float* d_distances; + RAFT_CUDA_TRY(cudaMalloc((void**)&d_neighbors, query_set_size * k * sizeof(*d_neighbors))); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_distances, query_set_size * k * sizeof(*d_distances))); + + for (const auto& index : indices) { + log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); + auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + auto algo_property = algo->get_property(); + + log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); + algo->load(index.file); + + const T* this_query_set = query_set; + std::size_t* this_neighbors = neighbors; + float* this_distances = distances; + if (algo_property.query_memory_type == MemoryType::Device) { + this_query_set = d_query_set; + this_neighbors = d_neighbors; + this_distances = d_distances; + } + + if (algo_property.need_dataset_when_search) { + log_info("loading base set from dataset '%s', #vector = %zu", + dataset->name().c_str(), + dataset->base_set_size()); + const T* base_set_ptr = nullptr; + if (algo_property.dataset_memory_type == MemoryType::Host) { + log_info("%s", "loading base set to memory"); + base_set_ptr = dataset->base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::HostMmap) { + 
log_info("%s", "mapping base set to memory"); + base_set_ptr = dataset->mapped_base_set(); + } else if (algo_property.dataset_memory_type == MemoryType::Device) { + log_info("%s", "loading base set to GPU"); + base_set_ptr = dataset->base_set_on_gpu(); + } + algo->set_search_dataset(base_set_ptr, dataset->base_set_size()); + } + + for (int i = 0, end_i = index.search_params.size(); i != end_i; ++i) { + auto p_param = create_search_param(index.algo, index.search_params[i]); + algo->set_search_param(*p_param); + log_info("search with param: %s", index.search_params[i].dump().c_str()); + + if (algo_property.query_memory_type == MemoryType::Device) { + RAFT_CUDA_TRY(cudaMemset(d_neighbors, 0, query_set_size * k * sizeof(*d_neighbors))); + RAFT_CUDA_TRY(cudaMemset(d_distances, 0, query_set_size * k * sizeof(*d_distances))); + } else { + memset(neighbors, 0, query_set_size * k * sizeof(*neighbors)); + memset(distances, 0, query_set_size * k * sizeof(*distances)); + } + + float best_search_time_average = std::numeric_limits::max(); + float best_search_time_p99 = std::numeric_limits::max(); + float best_search_time_p999 = std::numeric_limits::max(); + for (int run = 0; run < run_count; ++run) { + log_info("run %d / %d", run + 1, run_count); + for (std::size_t batch_id = 0; batch_id < num_batches; ++batch_id) { + std::size_t row = batch_id * batch_size; + int actual_batch_size = (batch_id == num_batches - 1) ? 
query_set_size - row : batch_size; + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#ifdef NVTX + string nvtx_label = "batch" + to_string(batch_id); + if (run_count != 1) { nvtx_label = "run" + to_string(run) + "-" + nvtx_label; } + if (batch_id == 10) { + run = run_count - 1; + break; + } +#endif + Timer timer; +#ifdef NVTX + nvtxRangePush(nvtx_label.c_str()); +#endif + algo->search(this_query_set + row * dataset->dim(), + actual_batch_size, + k, + this_neighbors + row * k, + this_distances + row * k, + stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + float elapsed_ms = timer.elapsed_ms(); +#ifdef NVTX + nvtxRangePop(); +#endif + // If the size of the last batch is less than batch_size, don't count it for + // search time. But neighbors of the last batch will still be filled, so it's + // counted for recall calculation. + if (actual_batch_size == batch_size) { + search_times.push_back(elapsed_ms / 1000.0f); // in seconds + } + } + + float search_time_average = + std::accumulate(search_times.cbegin(), search_times.cend(), 0.0f) / search_times.size(); + best_search_time_average = std::min(best_search_time_average, search_time_average); + + if (search_times.size() >= 100) { + std::sort(search_times.begin(), search_times.end()); + + auto calc_percentile_pos = [](float percentile, size_t N) { + return static_cast(std::ceil(percentile / 100.0 * N)) - 1; + }; + + float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; + best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); + + if (search_times.size() >= 1000) { + float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; + best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); + } + } + search_times.clear(); + } + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + if (algo_property.query_memory_type == MemoryType::Device) { + RAFT_CUDA_TRY(cudaMemcpy(neighbors, + d_neighbors, + 
query_set_size * k * sizeof(*d_neighbors), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(distances, + d_distances, + query_set_size * k * sizeof(*d_distances), + cudaMemcpyDeviceToHost)); + } + + for (size_t j = 0; j < query_set_size * k; ++j) { + neighbors_buf[j] = neighbors[j]; + } + write_search_result(index.search_result_file + "." + to_string(i), + dataset->name(), + dataset->distance(), + index.name, + index.algo, + index.build_param.dump(), + index.search_params[i].dump(), + batch_size, + index.run_count, + k, + best_search_time_average, + best_search_time_p99, + best_search_time_p999, + neighbors_buf, + query_set_size); + } + + log_info("finish searching for index '%s'", index.name.c_str()); + } + + delete[] neighbors; + delete[] neighbors_buf; + delete[] distances; + RAFT_CUDA_TRY(cudaFree(d_neighbors)); + RAFT_CUDA_TRY(cudaFree(d_distances)); + RAFT_CUDA_TRY(cudaStreamDestroy(stream)); +} + +inline const std::string usage(const string& argv0) +{ + return "usage: " + argv0 + " -b|s [-c] [-f] [-i index_names] conf.json\n" + + " -b: build mode, will build index\n" + + " -s: search mode, will search using built index\n" + + " one and only one of -b and -s should be specified\n" + + " -c: just check command line options and conf.json are sensible\n" + + " won't build or search\n" + " -f: force overwriting existing output files\n" + + " -i: by default will build/search all the indices found in conf.json\n" + + " '-i' can be used to select a subset of indices\n" + + " 'index_names' is a list of comma-separated index names\n" + + " '*' is allowed as the last character of a name to select all matched indices\n" + + " for example, -i \"hnsw1,hnsw2,faiss\" or -i \"hnsw*,faiss\""; +} + +template +inline int dispatch_benchmark(Configuration& conf, + std::string& index_patterns, + bool force_overwrite, + bool only_check, + bool build_mode, + bool search_mode) +{ + try { + auto dataset_conf = conf.get_dataset_conf(); + + BinDataset dataset(dataset_conf.name, + 
dataset_conf.base_file, + dataset_conf.subset_first_row, + dataset_conf.subset_size, + dataset_conf.query_file, + dataset_conf.distance); + + vector indices = conf.get_indices(index_patterns); + if (!check(indices, build_mode, force_overwrite)) { return -1; } + + std::string message = "will "; + message += build_mode ? "build:" : "search:"; + for (const auto& index : indices) { + message += "\n " + index.name; + } + log_info("%s", message.c_str()); + + if (only_check) { + log_info("%s", "all check passed, quit due to option -c"); + return 0; + } + + if (build_mode) { + build(&dataset, indices); + } else if (search_mode) { + search(&dataset, indices); + } + } catch (const std::exception& e) { + log_error("exception occurred: %s", e.what()); + return -1; + } + + return 0; +} + +inline int run_main(int argc, char** argv) +{ + bool force_overwrite = false; + bool build_mode = false; + bool search_mode = false; + bool only_check = false; + std::string index_patterns("*"); + + int opt; + while ((opt = getopt(argc, argv, "bscfi:h")) != -1) { + switch (opt) { + case 'b': build_mode = true; break; + case 's': search_mode = true; break; + case 'c': only_check = true; break; + case 'f': force_overwrite = true; break; + case 'i': index_patterns = optarg; break; + case 'h': cout << usage(argv[0]) << endl; return -1; + default: cerr << "\n" << usage(argv[0]) << endl; return -1; + } + } + if (build_mode == search_mode) { + std::cerr << "one and only one of -b and -s should be specified\n\n" << usage(argv[0]) << endl; + return -1; + } + if (argc - optind != 1) { + std::cerr << usage(argv[0]) << endl; + return -1; + } + string conf_file = argv[optind]; + + std::ifstream conf_stream(conf_file.c_str()); + if (!conf_stream) { + log_error("can't open configuration file: %s", argv[optind]); + return -1; + } + + try { + Configuration conf(conf_stream); + std::string dtype = conf.get_dataset_conf().dtype; + + if (dtype == "float") { + return dispatch_benchmark( + conf, index_patterns, 
force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "uint8") { + return dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else if (dtype == "int8") { + return dispatch_benchmark( + conf, index_patterns, force_overwrite, only_check, build_mode, search_mode); + } else { + log_error("datatype %s not supported", dtype); + } + + } catch (const std::exception& e) { + log_error("exception occurred: %s", e.what()); + return -1; + } + + return -1; +} +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/benchmark_util.hpp b/cpp/bench/ann/src/common/benchmark_util.hpp new file mode 100644 index 0000000000..7005883ffc --- /dev/null +++ b/cpp/bench/ann/src/common/benchmark_util.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "ann_types.hpp" +#include + +namespace raft::bench::ann { + +inline Metric parse_metric(const std::string& metric_str) +{ + if (metric_str == "inner_product") { + return raft::bench::ann::Metric::kInnerProduct; + } else if (metric_str == "euclidean") { + return raft::bench::ann::Metric::kEuclidean; + } else { + throw std::runtime_error("invalid metric: '" + metric_str + "'"); + } +} +}; // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/common/conf.cpp b/cpp/bench/ann/src/common/conf.cpp new file mode 100644 index 0000000000..f690f68783 --- /dev/null +++ b/cpp/bench/ann/src/common/conf.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "conf.h" + +#include +#include +#include +#include +#include + +#include "util.h" + +namespace raft::bench::ann { +using std::runtime_error; +using std::string; +using std::unordered_set; +using std::vector; + +Configuration::Configuration(std::istream& conf_stream) +{ + // to enable comments in json + auto conf = nlohmann::json::parse(conf_stream, nullptr, true, true); + + parse_dataset_(conf.at("dataset")); + parse_index_(conf.at("index"), conf.at("search_basic_param")); +} + +vector Configuration::get_indices(const string& patterns) const +{ + vector names; + for (const auto& index : indices_) { + names.push_back(index.name); + } + + auto matched = match_(names, patterns); + if (matched.empty()) { throw runtime_error("no available index matches '" + patterns + "'"); } + + vector res; + for (const auto& index : indices_) { + if (matched.find(index.name) != matched.end()) { res.push_back(index); } + } + return res; +} + +void Configuration::parse_dataset_(const nlohmann::json& conf) +{ + dataset_conf_.name = conf.at("name"); + dataset_conf_.base_file = conf.at("base_file"); + dataset_conf_.query_file = conf.at("query_file"); + dataset_conf_.distance = conf.at("distance"); + + if (conf.contains("subset_first_row")) { + dataset_conf_.subset_first_row = conf.at("subset_first_row"); + } + if (conf.contains("subset_size")) { dataset_conf_.subset_size = conf.at("subset_size"); } + + if (conf.contains("dtype")) { + dataset_conf_.dtype = conf.at("dtype"); + } else { + auto filename = dataset_conf_.base_file; + if (!filename.compare(filename.size() - 4, 4, "fbin")) { + dataset_conf_.dtype = "float"; + } else if (!filename.compare(filename.size() - 5, 5, "u8bin")) { + dataset_conf_.dtype = "uint8"; + } else if (!filename.compare(filename.size() - 5, 5, "i8bin")) { + dataset_conf_.dtype = "int8"; + } else { + log_error("Could not determine data type of the dataset"); + } + } +} + +void Configuration::parse_index_(const nlohmann::json& index_conf, + const 
nlohmann::json& search_basic_conf) +{ + const int batch_size = search_basic_conf.at("batch_size"); + const int k = search_basic_conf.at("k"); + const int run_count = search_basic_conf.at("run_count"); + + for (const auto& conf : index_conf) { + Index index; + index.name = conf.at("name"); + index.algo = conf.at("algo"); + index.build_param = conf.at("build_param"); + index.file = conf.at("file"); + index.batch_size = batch_size; + index.k = k; + index.run_count = run_count; + + if (conf.contains("multigpu")) { + for (auto it : conf.at("multigpu")) { + index.dev_list.push_back(it); + } + if (index.dev_list.empty()) { throw std::runtime_error("dev_list shouln't be empty!"); } + index.dev_list.shrink_to_fit(); + index.build_param["multigpu"] = conf["multigpu"]; + } + + if (conf.contains("refine_ratio")) { + float refine_ratio = conf.at("refine_ratio"); + if (refine_ratio <= 1.0f) { + throw runtime_error("'" + index.name + "': refine_ratio should > 1.0"); + } + index.refine_ratio = refine_ratio; + } + + for (const auto& param : conf.at("search_params")) { + index.search_params.push_back(param); + } + index.search_result_file = conf.at("search_result_file"); + + indices_.push_back(index); + } +} + +unordered_set Configuration::match_(const vector& candidates, + const string& patterns) const +{ + unordered_set matched; + for (const auto& pat : split(patterns, ',')) { + if (pat.empty()) { continue; } + + if (pat.back() == '*') { + auto len = pat.size() - 1; + for (const auto& item : candidates) { + if (item.compare(0, len, pat, 0, len) == 0) { matched.insert(item); } + } + } else { + for (const auto& item : candidates) { + if (item == pat) { matched.insert(item); } + } + } + } + + return matched; +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/conf.h b/cpp/bench/ann/src/common/conf.h new file mode 100644 index 0000000000..845defe94a --- /dev/null +++ b/cpp/bench/ann/src/common/conf.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +class Configuration { + public: + struct Index { + std::string name; + std::string algo; + nlohmann::json build_param; + std::string file; + std::vector dev_list; + + int batch_size; + int k; + int run_count; + std::vector search_params; + std::string search_result_file; + float refine_ratio{0.0f}; + }; + + struct DatasetConf { + std::string name; + std::string base_file; + // use only a subset of base_file, + // the range of rows is [subset_first_row, subset_first_row + subset_size) + // however, subset_size = 0 means using all rows after subset_first_row + // that is, the subset is [subset_first_row, #rows in base_file) + size_t subset_first_row{0}; + size_t subset_size{0}; + std::string query_file; + std::string distance; + + // data type of input dataset, possible values ["float", "int8", "uint8"] + std::string dtype; + }; + + Configuration(std::istream& conf_stream); + + DatasetConf get_dataset_conf() const { return dataset_conf_; } + std::vector get_indices(const std::string& patterns) const; + + private: + void parse_dataset_(const nlohmann::json& conf); + void parse_index_(const nlohmann::json& index_conf, const nlohmann::json& search_basic_conf); + std::unordered_set match_(const std::vector& candidates, + const std::string& patterns) const; + + DatasetConf 
dataset_conf_; + std::vector indices_; +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h new file mode 100644 index 0000000000..1244935c99 --- /dev/null +++ b/cpp/bench/ann/src/common/dataset.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::ann { + +// http://big-ann-benchmarks.com/index.html: +// binary format that starts with 8 bytes of data consisting of num_points(uint32_t) +// num_dimensions(uint32) followed by num_pts x num_dimensions x sizeof(type) bytes of +// data stored one vector after another. +// Data files will have suffixes .fbin, .u8bin, and .i8bin to represent float32, uint8 +// and int8 type data. +// As extensions for this benchmark, half and int data files will have suffixes .f16bin +// and .ibin, respectively. 
+template +class BinFile { + public: + BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row = 0, + uint32_t subset_size = 0); + ~BinFile() { fclose(fp_); } + BinFile(const BinFile&) = delete; + BinFile& operator=(const BinFile&) = delete; + + void get_shape(size_t* nrows, int* ndims) + { + assert(read_mode_); + *nrows = nrows_; + *ndims = ndims_; + } + + void read(T* data) const + { + assert(read_mode_); + size_t total = static_cast(nrows_) * ndims_; + if (fread(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fread() BinFile " + file_ + " failed"); + } + } + + void write(const T* data, uint32_t nrows, uint32_t ndims) + { + assert(!read_mode_); + if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + if (fwrite(&ndims, sizeof(uint32_t), 1, fp_) != 1) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + + size_t total = static_cast(nrows) * ndims; + if (fwrite(data, sizeof(T), total, fp_) != total) { + throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); + } + } + + void* map() const + { + assert(read_mode_); + int fid = fileno(fp_); + auto mmap_ptr = mmap(NULL, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mmap_ptr == MAP_FAILED) { + throw std::runtime_error("mmap error: Value of errno " + std::to_string(errno) + ", " + + std::string(strerror(errno))); + } + return mmap_ptr; + } + + void unmap(void* data) const + { + if (munmap(data, file_size_) == -1) { + throw std::runtime_error("munmap error: " + std::string(strerror(errno))); + } + } + + private: + void check_suffix_(); + + std::string file_; + FILE* fp_; + bool read_mode_; + uint32_t nrows_; + uint32_t ndims_; + size_t file_size_; +}; + +template +BinFile::BinFile(const std::string& file, + const std::string& mode, + uint32_t subset_first_row, + uint32_t subset_size) + : file_(file) +{ + check_suffix_(); + + if (mode == "r") { + read_mode_ = 
true; + } else if (mode == "w") { + read_mode_ = false; + if (subset_first_row != 0) { + throw std::runtime_error("subset_first_row should be zero for write mode"); + } + if (subset_size != 0) { throw std::runtime_error("subset_size should be zero for write mode"); } + } else { + throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); + } + + fp_ = fopen(file_.c_str(), mode.c_str()); + if (!fp_) { throw std::runtime_error("open BinFile failed: " + file_); } + + if (read_mode_) { + struct stat statbuf; + if (stat(file_.c_str(), &statbuf) != 0) { throw std::runtime_error("stat() failed: " + file_); } + file_size_ = statbuf.st_size; + + uint32_t header[2]; + if (fread(header, sizeof(uint32_t), 2, fp_) != 2) { + throw std::runtime_error("read header of BinFile failed: " + file_); + } + nrows_ = header[0]; + ndims_ = header[1]; + + size_t expected_file_size = + 2 * sizeof(uint32_t) + static_cast(nrows_) * ndims_ * sizeof(T); + if (file_size_ != expected_file_size) { + throw std::runtime_error("expected file size of " + file_ + " is " + + std::to_string(expected_file_size) + ", however, actual size is " + + std::to_string(file_size_)); + } + + if (subset_first_row >= nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") >= nrows (" + std::to_string(nrows_) + ")"); + } + if (subset_first_row + subset_size > nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + ") + subset_size (" + std::to_string(subset_size) + ") > nrows (" + + std::to_string(nrows_) + ")"); + } + + if (subset_first_row) { + static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); + if (fseek(fp_, sizeof(T) * subset_first_row * ndims_, SEEK_CUR) == -1) { + throw std::runtime_error(file_ + ": fseek failed"); + } + nrows_ -= subset_first_row; + } + if (subset_size) { nrows_ = subset_size; } + } +} + +template +void BinFile::check_suffix_() +{ + auto pos = 
file_.rfind('.'); + if (pos == std::string::npos) { + throw std::runtime_error("name of BinFile doesn't have a suffix: " + file_); + } + std::string suffix = file_.substr(pos + 1); + + if constexpr (std::is_same_v) { + if (suffix != "fbin") { + throw std::runtime_error("BinFile should has .fbin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "f16bin") { + throw std::runtime_error("BinFile should has .f16bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "ibin") { + throw std::runtime_error("BinFile should has .ibin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "u8bin") { + throw std::runtime_error("BinFile should has .u8bin suffix: " + file_); + } + } else if constexpr (std::is_same_v) { + if (suffix != "i8bin") { + throw std::runtime_error("BinFile should has .i8bin suffix: " + file_); + } + } else { + throw std::runtime_error( + "T of BinFile should be one of float, half, int, uint8_t, or int8_t"); + } +} + +template +class Dataset { + public: + Dataset(const std::string& name) : name_(name) {} + Dataset(const std::string& name, const std::string& distance) : name_(name), distance_(distance) + { + } + Dataset(const Dataset&) = delete; + Dataset& operator=(const Dataset&) = delete; + virtual ~Dataset(); + + std::string name() const { return name_; } + std::string distance() const { return distance_; } + int dim() const { return dim_; } + size_t base_set_size() const { return base_set_size_; } + size_t query_set_size() const { return query_set_size_; } + + // load data lazily, so don't pay the overhead of reading unneeded set + // e.g. 
don't load base set when searching + const T* base_set() const + { + if (!base_set_) { load_base_set_(); } + return base_set_; + } + + const T* query_set() const + { + if (!query_set_) { load_query_set_(); } + return query_set_; + } + + const T* base_set_on_gpu() const; + const T* query_set_on_gpu() const; + const T* mapped_base_set() const; + + protected: + virtual void load_base_set_() const = 0; + virtual void load_query_set_() const = 0; + virtual void map_base_set_() const = 0; + + std::string name_; + std::string distance_; + int dim_; + size_t base_set_size_; + size_t query_set_size_; + + mutable T* base_set_ = nullptr; + mutable T* query_set_ = nullptr; + mutable T* d_base_set_ = nullptr; + mutable T* d_query_set_ = nullptr; + mutable T* mapped_base_set_ = nullptr; +}; + +template +Dataset::~Dataset() +{ + delete[] base_set_; + delete[] query_set_; + if (d_base_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_base_set_)); } + if (d_query_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_query_set_)); } +} + +template +const T* Dataset::base_set_on_gpu() const +{ + if (!d_base_set_) { + base_set(); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( + d_base_set_, base_set_, base_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_base_set_; +} + +template +const T* Dataset::query_set_on_gpu() const +{ + if (!d_query_set_) { + query_set(); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMemcpy( + d_query_set_, query_set_, query_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + } + return d_query_set_; +} + +template +const T* Dataset::mapped_base_set() const +{ + if (!mapped_base_set_) { map_base_set_(); } + return mapped_base_set_; +} + +template +class BinDataset : public Dataset { + public: + BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const 
std::string& query_file, + const std::string& distance); + ~BinDataset() + { + if (this->mapped_base_set_) { + base_file_.unmap(reinterpret_cast(this->mapped_base_set_) - subset_offset_); + } + } + + private: + void load_base_set_() const override; + void load_query_set_() const override; + void map_base_set_() const override; + + using Dataset::dim_; + using Dataset::base_set_size_; + using Dataset::query_set_size_; + + BinFile base_file_; + BinFile query_file_; + + size_t subset_offset_; +}; + +template +BinDataset::BinDataset(const std::string& name, + const std::string& base_file, + size_t subset_first_row, + size_t subset_size, + const std::string& query_file, + const std::string& distance) + : Dataset(name, distance), + base_file_(base_file, "r", subset_first_row, subset_size), + query_file_(query_file, "r"), + subset_offset_(2 * sizeof(uint32_t) + subset_first_row * dim_ * sizeof(T)) +{ + base_file_.get_shape(&base_set_size_, &dim_); + int query_dim; + query_file_.get_shape(&query_set_size_, &query_dim); + if (query_dim != dim_) { + throw std::runtime_error("base set dim (" + std::to_string(dim_) + ") != query set dim (" + + std::to_string(query_dim)); + } +} + +template +void BinDataset::load_base_set_() const +{ + this->base_set_ = new T[base_set_size_ * dim_]; + base_file_.read(this->base_set_); +} + +template +void BinDataset::load_query_set_() const +{ + this->query_set_ = new T[query_set_size_ * dim_]; + query_file_.read(this->query_set_); +} + +template +void BinDataset::map_base_set_() const +{ + char* original_map_ptr = static_cast(base_file_.map()); + this->mapped_base_set_ = reinterpret_cast(original_map_ptr + subset_offset_); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/util.cpp b/cpp/bench/ann/src/common/util.cpp new file mode 100644 index 0000000000..17636f76d7 --- /dev/null +++ b/cpp/bench/ann/src/common/util.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "util.h" + +#include +#include + +#include +#include + +namespace raft::bench::ann { + +std::vector split(const std::string& s, char delimiter) +{ + std::vector tokens; + std::string token; + std::istringstream iss(s); + while (getline(iss, token, delimiter)) { + if (!token.empty()) { tokens.push_back(token); } + } + return tokens; +} + +bool file_exists(const std::string& filename) +{ + struct stat statbuf; + if (stat(filename.c_str(), &statbuf) != 0) { return false; } + return S_ISREG(statbuf.st_mode); +} + +bool dir_exists(const std::string& dir) +{ + struct stat statbuf; + if (stat(dir.c_str(), &statbuf) != 0) { return false; } + return S_ISDIR(statbuf.st_mode); +} + +bool create_dir(const std::string& dir) +{ + const auto path = split(dir, '/'); + + std::string cwd; + if (!dir.empty() && dir[0] == '/') { cwd += '/'; } + + for (const auto& p : path) { + cwd += p + "/"; + if (!dir_exists(cwd)) { + int ret = mkdir(cwd.c_str(), S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + if (ret != 0) { return false; } + } + } + return true; +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/util.h b/cpp/bench/ann/src/common/util.h new file mode 100644 index 0000000000..290bf4cea9 --- /dev/null +++ b/cpp/bench/ann/src/common/util.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +class Timer { + public: + Timer() { reset(); } + void reset() { start_time_ = std::chrono::steady_clock::now(); } + float elapsed_ms() + { + auto end_time = std::chrono::steady_clock::now(); + auto dur = + std::chrono::duration_cast>(end_time - start_time_); + return dur.count(); + } + + private: + std::chrono::steady_clock::time_point start_time_; +}; + +std::vector split(const std::string& s, char delimiter); + +bool file_exists(const std::string& filename); +bool dir_exists(const std::string& dir); +bool create_dir(const std::string& dir); + +template +void log_(const char* level, Ts... vs) +{ + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + printf("%s [%s] ", buf, level); + printf(vs...); + printf("\n"); + fflush(stdout); +} + +template +void log_info(Ts... vs) +{ + log_("info", vs...); +} + +template +void log_warn(Ts... vs) +{ + log_("warn", vs...); +} + +template +void log_error(Ts... 
vs) +{ + log_("error", vs...); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu new file mode 100644 index 0000000000..294da9a14f --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#undef WARP_SIZE +#include "faiss_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) +{ + param.nlist = conf.at("nlist"); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFPQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("useFloat16")) { + param.useFloat16 = conf.at("useFloat16"); + } else { + param.useFloat16 = false; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpuIVFSQ::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + param.quantizer_type = conf.at("quantizer_type"); +} + 
+template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissGpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "faiss_gpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_gpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_gpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_gpu_ivf_flat" || algo == "faiss_gpu_ivf_pq" || algo == "faiss_gpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_gpu_flat") { + 
auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h b/cpp/bench/ann/src/faiss/faiss_wrapper.h new file mode 100644 index 0000000000..8cfc26ea5b --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_wrapper.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FAISS_WRAPPER_H_ +#define FAISS_WRAPPER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#include + +namespace { + +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} + +// note BLAS library can still use multi-threading, and +// setting environment variable like OPENBLAS_NUM_THREADS can control it +class OmpSingleThreadScope { + public: + OmpSingleThreadScope() + { + max_threads_ = omp_get_max_threads(); + omp_set_num_threads(1); + } + ~OmpSingleThreadScope() + { + // the best we can do + omp_set_num_threads(max_threads_); + } + + private: + int max_threads_; +}; + +} // namespace + +namespace raft::bench::ann { + +template +class FaissGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + }; + + FaissGpu(Metric metric, int dim, int nlist); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_property() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than GPU memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = 
MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + + protected: + template + void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + mutable faiss::gpu::StandardGpuResources gpu_resource_; + std::unique_ptr index_; + faiss::MetricType metric_type_; + int nlist_; + int device_; +}; + +template +FaissGpu::FaissGpu(Metric metric, int dim, int nlist) + : ANN(metric, dim), metric_type_(parse_metric_type(metric)), nlist_(nlist) +{ + static_assert(std::is_same_v, "faiss support only float type"); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void FaissGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + OmpSingleThreadScope omp_single_thread; + + gpu_resource_.setDefaultStream(device_, stream); + index_->train(nrow, dataset); // faiss::gpu::GpuIndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); +} + +template +void FaissGpu::set_search_param(const AnnSearchParam& param) +{ + int nprobe = dynamic_cast(param).nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->setNumProbes(nprobe); +} + +template +void FaissGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::Index::idx_t), + "sizes of size_t and faiss::Index::idx_t are different"); + gpu_resource_.setDefaultStream(device_, stream); + index_->search( + batch_size, queries, k, distances, reinterpret_cast(neighbors)); +} + +template +template +void FaissGpu::save_(const std::string& file) const +{ + OmpSingleThreadScope omp_single_thread; + + auto cpu_index = std::make_unique(); + dynamic_cast(index_.get())->copyTo(cpu_index.get()); + faiss::write_index(cpu_index.get(), file.c_str()); +} + +template +template +void FaissGpu::load_(const std::string& file) +{ + OmpSingleThreadScope omp_single_thread; + + std::unique_ptr 
cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); + assert(cpu_index); + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); +} + +template +class FaissGpuIVFFlat : public FaissGpu { + public: + struct BuildParam { + int nlist; + }; + + FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFPQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + int M; + bool useFloat16; + bool usePrecomputed; + }; + + FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::gpu::GpuIndexIVFPQConfig config; + config.useFloat16LookupTables = param.useFloat16; + config.usePrecomputedTables = param.usePrecomputed; + config.device = this->device_; + this->index_ = + std::make_unique(&(this->gpu_resource_), + dim, + param.nlist, + param.M, + 8, // FAISS only supports bitsPerCode=8 + this->metric_type_, + config); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissGpuIVFSQ : public FaissGpu { + public: + struct BuildParam { + int nlist; + std::string quantizer_type; + }; + + FaissGpuIVFSQ(Metric metric, int dim, const BuildParam& param) + : FaissGpu(metric, dim, param.nlist) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if (param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + 
throw std::runtime_error("FaissGpuIVFSQ supports only fp16 and int8 but got " + + param.quantizer_type); + } + + faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); + } + + void save(const std::string& file) const override + { + this->template save_( + file); + } + void load(const std::string& file) override + { + this->template load_( + file); + } +}; + +template +class FaissGpuFlat : public FaissGpu { + public: + FaissGpuFlat(Metric metric, int dim) : FaissGpu(metric, dim, 0) + { + faiss::gpu::GpuIndexFlatConfig config; + config.device = this->device_; + this->index_ = std::make_unique( + &(this->gpu_resource_), dim, this->metric_type_, config); + } + + // class FaissGpu is more like a IVF class, so need special treating here + void set_search_param(const typename ANN::AnnSearchParam&) override{}; + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +} // namespace raft::bench::ann + +#endif diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu new file mode 100644 index 0000000000..8072cd857c --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "ggnn_wrapper.cuh" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::BuildParam& param) +{ + param.dataset_size = conf.at("dataset_size"); + param.k = conf.at("k"); + + if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } + if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } + if (conf.contains("num_layers")) { param.num_layers = conf.at("num_layers"); } + if (conf.contains("tau")) { param.tau = conf.at("tau"); } + if (conf.contains("refine_iterations")) { + param.refine_iterations = conf.at("refine_iterations"); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::Ggnn::SearchParam& param) +{ + param.tau = conf.at("tau"); + + if (conf.contains("block_dim")) { param.block_dim = conf.at("block_dim"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("cache_size")) { param.cache_size = conf.at("cache_size"); } + if (conf.contains("sorted_size")) { param.sorted_size = conf.at("sorted_size"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> 
create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) {} + + if constexpr (std::is_same_v) {} + + if (algo == "ggnn") { ann = make_algo(metric, dim, conf); } + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "ggnn") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh new file mode 100644 index 0000000000..fd8fe0f2ec --- /dev/null +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#include +#include + +namespace raft::bench::ann { + +template +class GgnnImpl; + +template +class Ggnn : public ANN { + public: + struct BuildParam { + int k_build{24}; // KBuild + int segment_size{32}; // S + int num_layers{4}; // L + float tau{0.5}; + int refine_iterations{2}; + + size_t dataset_size; + int k; // GGNN requires to know k during building + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + float tau; + int block_dim{32}; + int max_iterations{400}; + int cache_size{512}; + int sorted_size{256}; + }; + + Ggnn(Metric metric, int dim, const BuildParam& param); + ~Ggnn() { delete impl_; } + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override + { + impl_->build(dataset, nrow, stream); + } + + void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override + { + impl_->search(queries, batch_size, k, neighbors, distances, stream); + } + + void save(const std::string& file) const override { impl_->save(file); } + void load(const std::string& file) override { impl_->load(file); } + + AlgoProperty get_property() const override { return impl_->get_property(); } + + void set_search_dataset(const T* dataset, size_t nrow) override + { + impl_->set_search_dataset(dataset, nrow); + }; + + private: + ANN* impl_; +}; + +template +Ggnn::Ggnn(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + // ggnn/src/sift1m.cu + if (metric == Metric::kEuclidean && dim == 128 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/deep1b_multi_gpu.cu, and adapt it deep1B + else if (metric == Metric::kEuclidean && dim == 96 && 
param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 24 && param.k == 10 && + param.segment_size == 32) { + impl_ = new GgnnImpl(metric, dim, param); + } else if (metric == Metric::kInnerProduct && dim == 96 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } + // ggnn/src/glove200.cu, adapt it to glove100 + else if (metric == Metric::kInnerProduct && dim == 100 && param.k_build == 96 && param.k == 10 && + param.segment_size == 64) { + impl_ = new GgnnImpl(metric, dim, param); + } else { + throw std::runtime_error( + "ggnn: not supported combination of metric, dim and build param; " + "see Ggnn's constructor in ggnn_wrapper.cuh for available combinations"); + } +} + +template +class GgnnImpl : public ANN { + public: + using typename ANN::AnnSearchParam; + + GgnnImpl(Metric metric, int dim, const typename Ggnn::BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& file) const override; + void load(const std::string& file) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + + void set_search_dataset(const T* dataset, size_t nrow) override; + + private: + using ANN::metric_; + using ANN::dim_; + + using GGNNGPUInstance = GGNNGPUInstance; + std::unique_ptr ggnn_; + typename Ggnn::BuildParam build_param_; + typename Ggnn::SearchParam search_param_; +}; + +template 
+GgnnImpl::GgnnImpl(Metric metric, + int dim, + const typename Ggnn::BuildParam& param) + : ANN(metric, dim), build_param_(param) +{ + if (metric_ == Metric::kInnerProduct) { + if (measure != Cosine) { throw std::runtime_error("mis-matched metric"); } + } else if (metric_ == Metric::kEuclidean) { + if (measure != Euclidean) { throw std::runtime_error("mis-matched metric"); } + } else { + throw std::runtime_error( + "ggnn supports only metric type of InnerProduct, Cosine and Euclidean"); + } + + if (dim != D) { throw std::runtime_error("mis-matched dim"); } + + int device; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + + ggnn_ = std::make_unique( + device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau); +} + +template +void GgnnImpl::build(const T* dataset, + size_t nrow, + cudaStream_t stream) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + + ggnn_->set_base_data(dataset); + ggnn_->set_stream(stream); + ggnn_->build(0); + for (int i = 0; i < build_param_.refine_iterations; ++i) { + ggnn_->refine(); + } +} + +template +void GgnnImpl::set_search_dataset(const T* dataset, size_t nrow) +{ + if (nrow != build_param_.dataset_size) { + throw std::runtime_error( + "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + + " , but nrow = " + std::to_string(nrow)); + } + ggnn_->set_base_data(dataset); +} + +template +void GgnnImpl::set_search_param(const AnnSearchParam& param) +{ + search_param_ = dynamic_cast::SearchParam&>(param); +} + +template +void GgnnImpl::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); + if (k != KQuery) { + throw std::runtime_error( + "k = " + std::to_string(k) + + ", but this 
GGNN instance only supports k = " + std::to_string(KQuery)); + } + + ggnn_->set_stream(stream); + RAFT_CUDA_TRY(cudaMemcpyToSymbol(c_tau_query, &search_param_.tau, sizeof(float))); + + const int block_dim = search_param_.block_dim; + const int max_iterations = search_param_.max_iterations; + const int cache_size = search_param_.cache_size; + const int sorted_size = search_param_.sorted_size; + // default value + if (block_dim == 32 && max_iterations == 400 && cache_size == 512 && sorted_size == 256) { + ggnn_->template queryLayer<32, 400, 512, 256, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 200 && cache_size == 256 && sorted_size == 64) { + ggnn_->template queryLayer<32, 200, 256, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/sift1m.cu + else if (block_dim == 32 && max_iterations == 400 && cache_size == 448 && sorted_size == 64) { + ggnn_->template queryLayer<32, 400, 448, 64, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // ggnn/src/glove200.cu + else if (block_dim == 128 && max_iterations == 2000 && cache_size == 2048 && sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 2048, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } + // for glove100 + else if (block_dim == 64 && max_iterations == 400 && cache_size == 512 && sorted_size == 32) { + ggnn_->template queryLayer<64, 400, 512, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else if (block_dim == 128 && max_iterations == 2000 && cache_size == 1024 && + sorted_size == 32) { + ggnn_->template queryLayer<128, 2000, 1024, 32, false>( + queries, batch_size, reinterpret_cast(neighbors), distances); + } else { + throw std::runtime_error("ggnn: not supported search param"); + } +} + +template +void GgnnImpl::save(const std::string& file) const +{ + auto& ggnn_host = 
ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.downloadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); + ggnn_host.store(file); +} + +template +void GgnnImpl::load(const std::string& file) +{ + auto& ggnn_host = ggnn_->ggnn_cpu_buffers.at(0); + auto& ggnn_device = ggnn_->ggnn_shards.at(0); + ggnn_->set_stream(0); + + ggnn_host.load(file); + ggnn_host.uploadAsync(ggnn_device); + RAFT_CUDA_TRY(cudaStreamSynchronize(ggnn_device.stream)); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp new file mode 100644 index 0000000000..cd823e8a69 --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/benchmark_util.hpp" + +#include "../common/ann_types.hpp" +#undef WARP_SIZE +#include "hnswlib_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::BuildParam& param) +{ + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::HnswLib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if constexpr (std::is_same_v) { + if (algo == "hnswlib") { ann = make_algo(metric, dim, conf); } + } + + if (!ann) { throw std::runtime_error("invalid 
algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "hnswlib") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h new file mode 100644 index 0000000000..c5c3a4a2a6 --- /dev/null +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include + +namespace raft::bench::ann { + +template +struct hnsw_dist_t { + using type = void; +}; + +template <> +struct hnsw_dist_t { + using type = float; +}; + +template <> +struct hnsw_dist_t { + using type = int; +}; + +class FixedThreadPool { + public: + FixedThreadPool(int num_threads) + { + if (num_threads < 1) { + throw std::runtime_error("num_threads must >= 1"); + } else if (num_threads == 1) { + return; + } + + tasks_ = new Task_[num_threads]; + + threads_.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + threads_.emplace_back([&, i] { + auto& task = tasks_[i]; + while (true) { + std::unique_lock lock(task.mtx); + task.cv.wait(lock, + [&] { return task.has_task || finished_.load(std::memory_order_relaxed); }); + if (finished_.load(std::memory_order_relaxed)) { break; } + + task.task(); + task.has_task = false; + } + }); + } + } + + ~FixedThreadPool() + { + if (threads_.empty()) { return; } + + finished_.store(true, std::memory_order_relaxed); + for (unsigned i = 0; i < threads_.size(); ++i) { + auto& task = tasks_[i]; + std::lock_guard(task.mtx); + + task.cv.notify_one(); + threads_[i].join(); + } + + delete[] tasks_; + } + + template + void submit(Func f, IdxT len) + { + if (threads_.empty()) { + for (IdxT i = 0; i < len; ++i) { + f(i); + } + return; + } + + const int num_threads = threads_.size(); + // one extra part for competition among threads + const IdxT items_per_thread = len / (num_threads + 1); + std::atomic cnt(items_per_thread * num_threads); + + auto wrapped_f = [&](IdxT start, IdxT end) { + for (IdxT i = start; i < end; ++i) { + f(i); + } + + while (true) { + IdxT i = cnt.fetch_add(1, std::memory_order_relaxed); + if (i >= len) { break; } + f(i); + } + }; + + std::vector> futures; + futures.reserve(num_threads); + 
for (int i = 0; i < num_threads; ++i) { + IdxT start = i * items_per_thread; + auto& task = tasks_[i]; + { + std::lock_guard lock(task.mtx); + (void)lock; // stop nvcc warning + task.task = std::packaged_task([=] { wrapped_f(start, start + items_per_thread); }); + futures.push_back(task.task.get_future()); + task.has_task = true; + } + task.cv.notify_one(); + } + + for (auto& fut : futures) { + fut.wait(); + } + return; + } + + private: + struct alignas(64) Task_ { + std::mutex mtx; + std::condition_variable cv; + bool has_task = false; + std::packaged_task task; + }; + + Task_* tasks_; + std::vector threads_; + std::atomic finished_{false}; +}; + +template +class HnswLib : public ANN { + public: + // https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md + struct BuildParam { + int M; + int ef_construction; + int num_threads{1}; + }; + + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads{1}; + }; + + HnswLib(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) override; + + void set_search_param(const AnnSearchParam& param) override; + void search(const T* query, + int batch_size, + int k, + size_t* indices, + float* distances, + cudaStream_t stream = 0) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + property.need_dataset_when_search = false; + return property; + } + + private: + void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; + + std::unique_ptr::type>> appr_alg_; + std::unique_ptr::type>> space_; + + using ANN::metric_; + using ANN::dim_; + int ef_construction_; + int m_; + int num_threads_; + std::unique_ptr thread_pool_; 
+}; + +template +HnswLib::HnswLib(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim) +{ + assert(dim_ > 0); + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { + if (metric_ != Metric::kEuclidean) { + throw std::runtime_error("hnswlib only supports Euclidean distance"); + } + } + + ef_construction_ = param.ef_construction; + m_ = param.M; + num_threads_ = param.num_threads; +} + +template +void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), nrow, m_, ef_construction_); + + thread_pool_ = std::make_unique(num_threads_); + const size_t items_per_thread = nrow / (num_threads_ + 1); + + thread_pool_->submit( + [&](size_t i) { + if (i < items_per_thread && i % 10000 == 0) { + char buf[20]; + std::time_t now = std::time(nullptr); + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); + + printf("%s building %zu / %zu\n", buf, i, items_per_thread); + fflush(stdout); + } + + appr_alg_->addPoint(dataset + i * dim_, i); + }, + nrow); +} + +template +void HnswLib::set_search_param(const AnnSearchParam& param_) +{ + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + + if (!thread_pool_ || num_threads_ != param.num_threads) { + num_threads_ = param.num_threads; + thread_pool_ = std::make_unique(num_threads_); + } +} + +template +void HnswLib::search( + const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const +{ + thread_pool_->submit( + [&](int i) { + get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }, + batch_size); +} + +template +void HnswLib::save(const std::string& path_to_index) const +{ + 
appr_alg_->saveIndex(std::string(path_to_index)); +} + +template +void HnswLib::load(const std::string& path_to_index) +{ + if constexpr (std::is_same_v) { + if (metric_ == Metric::kInnerProduct) { + space_ = std::make_unique(dim_); + } else { + space_ = std::make_unique(dim_); + } + } else if constexpr (std::is_same_v) { + space_ = std::make_unique(dim_); + } + + appr_alg_ = std::make_unique::type>>( + space_.get(), path_to_index); +} + +template +void HnswLib::get_search_knn_results_(const T* query, + int k, + size_t* indices, + float* distances) const +{ + auto result = appr_alg_->searchKnn(query, k); + assert(result.size() >= static_cast(k)); + + for (int i = k - 1; i >= 0; --i) { + indices[i] = result.top().second; + distances[i] = result.top().first; + result.pop(); + } +} + +}; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h new file mode 100644 index 0000000000..cb30c2693f --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::ann { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + // Even for L2 expanded RAFT IVF Flat uses unexpanded formula + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu new file mode 100644 index 0000000000..d8e98ce2a9 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef RAFT_COMPILED +#include +#endif + +#include "../common/ann_types.hpp" +#include "../common/benchmark_util.hpp" +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +#include "raft_ivf_pq_wrapper.h" +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); + std::cout << "kmeans_trainset_fraction " << param.kmeans_trainset_fraction; + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { 
param.pq_dim = conf.at("pq_dim"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + param.pq_param.n_probes = conf.at("numProbes"); + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + } +} +#endif + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + float refine_ratio, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all 
algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { ann = std::make_unique>(metric, dim); } +#endif + } + + if constexpr (std::is_same_v) {} + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + typename raft::bench::ann::RaftIvfPQ::BuildParam param; + parse_build_param(conf, param); + ann = + std::make_unique>(metric, dim, param, refine_ratio); + } +#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + if (refine_ratio > 1.0) {} + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN + if (algo == "raft_bfknn") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + if (algo == "raft_ivf_flat") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + if (algo == "raft_ivf_pq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } +#endif + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +}; // namespace raft::bench::ann + +#include "../common/benchmark.hpp" + +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu new file mode 100644 index 0000000000..ff108080b5 --- 
/dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_ivf_flat_wrapper.h" + +#ifdef RAFT_COMPILED +#include +#endif + +namespace raft::bench::ann { +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +template class RaftIvfFlatGpu; +} // namespace raft::bench::ann \ No newline at end of file diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h new file mode 100644 index 0000000000..8b2a7d329b --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfFlatGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_flat::search_params ivf_flat_params; + }; + + using BuildParam = raft::neighbors::ivf_flat::index_params; + + RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = false; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_flat::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + rmm::mr::pool_memory_resource mr_; +}; + +template +RaftIvfFlatGpu::RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + index_params_.metric = parse_metric_type(metric); + 
RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftIvfFlatGpu::build(const T* dataset, size_t nrow, cudaStream_t) +{ + index_.emplace( + raft::neighbors::ivf_flat::build(handle_, index_params_, dataset, IdxT(nrow), dimension_)); + return; +} + +template +void RaftIvfFlatGpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.ivf_flat_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfFlatGpu::save(const std::string& file) const +{ + raft::neighbors::ivf_flat::serialize(handle_, file, *index_); + return; +} + +template +void RaftIvfFlatGpu::load(const std::string& file) +{ + index_ = raft::neighbors::ivf_flat::deserialize(handle_, file); + return; +} + +template +void RaftIvfFlatGpu::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; + static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); + raft::neighbors::ivf_flat::search( + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr); + handle_.sync_stream(); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu new file mode 100644 index 0000000000..338bc9a32f --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_ivf_pq_wrapper.h" + +#ifdef RAFT_COMPILED +#include +#endif + +namespace raft::bench::ann { +template class RaftIvfPQ; +template class RaftIvfPQ; +template class RaftIvfPQ; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h new file mode 100644 index 0000000000..70dff81847 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftIvfPQ : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::ivf_pq::search_params pq_param; + }; + + using BuildParam = raft::neighbors::ivf_pq::index_params; + + RaftIvfPQ(Metric metric, int dim, const BuildParam& param, float refine_ratio); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + void set_search_dataset(const T* dataset, size_t nrow) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; // actually it is only used during refinement + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::ivf_pq::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + float refine_ratio_ = 1.0; + rmm::mr::pool_memory_resource mr_; + raft::device_matrix_view dataset_; +}; +template +RaftIvfPQ::RaftIvfPQ(Metric metric, int dim, const BuildParam& param, float refine_ratio) + : ANN(metric, dim), + index_params_(param), + 
dimension_(dim), + refine_ratio_(refine_ratio), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + index_params_.metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftIvfPQ::save(const std::string& file) const +{ + raft::runtime::neighbors::ivf_pq::serialize(handle_, file, *index_); +} + +template +void RaftIvfPQ::load(const std::string& file) +{ + auto index_tmp = raft::neighbors::ivf_pq::index(handle_, index_params_, dimension_); + raft::runtime::neighbors::ivf_pq::deserialize(handle_, file, &index_tmp); + index_.emplace(std::move(index_tmp)); + return; +} + +template +void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) +{ + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), index_->dim()); + + index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); + return; +} + +template +void RaftIvfPQ::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.pq_param; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); +} + +template +void RaftIvfPQ::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + if (refine_ratio_ > 1.0f) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); + auto candidates = raft::make_device_matrix(handle_, batch_size, k0); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); + + if (get_property().dataset_memory_type == MemoryType::Device) { + auto queries_v = + 
raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::refine(handle_, + dataset_, + queries_v, + candidates.view(), + neighbors_v, + distances_v, + index_->metric()); + } else { + auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream()); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + handle_.get_stream()); + + auto dataset_v = raft::make_host_matrix_view( + dataset_.data_handle(), batch_size, index_->dim()); + + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + index_->metric()); + + raft::copy(neighbors, + (size_t*)neighbors_host.data_handle(), + neighbors_host.size(), + handle_.get_stream()); + raft::copy( + distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); + } + } else { + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, index_->dim()); + auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); + + raft::runtime::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); + } + handle_.sync_stream(); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h new file mode 100644 index 0000000000..01f206ab70 --- /dev/null +++ 
b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" + +namespace raft_temp { + +inline raft::distance::DistanceType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return raft::distance::DistanceType::InnerProduct; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return raft::distance::DistanceType::L2Expanded; + } else { + throw std::runtime_error("raft supports only metric type of inner product and L2"); + } +} + +} // namespace raft_temp + +namespace raft::bench::ann { + +// brute force fused L2 KNN - RAFT +template +class RaftGpu : public ANN { + public: + using typename ANN::AnnSearchParam; + + RaftGpu(Metric metric, int dim); + + void build(const T*, size_t, cudaStream_t) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + 
property.dataset_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + void set_search_dataset(const T* dataset, size_t nrow) override; + void save(const std::string& file) const override; + void load(const std::string&) override { return; }; + + protected: + raft::distance::DistanceType metric_type_; + int device_; + const T* dataset_; + size_t nrow_; +}; + +template +RaftGpu::RaftGpu(Metric metric, int dim) + : ANN(metric, dim), metric_type_(raft_temp::parse_metric_type(metric)) +{ + static_assert(std::is_same_v, "raft support only float type"); + assert(metric_type_ == raft::distance::DistanceType::L2Expanded); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftGpu::build(const T*, size_t, cudaStream_t) +{ + // as this is brute force algo so no index building required + return; +} + +template +void RaftGpu::set_search_param(const AnnSearchParam&) +{ + // Nothing to set here as it is brute force implementation +} + +template +void RaftGpu::set_search_dataset(const T* dataset, size_t nrow) +{ + dataset_ = dataset; + nrow_ = nrow; +} + +template +void RaftGpu::save(const std::string& file) const +{ + // create a empty index file as no index to store. 
+ std::fstream fp; + fp.open(file.c_str(), std::ios::out); + if (!fp) { + printf("Error in creating file!!!\n"); + ; + return; + } + fp.close(); +} + +template +void RaftGpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + raft::spatial::knn::detail::fusedL2Knn(this->dim_, + reinterpret_cast(neighbors), + distances, + dataset_, + queries, + nrow_, + static_cast(batch_size), + k, + true, + true, + stream, + metric_type_); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt new file mode 100644 index 0000000000..f03a552c1d --- /dev/null +++ b/cpp/bench/prims/CMakeLists.txt @@ -0,0 +1,141 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +function(ConfigureBench) + + set(options OPTIONAL LIB) + set(oneValueArgs NAME) + set(multiValueArgs PATH TARGETS CONFIGURATIONS) + + cmake_parse_arguments(ConfigureBench "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(BENCH_NAME ${ConfigureBench_NAME}) + + add_executable(${BENCH_NAME} ${ConfigureBench_PATH}) + + target_link_libraries( + ${BENCH_NAME} + PRIVATE raft::raft + raft_internal + $<$:raft::compiled> + benchmark::benchmark + Threads::Threads + $ + $ + ) + + set_target_properties( + ${BENCH_NAME} + PROPERTIES # set target compile options + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options( + ${BENCH_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" + "$<$:${RAFT_CUDA_FLAGS}>" + ) + + target_include_directories(${BENCH_NAME} PUBLIC "$") + + install( + TARGETS ${BENCH_NAME} + COMPONENT testing + DESTINATION bin/gbench/prims/libraft + EXCLUDE_FROM_ALL + ) + +endfunction() + +if(BUILD_BENCH) + ConfigureBench( + NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu + bench/prims/main.cpp OPTIONAL LIB + ) + + ConfigureBench( + NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu + bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp + ) + + ConfigureBench( + NAME + DISTANCE_BENCH + PATH + bench/prims/distance/distance_cosine.cu + bench/prims/distance/distance_exp_l2.cu + bench/prims/distance/distance_l1.cu + bench/prims/distance/distance_unexp_l2.cu + bench/prims/distance/fused_l2_nn.cu + bench/prims/distance/masked_nn.cu + 
bench/prims/distance/kernels.cu + bench/prims/main.cpp + OPTIONAL + LIB + ) + + ConfigureBench( + NAME + LINALG_BENCH + PATH + bench/prims/linalg/add.cu + bench/prims/linalg/map_then_reduce.cu + bench/prims/linalg/matrix_vector_op.cu + bench/prims/linalg/norm.cu + bench/prims/linalg/normalize.cu + bench/prims/linalg/reduce_cols_by_key.cu + bench/prims/linalg/reduce_rows_by_key.cu + bench/prims/linalg/reduce.cu + bench/prims/main.cpp + ) + + ConfigureBench( + NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB + ) + + ConfigureBench( + NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu + bench/prims/random/rng.cu bench/prims/main.cpp + ) + + ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp) + + ConfigureBench( + NAME + NEIGHBORS_BENCH + PATH + bench/prims/neighbors/knn/brute_force_float_int64_t.cu + bench/prims/neighbors/knn/brute_force_float_uint32_t.cu + bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu + bench/prims/neighbors/refine_float_int64_t.cu + bench/prims/neighbors/refine_uint8_t_int64_t.cu + bench/prims/main.cpp + OPTIONAL + LIB + ) +endif() diff --git a/cpp/bench/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu similarity index 100% rename from cpp/bench/cluster/kmeans.cu rename to cpp/bench/prims/cluster/kmeans.cu diff --git a/cpp/bench/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu similarity index 100% rename from cpp/bench/cluster/kmeans_balanced.cu rename to cpp/bench/prims/cluster/kmeans_balanced.cu diff --git a/cpp/bench/common/benchmark.hpp 
b/cpp/bench/prims/common/benchmark.hpp similarity index 100% rename from cpp/bench/common/benchmark.hpp rename to cpp/bench/prims/common/benchmark.hpp diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh similarity index 100% rename from cpp/bench/distance/distance_common.cuh rename to cpp/bench/prims/distance/distance_common.cuh diff --git a/cpp/bench/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu similarity index 94% rename from cpp/bench/distance/distance_cosine.cu rename to cpp/bench/prims/distance/distance_cosine.cu index 20f29ce4ef..c8ac8067c8 100644 --- a/cpp/bench/distance/distance_cosine.cu +++ b/cpp/bench/prims/distance/distance_cosine.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_exp_l2.cu rename to cpp/bench/prims/distance/distance_exp_l2.cu index 5a3af17193..52b7fff05c 100644 --- a/cpp/bench/distance/distance_exp_l2.cu +++ b/cpp/bench/prims/distance/distance_exp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu similarity index 93% rename from cpp/bench/distance/distance_l1.cu rename to cpp/bench/prims/distance/distance_l1.cu index 2ad7d5e957..e80db48ef0 100644 --- a/cpp/bench/distance/distance_l1.cu +++ b/cpp/bench/prims/distance/distance_l1.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu similarity index 94% rename from cpp/bench/distance/distance_unexp_l2.cu rename to cpp/bench/prims/distance/distance_unexp_l2.cu index 406aca2378..7ac1a8a4b5 100644 --- a/cpp/bench/distance/distance_unexp_l2.cu +++ b/cpp/bench/prims/distance/distance_unexp_l2.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu similarity index 100% rename from cpp/bench/distance/fused_l2_nn.cu rename to cpp/bench/prims/distance/fused_l2_nn.cu diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu similarity index 100% rename from cpp/bench/distance/kernels.cu rename to cpp/bench/prims/distance/kernels.cu diff --git a/cpp/bench/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu similarity index 100% rename from cpp/bench/distance/masked_nn.cu rename to cpp/bench/prims/distance/masked_nn.cu diff --git a/cpp/bench/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu similarity index 100% rename from cpp/bench/distance/tune_pairwise/bench.cu rename to cpp/bench/prims/distance/tune_pairwise/bench.cu diff --git a/cpp/bench/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu similarity index 100% rename from cpp/bench/distance/tune_pairwise/kernel.cu rename to cpp/bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh similarity index 
100% rename from cpp/bench/distance/tune_pairwise/kernel.cuh rename to cpp/bench/prims/distance/tune_pairwise/kernel.cuh diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/prims/linalg/add.cu similarity index 96% rename from cpp/bench/linalg/add.cu rename to cpp/bench/prims/linalg/add.cu index 7d00b8cbae..456214ad7b 100644 --- a/cpp/bench/linalg/add.cu +++ b/cpp/bench/prims/linalg/add.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/prims/linalg/map_then_reduce.cu similarity index 97% rename from cpp/bench/linalg/map_then_reduce.cu rename to cpp/bench/prims/linalg/map_then_reduce.cu index 33a3e66264..84aebd85bf 100644 --- a/cpp/bench/linalg/map_then_reduce.cu +++ b/cpp/bench/prims/linalg/map_then_reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/prims/linalg/matrix_vector_op.cu similarity index 99% rename from cpp/bench/linalg/matrix_vector_op.cu rename to cpp/bench/prims/linalg/matrix_vector_op.cu index aa388955da..d1fbaee79b 100644 --- a/cpp/bench/linalg/matrix_vector_op.cu +++ b/cpp/bench/prims/linalg/matrix_vector_op.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu similarity index 98% rename from cpp/bench/linalg/norm.cu rename to cpp/bench/prims/linalg/norm.cu index efecee88c9..f83953f8e4 100644 --- a/cpp/bench/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu similarity index 98% rename from cpp/bench/linalg/normalize.cu rename to cpp/bench/prims/linalg/normalize.cu index d01473ffeb..ad9052a008 100644 --- a/cpp/bench/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/prims/linalg/reduce.cu similarity index 97% rename from cpp/bench/linalg/reduce.cu rename to cpp/bench/prims/linalg/reduce.cu index 015e0b8abe..cf41c5916a 100644 --- a/cpp/bench/linalg/reduce.cu +++ b/cpp/bench/prims/linalg/reduce.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_cols_by_key.cu rename to cpp/bench/prims/linalg/reduce_cols_by_key.cu index 43aeb69ab0..ac0c612ee4 100644 --- a/cpp/bench/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu similarity index 98% rename from cpp/bench/linalg/reduce_rows_by_key.cu rename to cpp/bench/prims/linalg/reduce_rows_by_key.cu index 075bc7c8c4..aa9c9a1f62 100644 --- a/cpp/bench/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/main.cpp b/cpp/bench/prims/main.cpp similarity index 92% rename from cpp/bench/main.cpp rename to cpp/bench/prims/main.cpp index 3162422e8e..40f539facf 100644 --- a/cpp/bench/main.cpp +++ b/cpp/bench/prims/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu similarity index 100% rename from cpp/bench/matrix/argmin.cu rename to cpp/bench/prims/matrix/argmin.cu diff --git a/cpp/bench/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu similarity index 100% rename from cpp/bench/matrix/gather.cu rename to cpp/bench/prims/matrix/gather.cu diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu similarity index 100% rename from cpp/bench/matrix/select_k.cu rename to cpp/bench/prims/matrix/select_k.cu diff --git a/cpp/bench/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh similarity index 100% rename from cpp/bench/neighbors/knn.cuh rename to cpp/bench/prims/neighbors/knn.cuh diff --git a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu index d981104e20..7df0599670 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu rename to cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu index 60f7edae96..9704d39e76 100644 --- a/cpp/bench/neighbors/knn/brute_force_float_uint32_t.cu +++ b/cpp/bench/prims/neighbors/knn/brute_force_float_uint32_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu index 594d4d16d2..fbbb4f9acc 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu similarity index 93% rename from cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu index bd268f036c..7067dbe1b6 100644 --- a/cpp/bench/neighbors/knn/ivf_flat_int8_t_int64_t.cu +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_flat_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_float_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu diff --git a/cpp/bench/neighbors/knn/ivf_pq_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/knn/ivf_pq_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu diff --git a/cpp/bench/neighbors/refine.cuh b/cpp/bench/prims/neighbors/refine.cuh similarity index 100% rename from cpp/bench/neighbors/refine.cuh rename to cpp/bench/prims/neighbors/refine.cuh diff --git a/cpp/bench/neighbors/refine_float_int64_t.cu b/cpp/bench/prims/neighbors/refine_float_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/refine_float_int64_t.cu rename to cpp/bench/prims/neighbors/refine_float_int64_t.cu diff --git a/cpp/bench/neighbors/refine_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu similarity index 100% rename from cpp/bench/neighbors/refine_uint8_t_int64_t.cu rename to cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/prims/random/make_blobs.cu similarity index 98% rename from cpp/bench/random/make_blobs.cu rename to cpp/bench/prims/random/make_blobs.cu index 950d80c499..f43d914cf2 
100644 --- a/cpp/bench/random/make_blobs.cu +++ b/cpp/bench/prims/random/make_blobs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/bench/random/permute.cu b/cpp/bench/prims/random/permute.cu similarity index 100% rename from cpp/bench/random/permute.cu rename to cpp/bench/prims/random/permute.cu diff --git a/cpp/bench/random/rng.cu b/cpp/bench/prims/random/rng.cu similarity index 98% rename from cpp/bench/random/rng.cu rename to cpp/bench/prims/random/rng.cu index 147adf26ae..d15c9441d7 100644 --- a/cpp/bench/random/rng.cu +++ b/cpp/bench/prims/random/rng.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/bench/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu similarity index 100% rename from cpp/bench/sparse/convert_csr.cu rename to cpp/bench/prims/sparse/convert_csr.cu diff --git a/cpp/cmake/modules/FindAVX.cmake b/cpp/cmake/modules/FindAVX.cmake new file mode 100644 index 0000000000..7f3b2dfc76 --- /dev/null +++ b/cpp/cmake/modules/FindAVX.cmake @@ -0,0 +1,110 @@ +# ============================================================================= +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# Note: This file was copied from PyTorch and modified for use in the RAFT library. +# Refer to thirdparty/LICENSES/LICENSE.pytorch for license and additional +# copyright information. 
+# ============================================================================= + +INCLUDE(CheckCXXSourceRuns) + +SET(AVX_CODE + " + #include + + int main() + { + __m256 a; + a = _mm256_set1_ps(0); + return 0; + } +" +) + +SET(AVX512_CODE + " + #include + + int main() + { + __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m512i b = a; + __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ); + return 0; + } +" +) + +SET(AVX2_CODE + " + #include + + int main() + { + __m256i a = {0}; + a = _mm256_abs_epi16(a); + __m256i x; + _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code + return 0; + } +" +) + +MACRO(CHECK_SSE lang type flags) + SET(__FLAG_I 1) + SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + FOREACH(__FLAG ${flags}) + IF(NOT ${lang}_${type}_FOUND) + SET(CMAKE_REQUIRED_FLAGS ${__FLAG}) + CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) + IF(${lang}_HAS_${type}_${__FLAG_I}) + SET(${lang}_${type}_FOUND + TRUE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "${__FLAG}" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + MATH(EXPR __FLAG_I "${__FLAG_I}+1") + ENDIF() + ENDFOREACH() + SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + IF(NOT ${lang}_${type}_FOUND) + SET(${lang}_${type}_FOUND + FALSE + CACHE BOOL "${lang} ${type} support" + ) + SET(${lang}_${type}_FLAGS + "" + CACHE STRING "${lang} ${type} flags" + ) + ENDIF() + + MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS) + +ENDMACRO() + +# CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX") CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2") CHECK_SSE(C +# "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") +# +CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX") +CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2") 
+CHECK_SSE(CXX "AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512") diff --git a/cpp/cmake/patches/ggnn.patch b/cpp/cmake/patches/ggnn.patch new file mode 100644 index 0000000000..95e1aaff4b --- /dev/null +++ b/cpp/cmake/patches/ggnn.patch @@ -0,0 +1,206 @@ +diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +index 8cbaf0d..6eb72ac 100644 +--- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh ++++ b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh +@@ -41,7 +41,6 @@ limitations under the License. + #include "ggnn/sym/cuda_knn_sym_query_layer.cuh" + #include "ggnn/utils/cuda_knn_utils.cuh" + #include "ggnn/utils/cuda_knn_constants.cuh" +-#include "ggnn/utils/cuda_knn_dataset.cuh" + + template + __global__ void divide(ValueT* res, ValueT* input, ValueT N) { +@@ -98,9 +97,7 @@ struct GGNNGPUInstance { + typedef GGNNGraphDevice GGNNGraphDevice; + typedef GGNNGraphHost GGNNGraphHost; + +- const Dataset* dataset; + GGNNGraphBuffer* ggnn_buffer {nullptr}; +- GGNNQuery ggnn_query; + + // Graph Shards resident on the GPU + std::vector ggnn_shards; +@@ -117,13 +114,12 @@ struct GGNNGPUInstance { + // number of shards that need to be processed by this instance + const int num_parts; + +- GGNNGPUInstance(const int gpu_id, const Dataset* dataset, ++ GGNNGPUInstance(const int gpu_id, + const int N_shard, const int L, + const bool enable_construction, const float tau_build, + const int num_parts=1, const int num_cpu_buffers=1) : + N_shard{N_shard}, L{L}, tau_build{tau_build}, +- dataset{dataset}, gpu_id{gpu_id}, +- ggnn_query{dataset->N_query, D, KQuery, num_parts}, ++ gpu_id{gpu_id}, + num_parts{num_parts} + { + CHECK_LE(L, MAX_LAYER); +@@ -135,7 +131,6 @@ struct GGNNGPUInstance { + CHECK_EQ(current_gpu_id, gpu_id) << "cudaSetDevice() needs to be called in advance!"; + } + +- ggnn_query.loadQueriesAsync(dataset->h_query, 0); + + computeGraphParameters(); + +@@ -186,7 +181,7 @@ struct GGNNGPUInstance { + } + + 
GGNNGPUInstance(const GGNNGPUInstance& other) +- : dataset{nullptr}, ggnn_query{0, D, KQuery}, ++ : + gpu_id{0}, N_shard{0}, num_parts{0} { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. +@@ -305,6 +300,7 @@ struct GGNNGPUInstance { + + // io + ++ /* + void waitForDiskIO(const int shard_id) { + auto& cpu_buffer = ggnn_cpu_buffers[shard_id%ggnn_cpu_buffers.size()]; + if (cpu_buffer.disk_io_thread.joinable()) +@@ -468,11 +464,12 @@ struct GGNNGPUInstance { + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } ++ */ + + // graph operations + + template +- void queryLayer(const int shard_id = 0) const { ++ void queryLayer(const BaseT* d_query, int batch_size, KeyT* d_query_result_ids, ValueT* d_query_result_dists, const int shard_id = 0) const { + CHECK_CUDA(cudaSetDevice(gpu_id)); + const auto& shard = ggnn_shards.at(shard_id%ggnn_shards.size()); + +@@ -482,21 +479,21 @@ struct GGNNGPUInstance { + + int* m_dist_statistics = nullptr; + if (DIST_STATS) +- cudaMallocManaged(&m_dist_statistics, dataset->N_query * sizeof(int)); ++ cudaMallocManaged(&m_dist_statistics, batch_size * sizeof(int)); + + QueryKernel query_kernel; + query_kernel.d_base = shard.d_base; +- query_kernel.d_query = ggnn_query.d_query; ++ query_kernel.d_query = d_query; + + query_kernel.d_graph = shard.d_graph; +- query_kernel.d_query_results = ggnn_query.d_query_result_ids; +- query_kernel.d_query_results_dists = ggnn_query.d_query_result_dists; ++ query_kernel.d_query_results = d_query_result_ids; ++ query_kernel.d_query_results_dists = d_query_result_dists; + + query_kernel.d_translation = shard.d_translation; + + query_kernel.d_nn1_stats = shard.d_nn1_stats; + +- query_kernel.N = dataset->N_query; ++ query_kernel.N = batch_size; + query_kernel.N_offset = 0; + + query_kernel.d_dist_stats = m_dist_statistics; +@@ -771,6 +768,16 @@ struct GGNNGPUInstance { + sym(layer, shard_id); + } + } ++ ++ void 
set_stream(cudaStream_t stream) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).stream = stream; ++ } ++ ++ void set_base_data(const BaseT* dataset) { ++ assert(ggnn_shards.size() == 1); ++ ggnn_shards.at(0).d_base = dataset; ++ } + }; + + #endif // INCLUDE_GGNN_CUDA_KNN_GGNN_GPU_INSTANCE_CUH_ +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +index c94a8f1..781226d 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_device.cuh +@@ -50,7 +50,7 @@ struct GGNNGraphDevice { + ValueT* d_nn1_stats; + + /// base data pointer for the shard. +- BaseT* d_base; ++ const BaseT* d_base; + + /// combined memory pool + char* d_memory; +@@ -69,7 +69,9 @@ struct GGNNGraphDevice { + const size_t selection_translation_size = align8(ST_all * sizeof(KeyT)); + const size_t nn1_stats_size = align8(2 * sizeof(ValueT)); + total_graph_size = graph_size + 2 * selection_translation_size + nn1_stats_size; +- base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ // base_size = align8(static_cast(N) * D * sizeof(BaseT)); ++ (void) N; ++ (void) D; + + const size_t total_size = base_size+total_graph_size; + +@@ -86,8 +88,7 @@ struct GGNNGraphDevice { + CHECK_CUDA(cudaMalloc(&d_memory, total_size)); + + size_t pos = 0; +- d_base = reinterpret_cast(d_memory+pos); +- pos += base_size; ++ d_base = nullptr; + d_graph = reinterpret_cast(d_memory+pos); + pos += graph_size; + d_translation = reinterpret_cast(d_memory+pos); +@@ -99,14 +100,14 @@ struct GGNNGraphDevice { + + CHECK_EQ(pos, total_size); + +- CHECK_CUDA(cudaStreamCreate(&stream)); ++ // CHECK_CUDA(cudaStreamCreate(&stream)); + + CHECK_CUDA(cudaPeekAtLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphDevice(const GGNNGraphDevice& other) { ++ GGNNGraphDevice(const GGNNGraphDevice&) { + // this exists to allow using vector::emplace_back + // when it 
triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +@@ -116,7 +117,7 @@ struct GGNNGraphDevice { + ~GGNNGraphDevice() { + cudaFree(d_memory); + +- CHECK_CUDA(cudaStreamDestroy(stream)); ++ // CHECK_CUDA(cudaStreamDestroy(stream)); + } + }; + +diff --git a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +index 2055f9e..ef5843a 100644 +--- a/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh ++++ b/include/ggnn/graph/cuda_knn_ggnn_graph_host.cuh +@@ -92,7 +92,7 @@ struct GGNNGraphHost { + CHECK_CUDA(cudaPeekAtLastError()); + } + +- GGNNGraphHost(const GGNNGraphHost& other) { ++ GGNNGraphHost(const GGNNGraphHost&) { + // this exists to allow using vector::emplace_back + // when it triggers a reallocation, this code will be called. + // always make sure that enough memory is reserved ahead of time. +diff --git a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +index 49d76a1..eef69e6 100644 +--- a/include/ggnn/select/cuda_knn_wrs_select_layer.cuh ++++ b/include/ggnn/select/cuda_knn_wrs_select_layer.cuh +@@ -22,7 +22,6 @@ limitations under the License. + #include + #include + +-#include + #include + + #include "ggnn/utils/cuda_knn_constants.cuh" diff --git a/cpp/cmake/patches/nlohmann_json.patch b/cpp/cmake/patches/nlohmann_json.patch new file mode 100644 index 0000000000..83dd56bc16 --- /dev/null +++ b/cpp/cmake/patches/nlohmann_json.patch @@ -0,0 +1,38 @@ +--- nlohmann/json.hpp 2021-05-06 11:40:39.770669693 +0800 ++++ nlohmann/json_patched.hpp 2021-06-02 18:46:43.849334466 +0800 +@@ -16607,6 +16607,21 @@ + } + } + ++ ++ template ::value, int> = 0> ++ bool is_negative_number(NumberType x) ++ { ++ return x < 0; ++ } ++ ++ template < typename NumberType, ++ enable_if_t < std::is_unsigned::value, int > = 0 > ++ bool is_negative_number(NumberType /*unused*/) ++ { ++ return false; ++ } ++ + /*! 
+ @brief dump an integer + +@@ -16649,12 +16664,11 @@ + // use a pointer to fill the buffer + auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg) + +- const bool is_negative = std::is_same::value && !(x >= 0); // see issue #755 + number_unsigned_t abs_value; + + unsigned int n_chars{}; + +- if (is_negative) ++ if (is_negative_number(x)) + { + *buffer_ptr = '-'; + abs_value = remove_sign(static_cast(x)); diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake new file mode 100644 index 0000000000..b7c132f2f1 --- /dev/null +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -0,0 +1,87 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_faiss) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(faiss + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) + + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) + endif() + + rapids_cpm_find(faiss ${PKG_VERSION} + GLOBAL_TARGETS faiss::faiss + CPM_ARGS + GIT_REPOSITORY ${PKG_REPOSITORY} + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + OPTIONS + "FAISS_ENABLE_PYTHON OFF" + "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" + "FAISS_ENABLE_GPU ON" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + ) + + if(TARGET faiss AND NOT TARGET faiss::faiss) + add_library(faiss::faiss ALIAS faiss) + endif() + + if(faiss_ADDED) + rapids_export(BUILD faiss + EXPORT_SET faiss-targets + GLOBAL_TARGETS faiss + NAMESPACE faiss::) + endif() + + # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` + rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it + rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + + # Tell cmake where it can find the generated faiss-config.cmake we wrote. 
+ include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-ann-bench-exports) +endfunction() + +if(NOT RAFT_FAISS_GIT_TAG) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) +endif() + +if(NOT RAFT_FAISS_GIT_REPOSITORY) + # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC + # (https://github.com/facebookresearch/faiss/pull/2446) + set(RAFT_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) + # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) +endif() + +find_and_configure_faiss(VERSION 1.7.0 + REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} + PINNED_TAG ${RAFT_FAISS_GIT_TAG} + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} + EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake new file mode 100644 index 0000000000..708acb6b8d --- /dev/null +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -0,0 +1,44 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_ggnn) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ ) + if (NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/ggnn-src/) + + execute_process ( + COMMAND git clone "https://github.com/${PKG_FORK}/ggnn" --branch ${PKG_PINNED_TAG} ggnn-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ ) + + message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}") + execute_process ( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.patch + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/ggnn-src + ) + endif() + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_ggnn(VERSION 0.5 + FORK cgtuebingen + PINNED_TAG release_0.5 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake new file mode 100644 index 0000000000..9334224de5 --- /dev/null +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_glog) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(glog ${PKG_VERSION} + GLOBAL_TARGETS glog::glog + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + ) + + if(glog_ADDED) + message(VERBOSE "RAFT: Using glog located in ${glog_SOURCE_DIR}") + else() + message(VERBOSE "RAFT: Using glog located in ${glog_DIR}") + endif() + + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_glog_SOURCE=/path/to/local/glog +find_and_configure_glog(VERSION 0.6.0 + FORK google + PINNED_TAG v0.6.0 + EXCLUDE_FROM_ALL ON + ) \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake new file mode 100644 index 0000000000..94033e8333 --- /dev/null +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_hnswlib) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + set ( EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) + if( NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/_deps/hnswlib-src ) + + execute_process ( + COMMAND git clone --branch=v0.6.2 https://github.com/nmslib/hnswlib.git hnswlib-src + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + + endif () + + include(cmake/modules/FindAVX.cmake) + + set(HNSW_CXX_FLAGS "") + if(CXX_AVX_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX_FLAGS}") + elseif(CXX_AVX2_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX2_FLAGS}") + elseif(CXX_AVX512_FOUND) + set(HNSW_CXX_FLAGS "${HNSW_CXX_FLAGS} ${CXX_AVX512_FLAGS}") + endif() +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_hnswlib(VERSION 0.6.2 + FORK nmslib + PINNED_TAG v0.6.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/cmake/thirdparty/get_nlohmann_json.cmake b/cpp/cmake/thirdparty/get_nlohmann_json.cmake new file mode 100644 index 0000000000..5de98a47ce --- /dev/null +++ b/cpp/cmake/thirdparty/get_nlohmann_json.cmake @@ -0,0 +1,39 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_nlohmann_json) + set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_cpm_find(nlohmann_json ${PKG_VERSION} + GLOBAL_TARGETS nlohmann_json::nlohmann_json + BUILD_EXPORT_SET raft-bench-ann-exports + INSTALL_EXPORT_SET raft-bench-ann-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/json.git + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}) + +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_nlohmann_json(VERSION 3.11.2 + FORK nlohmann + PINNED_TAG v3.11.2 + EXCLUDE_FROM_ALL YES) diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index a9d8777304..bb1d122a24 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -383,8 +383,8 @@ static int chooseNewCentroid(raft::device_resources const& handle, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); - CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( + RAFT_CHECK_CUDA(stream); + RAFT_CUDA_TRY(cudaMemcpyAsync( &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector @@ -523,7 +523,7 @@ static int initializeCentroids(raft::device_resources const& handle, WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); + 
RAFT_CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); RAFT_CHECK_CUDA(stream); @@ -534,7 +534,7 @@ static int initializeCentroids(raft::device_resources const& handle, } // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, codes, clusterSizes); RAFT_CHECK_CUDA(stream); @@ -598,7 +598,7 @@ static int assignCentroids(raft::device_resources const& handle, RAFT_CHECK_CUDA(stream); // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; @@ -606,7 +606,7 @@ static int assignCentroids(raft::device_resources const& handle, gridDim.y = 1; gridDim.z = 1; minDistances<<>>(n, k, dists, codes, clusterSizes); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Compute residual sum of squares *residual_host = thrust::reduce( @@ -825,8 +825,8 @@ int kmeans(raft::device_resources const& handle, // Trivial cases if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( + RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + RAFT_CUDA_TRY( cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); @@ -837,7 +837,7 @@ int kmeans(raft::device_resources const& handle, 1, std::min(ceildiv(n, BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), 
stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); RAFT_CHECK_CUDA(stream); *residual_host = thrust::reduce( diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh index 8c0cfeba28..73d93ab535 100644 --- a/cpp/include/raft/linalg/detail/lanczos.cuh +++ b/cpp/include/raft/linalg/detail/lanczos.cuh @@ -958,7 +958,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, @@ -1305,7 +1305,7 @@ int computeLargestEigenvectors( cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(cublasgemm(cublas_h, diff --git a/cpp/include/raft/solver/detail/lap_functions.cuh b/cpp/include/raft/solver/detail/lap_functions.cuh index 440e6901c6..63f27e6346 100644 --- a/cpp/include/raft/solver/detail/lap_functions.cuh +++ b/cpp/include/raft/solver/detail/lap_functions.cuh @@ -113,7 +113,7 @@ inline void initialReduction(raft::device_resources const& handle, kernel_rowReduction<<>>( d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_columnReduction<<>>( d_costs, d_vertices_dev.row_duals, @@ -121,7 +121,7 @@ inline void initialReduction(raft::device_resources const& handle, SP, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } template @@ -159,7 +159,7 @@ inline void computeInitialAssignments(raft::device_resources const& handle, SP, N, epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. 
@@ -191,7 +191,7 @@ inline int computeRowCovers(raft::device_resources const& handle, kernel_computeRowCovers<<>>( d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } @@ -268,7 +268,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, 0, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); thrust::exclusive_scan( @@ -286,7 +286,7 @@ inline vertex_t zeroCoverIteration(raft::device_resources const& handle, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -356,7 +356,7 @@ inline void reversePass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. 
std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); @@ -375,11 +375,11 @@ inline void reversePass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -410,7 +410,7 @@ inline void augmentationPass(raft::device_resources const& handle, handle.get_stream()>>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. // TODO: should be vertex_t @@ -432,7 +432,7 @@ inline void augmentationPass(raft::device_resources const& handle, kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); kernel_augmentation<<>>( d_vertices_dev.row_assignments, @@ -443,7 +443,7 @@ inline void augmentationPass(raft::device_resources const& handle, vertex_t{N}, row_ids_csr_size); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } @@ -471,7 +471,7 @@ inline void dualUpdate(raft::device_resources const& handle, N, std::numeric_limits::max()); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); detail::calculateRectangularDims(blocks_per_grid, threads_per_block, total_blocks, N, SP); kernel_dualUpdate_2<<>>( @@ -488,7 +488,7 @@ inline void dualUpdate(raft::device_resources const& handle, std::numeric_limits::max(), epsilon); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. 
@@ -508,7 +508,7 @@ inline void calcObjValDual(raft::device_resources const& handle, kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } // Function for calculating optimal objective function value using dual variables. @@ -529,7 +529,7 @@ inline void calcObjValPrimal(raft::device_resources const& handle, kernel_calcObjValPrimal<<>>( d_obj_val, d_costs, d_row_assignments, SP, N); - CHECK_CUDA(handle.get_stream()); + RAFT_CHECK_CUDA(handle.get_stream()); } } // namespace raft::solver::detail diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh index 7904c04ede..6e66bafe1f 100644 --- a/cpp/include/raft/solver/linear_assignment.cuh +++ b/cpp/include/raft/solver/linear_assignment.cuh @@ -170,7 +170,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } @@ -183,7 +183,7 @@ class LinearAssignmentProblem { { weight_t result; raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); - CHECK_CUDA(handle_.get_stream()); + RAFT_CHECK_CUDA(handle_.get_stream()); return result; } diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh index 63bc98b404..67d6f6c412 100644 --- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh +++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh @@ -962,7 +962,7 @@ int computeSmallestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, @@ -1312,7 +1312,7 @@ int computeLargestEigenvectors( 
cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(stream); + RAFT_CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp index e32b718117..73518e20ef 100644 --- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp @@ -352,7 +352,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // scales y by beta: // if (beta == 0) { - CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); } else if (beta != 1) { // TODO: Call from public API when ready RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index 0a7ca23028..1134513587 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -14,14 +14,6 @@ * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use raft_runtime/cudart_utils.hpp instead. 
- */ - -#ifndef __RAFT_RT_CUDART_UTILS_H -#define __RAFT_RT_CUDART_UTILS_H - #pragma once #include @@ -32,7 +24,7 @@ #include #include -#include +#include #include #include @@ -43,11 +35,6 @@ #include #include -// FIXME: Remove after consumers rename -#ifndef CUDA_TRY -#define CUDA_TRY(call) RAFT_CUDA_TRY(call) -#endif - /** * @brief Debug macro to check for CUDA errors * @@ -67,16 +54,6 @@ #define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); #endif -// FIXME: Remove after consumers rename -#ifndef CHECK_CUDA -#define CHECK_CUDA(call) RAFT_CHECK_CUDA(call) -#endif - -/** FIXME: remove after cuml rename */ -#ifndef CUDA_CHECK -#define CUDA_CHECK(call) RAFT_CUDA_TRY(call) -#endif - // /** // * @brief check for cuda runtime API errors but log error instead of raising // * exception. @@ -93,17 +70,6 @@ } \ } while (0) -// FIXME: Remove after cuml rename -#ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) RAFT_CUDA_TRY_NO_THROW(call) -#endif - -/** - * Alias to raft scope for now. 
- * TODO: Rename original implementations in 22.04 to fix - * https://github.com/rapidsai/raft/issues/128 - */ - namespace raft { /** Helper method to get to know warp size in device code */ @@ -215,7 +181,7 @@ class grid_1d_block_t { template void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -241,7 +207,8 @@ void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_vi template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + RAFT_CUDA_TRY( + cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -270,7 +237,7 @@ void print_device_vector(const char* variable_name, OutStream& out) { auto host_mem = std::make_unique(componentsCount); - CUDA_CHECK( + RAFT_CUDA_TRY( cudaMemcpy(host_mem.get(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem.get(), componentsCount, out); } @@ -532,5 +499,3 @@ inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_ } } // namespace raft - -#endif diff --git a/dependencies.yaml b/dependencies.yaml index dd361a0cdf..64fd7cd454 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -15,6 +15,16 @@ files: - run_pylibraft - test_python_common - test_pylibraft + bench_ann: + output: conda + matrix: + cuda: ["11.8"] + arch: [x86_64] + includes: + - build + - develop + - cudatoolkit + - nn_bench test_cpp: output: none includes: @@ -137,6 +147,17 @@ dependencies: - output_types: [conda] packages: - clang-tools=11.1.0 + nn_bench: + common: + - output_types: [conda] + packages: + - hnswlib=0.7.0 + - nlohmann_json>=3.11.2 + - glog>=0.6.0 + 
- h5py>=3.8.0 + - libfaiss>=1.7.1 + - faiss-proc=*=cuda + cudatoolkit: specific: - output_types: conda diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md new file mode 100644 index 0000000000..708f5f7dba --- /dev/null +++ b/docs/source/cuda_ann_benchmarks.md @@ -0,0 +1,322 @@ +# CUDA ANN Benchmarks + +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. + +## Benchmark + +### Dependencies + +CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. + +Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. + +In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: +1. FAISS GPU >= 1.7.1 +2. Google Logging (GLog) +3. H5Py +4. HNSWLib +5. nlohmann_json +6. GGNN + +[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. + +The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: + +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks +``` + +The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. 
+ +### Compiling the Benchmarks + +After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. The following will build the executables for all the supported algorithms: +```bash +./build.sh bench-ann +``` + +You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`); note the quotes, which prevent the shell from interpreting the semicolons: +```bash +./build.sh bench-ann --limit-bench-ann="HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH" +``` + +Available targets to use with `--limit-bench-ann` are: +- FAISS_IVF_FLAT_ANN_BENCH +- FAISS_IVF_PQ_ANN_BENCH +- FAISS_BFKNN_ANN_BENCH +- GGNN_ANN_BENCH +- HNSWLIB_ANN_BENCH +- RAFT_IVF_PQ_ANN_BENCH +- RAFT_IVF_FLAT_ANN_BENCH +- RAFT_BFKNN_ANN_BENCH + +By default, the `*_ANN_BENCH` executables infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, `int8_t`, and `uint8_t` are supported. + +### Usage +There are 4 general steps to running the benchmarks: +1. Prepare Dataset +2. Build Index +3. Search Using Built Index +4.
Evaluate Result + +#### End-to-end Example +An end-to-end example (run from the RAFT source code root directory): +```bash +# (1) prepare a dataset +pushd + +cd cpp/bench/ann +mkdir data && cd data +wget http://ann-benchmarks.com/glove-100-angular.hdf5 + +# option -n is used here to normalize vectors so cosine distance is converted +# to inner product; don't use -n for l2 distance +python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 + +mkdir glove-100-inner +mv glove-100-angular.base.fbin glove-100-inner/base.fbin +mv glove-100-angular.query.fbin glove-100-inner/query.fbin +mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin +mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin +popd + +# (2) build index +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (3) search +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (4) evaluate result +pushd +cd cpp/bench/ann +./scripts/eval.pl \ + -o result.csv \ + data/glove-100-inner/groundtruth.neighbors.ibin \ + result/glove-100-inner/faiss_ivf_flat +popd + +# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool +``` + +##### Step 1: Prepare Dataset +A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. 
+ +The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file is `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. +These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. + +Some implementations can take `float16` database and query vectors as inputs and will have better performance. Use `scripts/fbin_to_f16bin.py` to transform a dataset from `float32` to `float16` type. + +Commonly used datasets can be downloaded from two websites: +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). + + However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + ```bash + pip3 install numpy h5py + ``` + The usage of this script is: + ```bash + $ cpp/bench/ann/scripts/hdf5_to_fbin.py + usage: scripts/hdf5_to_fbin.py [-n] .hdf5 + -n: normalize base/query set + outputs: .base.fbin + .query.fbin + .groundtruth.neighbors.ibin + .groundtruth.distances.fbin + ``` + So for an input `.hdf5` file, four output binary files will be produced. See the previous section for an example of preprocessing the GloVe dataset. + + Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. + +2. 
Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: + ```bash + $ cpp/bench/ann/scripts/split_groundtruth.pl + usage: script/split_groundtruth.pl input output_prefix + ``` + Take Deep-1B dataset as an example: + ```bash + pushd + cd cpp/bench/ann + mkdir -p data/deep-1B && cd data/deep-1B + # download manually "Ground Truth" file of "Yandex DEEP" + # suppose the file name is deep_new_groundtruth.public.10K.bin + ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + popd + ``` + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This means we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. + + +##### Step 2: Build Index +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. + +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. The configuration file has 3 sections: +* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. 
+ - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. +* `search_basic_param` section specifies basic parameters for searching: + - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. + - `run_count` means how many times we run the searching. A single run of searching will search neighbors for all vectors in `test` set. The total time used for a run is recorded, and the final searching time is the smallest one among these runs. +* `index` section specifies an array of configurations for index building and searching: + - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. + - `file` is the file name of index. Building will save built index to this file, while searching will load this file. + - `search_result_file` is the file name prefix of searching results. Searching will save results to these files, and plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in `search_params` array. That is, for one searching parameter, there will be some corresponding search result files. + - if `multigpu` is specified, multiple GPUs will be used for index build and search. + - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. 
+ + +The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: +```bash +$ ./cpp/build/*_ANN_BENCH -h +usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json + -b: build mode, will build index + -s: search mode, will search using built index + one and only one of -b and -s should be specified + -f: force overwriting existing output files + -i: by default will build/search all the indices found in conf.json + '-i' can be used to select a subset of indices + 'index_names' is a list of comma-separated index names + '*' is allowed as the last character of a name to select all matched indices + for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" +``` +* `-b`: build index. +* `-s`: do the searching with built index. +* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. +* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. + +It's easier to describe the usage of the `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: +```json + "index" : [ + { + "name" : "hnsw1", + ... + }, + { + "name" : "hnsw2", + ... + }, + { + "name" : "faiss", + ... 
+ } + ] +``` +Then, +```bash +# build all indices: hnsw1, hnsw2 and faiss +./cpp/build/HNSWLIB_ANN_BENCH -b a.json + +# build only hnsw1 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json + +# build faiss +./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json +``` +In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. + + +##### Step 3: Searching +Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. + + +##### Step 4: Evaluating Results +Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: +```bash +$ cpp/bench/ann/scripts/eval.pl +usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -o: also write result to a csv file +``` +Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
+An example:
+```bash
+cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \
+  result/glove-100-angular/10/hnsw/angular_M_24_*.txt \
+  result/glove-100-angular/10/faiss/
+```
+The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively.
+
+This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively.
+
+It saves the recall value in the result txt file, so it avoids recomputing recall if the same command is run again. To force recomputation of recall, the `-f` option can be used. If the `-o <output.csv>` option is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves.
+
+## Adding a new ANN algorithm
+Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions.
+
+In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is:
+```c++
+template<typename T>
+class HnswLib : public ANN<T> {
+public:
+  struct BuildParam {
+    int M;
+    int ef_construction;
+    int num_threads;
+  };
+
+  using typename ANN<T>::AnnSearchParam;
+  struct SearchParam : public AnnSearchParam {
+    int ef;
+    int num_threads;
+  };
+
+  // ...
+};
+```
+
+The benchmark program uses a JSON configuration file. To add a new algorithm to the benchmark, one needs to be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in the configuration file.
Still take the configuration for `HnswLib` as an example:
+```json
+{
+  "name" : "...",
+  "algo" : "hnswlib",
+  "build_param": {"M":12, "efConstruction":500, "numThreads":32},
+  "file" : "/path/to/file",
+  "search_params" : [
+    {"ef":10, "numThreads":1},
+    {"ef":20, "numThreads":1},
+    {"ef":40, "numThreads":1}
+  ],
+  "search_result_file" : "/path/to/file"
+},
+```
+
+How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`:
+1. First, add two functions for parsing the JSON object to `struct BuildParam` and `struct SearchParam`, respectively:
+    ```c++
+    template<typename T>
+    void parse_build_param(const nlohmann::json& conf,
+                           typename cuann::HnswLib<T>::BuildParam& param) {
+      param.ef_construction = conf.at("efConstruction");
+      param.M = conf.at("M");
+      if (conf.contains("numThreads")) {
+        param.num_threads = conf.at("numThreads");
+      }
+    }
+
+    template<typename T>
+    void parse_search_param(const nlohmann::json& conf,
+                            typename cuann::HnswLib<T>::SearchParam& param) {
+      param.ef = conf.at("ef");
+      if (conf.contains("numThreads")) {
+        param.num_threads = conf.at("numThreads");
+      }
+    }
+    ```
+
+2. Next, add the corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling the parsing functions. The string literal in the `if` condition statement must be the same as the value of `algo` in the configuration file. For example,
+    ```c++
+    // JSON configuration file contains a line like:  "algo" : "hnswlib"
+    if (algo == "hnswlib") {
+       // ...
+ } + ``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 814899c36b..23e346c872 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,7 @@ While not exhaustive, the following general categories help summarize the accele developer_guide.md cpp_api.rst pylibraft_api.rst + cuda_ann_benchmarks.md raft_dask_api.rst using_comms.rst using_libraft.md diff --git a/thirdparty/LICENSES/LICENSE.pytorch b/thirdparty/LICENSES/LICENSE.pytorch new file mode 100644 index 0000000000..7ad3d737a5 --- /dev/null +++ b/thirdparty/LICENSES/LICENSE.pytorch @@ -0,0 +1,77 @@ +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. 
+ +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file