diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bec89ab888..80e0c8b216 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -60,13 +60,13 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: branch - node_type: "gpu-latest-1" + node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci:latest" run_script: "ci/build_docs.sh" wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -78,7 +78,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-120-pip with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-120-pip with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8175b4fbc7..fcb155d651 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -60,14 +60,14 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 with: build_type: pull-request - node_type: "gpu-latest-1" + node_type: "gpu-v100-latest-1" arch: "amd64" container_image: "rapidsai/ci:latest" run_script: "ci/build_docs.sh" wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip with: build_type: pull-request package-name: pylibraft @@ -76,34 +76,31 @@ jobs: wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip with: build_type: pull-request package-name: pylibraft - test-before-amd64: "pip install cupy-cuda11x" - # On arm also need to install cupy from the specific webpage. 
- test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64" - test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + test-unittest: "python -m pytest ./python/pylibraft/pylibraft/test" test-smoketest: "python ./ci/wheel_smoke_test_pylibraft.py" wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip with: build_type: pull-request package-name: raft_dask package-dir: python/raft-dask - before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-wheelhouse" + before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft && python -m pip install --no-deps ./local-pylibraft/pylibraft*.whl" skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip with: build_type: pull-request package-name: raft_dask # Always want to test against latest dask/distributed. - test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" + test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" + test-unittest: "python -m pytest ./python/raft-dask/raft_dask/test" test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dc8f7b6f2b..d389c4e2a9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,19 +32,17 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip with: build_type: nightly branch: ${{ 
inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} package-name: pylibraft - test-before-amd64: "pip install cupy-cuda11x" - test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64" - test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test" + test-unittest: "python -m pytest ./python/pylibraft/pylibraft/test" wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip with: build_type: nightly branch: ${{ inputs.branch }} @@ -53,4 +51,4 @@ jobs: package-name: raft_dask test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test" + test-unittest: "python -m pytest ./python/raft-dask/raft_dask/test" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d6e4ecb676..2a70632497 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,7 +43,7 @@ repos: additional_dependencies: [toml] args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v11.1.0 + rev: v16.0.1 hooks: - id: clang-format types_or: [c, c++, cuda] diff --git a/CHANGELOG.md b/CHANGELOG.md index c4701f587f..c5ca5995e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,119 @@ +# raft 23.04.00 (6 Apr 2023) + +## 🚨 Breaking Changes + +- Pin `dask` and `distributed` for release ([#1399](https://github.com/rapidsai/raft/pull/1399)) [@galipremsagar](https://github.com/galipremsagar) +- Remove faiss_mr.hpp ([#1351](https://github.com/rapidsai/raft/pull/1351)) [@benfred](https://github.com/benfred) +- Removing FAISS from build ([#1340](https://github.com/rapidsai/raft/pull/1340)) [@cjnolet](https://github.com/cjnolet) +- Generic linalg::map ([#1337](https://github.com/rapidsai/raft/pull/1337)) [@achirkin](https://github.com/achirkin) +- Consolidate pre-compiled specializations into single `libraft` binary ([#1333](https://github.com/rapidsai/raft/pull/1333)) [@cjnolet](https://github.com/cjnolet) +- Generic linalg::map ([#1329](https://github.com/rapidsai/raft/pull/1329)) [@achirkin](https://github.com/achirkin) +- Update and standardize IVF indexes API ([#1328](https://github.com/rapidsai/raft/pull/1328)) [@viclafargue](https://github.com/viclafargue) +- IVF-Flat index splitting ([#1271](https://github.com/rapidsai/raft/pull/1271)) [@lowener](https://github.com/lowener) +- IVF-PQ: store cluster data in individual lists and reduce templates ([#1249](https://github.com/rapidsai/raft/pull/1249)) [@achirkin](https://github.com/achirkin) +- Fix for svd API ([#1190](https://github.com/rapidsai/raft/pull/1190)) [@lowener](https://github.com/lowener) +- Remove deprecated headers ([#1145](https://github.com/rapidsai/raft/pull/1145)) [@lowener](https://github.com/lowener) + +## 🐛 Bug Fixes + +- Fix primitives benchmarks ([#1389](https://github.com/rapidsai/raft/pull/1389)) [@ahendriksen](https://github.com/ahendriksen) +- Fixing index-url link on pip install docs ([#1378](https://github.com/rapidsai/raft/pull/1378)) 
[@cjnolet](https://github.com/cjnolet) +- Adding some functions back in that seem to be a copy/paste error ([#1373](https://github.com/rapidsai/raft/pull/1373)) [@cjnolet](https://github.com/cjnolet) +- Remove usage of Dask's `get_worker` ([#1365](https://github.com/rapidsai/raft/pull/1365)) [@pentschev](https://github.com/pentschev) +- Remove MANIFEST.in, use auto-generated one for sdists and package_data for wheels ([#1348](https://github.com/rapidsai/raft/pull/1348)) [@vyasr](https://github.com/vyasr) +- Revert "Generic linalg::map (#1329)" ([#1336](https://github.com/rapidsai/raft/pull/1336)) [@cjnolet](https://github.com/cjnolet) +- Small follow-up to specializations cleanup ([#1332](https://github.com/rapidsai/raft/pull/1332)) [@cjnolet](https://github.com/cjnolet) +- Fixing select_k specializations ([#1330](https://github.com/rapidsai/raft/pull/1330)) [@cjnolet](https://github.com/cjnolet) +- Fixing remaining bug in ann_quantized ([#1327](https://github.com/rapidsai/raft/pull/1327)) [@cjnolet](https://github.com/cjnolet) +- Fixing a couple small kmeans bugs ([#1274](https://github.com/rapidsai/raft/pull/1274)) [@cjnolet](https://github.com/cjnolet) +- Remove no longer instantiated templates from list of extern template declarations ([#1272](https://github.com/rapidsai/raft/pull/1272)) [@vyasr](https://github.com/vyasr) +- Bump pinned deps to 23.4 ([#1266](https://github.com/rapidsai/raft/pull/1266)) [@vyasr](https://github.com/vyasr) +- Fix the destruction of interruptible token registry ([#1229](https://github.com/rapidsai/raft/pull/1229)) [@achirkin](https://github.com/achirkin) +- Expose raft::handle_t in the public header ([#1192](https://github.com/rapidsai/raft/pull/1192)) [@vyasr](https://github.com/vyasr) +- Fix for svd API ([#1190](https://github.com/rapidsai/raft/pull/1190)) [@lowener](https://github.com/lowener) + +## 📖 Documentation + +- Adding architecture diagram to README.md ([#1370](https://github.com/rapidsai/raft/pull/1370)) [@cjnolet](https://github.com/cjnolet) +- Adding small readme image ([#1354](https://github.com/rapidsai/raft/pull/1354)) [@cjnolet](https://github.com/cjnolet) +- Fix serialize documentation of ivf_flat ([#1347](https://github.com/rapidsai/raft/pull/1347)) [@lowener](https://github.com/lowener) +- Small updates to docs ([#1339](https://github.com/rapidsai/raft/pull/1339)) [@cjnolet](https://github.com/cjnolet) + +## 🚀 New Features + +- Add Options to Generate Build Metrics Report ([#1369](https://github.com/rapidsai/raft/pull/1369)) [@divyegala](https://github.com/divyegala) +- Generic linalg::map ([#1337](https://github.com/rapidsai/raft/pull/1337)) [@achirkin](https://github.com/achirkin) +- Generic linalg::map ([#1329](https://github.com/rapidsai/raft/pull/1329)) [@achirkin](https://github.com/achirkin) +- matrix::select_k specializations ([#1268](https://github.com/rapidsai/raft/pull/1268)) [@achirkin](https://github.com/achirkin) +- Use rapids-cmake new COMPONENT exporting feature ([#1154](https://github.com/rapidsai/raft/pull/1154)) [@robertmaynard](https://github.com/robertmaynard) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1399](https://github.com/rapidsai/raft/pull/1399)) [@galipremsagar](https://github.com/galipremsagar) +- Pin cupy in wheel tests to supported versions ([#1383](https://github.com/rapidsai/raft/pull/1383)) [@vyasr](https://github.com/vyasr) +- CAGRA ([#1375](https://github.com/rapidsai/raft/pull/1375)) [@tfeher](https://github.com/tfeher) +- add a distance epilogue function to the
bfknn call ([#1371](https://github.com/rapidsai/raft/pull/1371)) [@benfred](https://github.com/benfred) +- Relax UCX pin to allow 1.14 ([#1366](https://github.com/rapidsai/raft/pull/1366)) [@pentschev](https://github.com/pentschev) +- Generate pyproject dependencies with dfg ([#1364](https://github.com/rapidsai/raft/pull/1364)) [@vyasr](https://github.com/vyasr) +- Add nccl to dependencies.yaml ([#1361](https://github.com/rapidsai/raft/pull/1361)) [@benfred](https://github.com/benfred) +- Add extern template for ivfflat_interleaved_scan ([#1360](https://github.com/rapidsai/raft/pull/1360)) [@ahendriksen](https://github.com/ahendriksen) +- Stop setting package version attribute in wheels ([#1359](https://github.com/rapidsai/raft/pull/1359)) [@vyasr](https://github.com/vyasr) +- Fix ivf flat specialization header IdxT from uint64_t -> int64_t ([#1358](https://github.com/rapidsai/raft/pull/1358)) [@ahendriksen](https://github.com/ahendriksen) +- Remove faiss_mr.hpp ([#1351](https://github.com/rapidsai/raft/pull/1351)) [@benfred](https://github.com/benfred) +- Rename optional helper function ([#1345](https://github.com/rapidsai/raft/pull/1345)) [@viclafargue](https://github.com/viclafargue) +- Pass minimum target compile options through `raft::raft` ([#1341](https://github.com/rapidsai/raft/pull/1341)) [@cjnolet](https://github.com/cjnolet) +- Removing FAISS from build ([#1340](https://github.com/rapidsai/raft/pull/1340)) [@cjnolet](https://github.com/cjnolet) +- Add dispatch based on compute architecture ([#1335](https://github.com/rapidsai/raft/pull/1335)) [@ahendriksen](https://github.com/ahendriksen) +- Consolidate pre-compiled specializations into single `libraft` binary ([#1333](https://github.com/rapidsai/raft/pull/1333)) [@cjnolet](https://github.com/cjnolet) +- Update and standardize IVF indexes API ([#1328](https://github.com/rapidsai/raft/pull/1328)) [@viclafargue](https://github.com/viclafargue) +- Using int64_t specializations for `ivf_pq` and `refine` ([#1325](https://github.com/rapidsai/raft/pull/1325)) [@cjnolet](https://github.com/cjnolet) +- Migrate as much as possible to pyproject.toml ([#1324](https://github.com/rapidsai/raft/pull/1324)) [@vyasr](https://github.com/vyasr) +- Pass `AWS_SESSION_TOKEN` and `SCCACHE_S3_USE_SSL` vars to conda build ([#1321](https://github.com/rapidsai/raft/pull/1321)) [@ajschmidt8](https://github.com/ajschmidt8) +- Numerical stability fixes for l2 pairwise distance ([#1319](https://github.com/rapidsai/raft/pull/1319)) [@benfred](https://github.com/benfred) +- Consolidate linter configuration into pyproject.toml ([#1317](https://github.com/rapidsai/raft/pull/1317)) [@vyasr](https://github.com/vyasr) +- IVF-Flat Python wrappers ([#1316](https://github.com/rapidsai/raft/pull/1316)) [@tfeher](https://github.com/tfeher) +- Add stream overloads to `ivf_pq` serialize/deserialize methods ([#1315](https://github.com/rapidsai/raft/pull/1315)) [@divyegala](https://github.com/divyegala) +- Temporary buffer to view host or device memory in device ([#1313](https://github.com/rapidsai/raft/pull/1313)) [@divyegala](https://github.com/divyegala) +- RAFT skeleton project template ([#1312](https://github.com/rapidsai/raft/pull/1312)) [@cjnolet](https://github.com/cjnolet) +- Fix docs build to be `pydata-sphinx-theme=0.13.0` compatible ([#1311](https://github.com/rapidsai/raft/pull/1311)) [@galipremsagar](https://github.com/galipremsagar) +- Update to GCC 11 ([#1309](https://github.com/rapidsai/raft/pull/1309)) [@bdice](https://github.com/bdice) +- Reduce compile 
times of distance specializations ([#1307](https://github.com/rapidsai/raft/pull/1307)) [@ahendriksen](https://github.com/ahendriksen) +- Fix docs upload path ([#1305](https://github.com/rapidsai/raft/pull/1305)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add end-to-end CUDA ann-benchmarks to raft ([#1304](https://github.com/rapidsai/raft/pull/1304)) [@cjnolet](https://github.com/cjnolet) +- Make docs builds less verbose ([#1302](https://github.com/rapidsai/raft/pull/1302)) [@AyodeAwe](https://github.com/AyodeAwe) +- Stop using versioneer to manage versions ([#1301](https://github.com/rapidsai/raft/pull/1301)) [@vyasr](https://github.com/vyasr) +- Adding util to get the device id for a pointer address ([#1297](https://github.com/rapidsai/raft/pull/1297)) [@cjnolet](https://github.com/cjnolet) +- Enable dfg in pre-commit. ([#1293](https://github.com/rapidsai/raft/pull/1293)) [@vyasr](https://github.com/vyasr) +- Python API for brute-force KNN ([#1292](https://github.com/rapidsai/raft/pull/1292)) [@cjnolet](https://github.com/cjnolet) +- support k up to 2048 in faiss select ([#1287](https://github.com/rapidsai/raft/pull/1287)) [@benfred](https://github.com/benfred) +- CI: Remove specification of manual stage for check_style.sh script. ([#1283](https://github.com/rapidsai/raft/pull/1283)) [@csadorf](https://github.com/csadorf) +- New Sparse Matrix APIs ([#1279](https://github.com/rapidsai/raft/pull/1279)) [@cjnolet](https://github.com/cjnolet) +- fix build on cuda 11.5 ([#1277](https://github.com/rapidsai/raft/pull/1277)) [@benfred](https://github.com/benfred) +- IVF-Flat index splitting ([#1271](https://github.com/rapidsai/raft/pull/1271)) [@lowener](https://github.com/lowener) +- Remove duplicate `librmm` runtime dependency ([#1264](https://github.com/rapidsai/raft/pull/1264)) [@ajschmidt8](https://github.com/ajschmidt8) +- build.sh: Add option to log nvcc compile times ([#1262](https://github.com/rapidsai/raft/pull/1262)) [@ahendriksen](https://github.com/ahendriksen) +- Reduce error handling verbosity in CI tests scripts ([#1259](https://github.com/rapidsai/raft/pull/1259)) [@AjayThorve](https://github.com/AjayThorve) +- Update shared workflow branches ([#1256](https://github.com/rapidsai/raft/pull/1256)) [@ajschmidt8](https://github.com/ajschmidt8) +- Keeping only compute similarity specializations for uint64_t for now ([#1255](https://github.com/rapidsai/raft/pull/1255)) [@cjnolet](https://github.com/cjnolet) +- Fix compile time explosion for minkowski distance ([#1254](https://github.com/rapidsai/raft/pull/1254)) [@ahendriksen](https://github.com/ahendriksen) +- Unpin `dask` and `distributed` for development ([#1253](https://github.com/rapidsai/raft/pull/1253)) [@galipremsagar](https://github.com/galipremsagar) +- Remove gpuCI scripts. 
([#1252](https://github.com/rapidsai/raft/pull/1252)) [@bdice](https://github.com/bdice) +- IVF-PQ: store cluster data in individual lists and reduce templates ([#1249](https://github.com/rapidsai/raft/pull/1249)) [@achirkin](https://github.com/achirkin) +- Fix inconsistency between the building doc and CMakeLists.txt ([#1248](https://github.com/rapidsai/raft/pull/1248)) [@yong-wang](https://github.com/yong-wang) +- Consolidating ANN benchmarks and tests ([#1243](https://github.com/rapidsai/raft/pull/1243)) [@cjnolet](https://github.com/cjnolet) +- mdspan view for IVF-PQ API ([#1236](https://github.com/rapidsai/raft/pull/1236)) [@viclafargue](https://github.com/viclafargue) +- Remove uint32 distance idx specializations ([#1235](https://github.com/rapidsai/raft/pull/1235)) [@cjnolet](https://github.com/cjnolet) +- Add innerproduct to the pairwise distance api ([#1226](https://github.com/rapidsai/raft/pull/1226)) [@benfred](https://github.com/benfred) +- Move date to build string in `conda` recipe ([#1223](https://github.com/rapidsai/raft/pull/1223)) [@ajschmidt8](https://github.com/ajschmidt8) +- Replace faiss bfKnn ([#1202](https://github.com/rapidsai/raft/pull/1202)) [@benfred](https://github.com/benfred) +- Expose KMeans `init_plus_plus` in pylibraft ([#1198](https://github.com/rapidsai/raft/pull/1198)) [@betatim](https://github.com/betatim) +- Fix `ucx-py` version ([#1184](https://github.com/rapidsai/raft/pull/1184)) [@ajschmidt8](https://github.com/ajschmidt8) +- Improve the performance of radix top-k ([#1175](https://github.com/rapidsai/raft/pull/1175)) [@yong-wang](https://github.com/yong-wang) +- Add docs build job ([#1168](https://github.com/rapidsai/raft/pull/1168)) [@AyodeAwe](https://github.com/AyodeAwe) +- Remove deprecated headers ([#1145](https://github.com/rapidsai/raft/pull/1145)) [@lowener](https://github.com/lowener) +- Simplify distance/detail to make it easier to dispatch to different kernel implementations ([#1142](https://github.com/rapidsai/raft/pull/1142)) [@ahendriksen](https://github.com/ahendriksen) +- Initial port of auto-find-k ([#1070](https://github.com/rapidsai/raft/pull/1070)) [@cjnolet](https://github.com/cjnolet) + # raft 23.02.00 (9 Feb 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index b77e906262..10cd7b16fc 100755 --- a/README.md +++ b/README.md @@ -146,7 +146,7 @@ in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) output = pairwise_distance(in1, in2, metric="euclidean") ``` -The `output` array in the above example is of type `raft.common.device_ndarray`, which supports [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2) making it interoperable with other libraries like CuPy, Numba, and PyTorch that also support it. CuPy supports DLPack, which also enables zero-copy conversion from `raft.common.device_ndarray` to JAX and Tensorflow. +The `output` array in the above example is of type `raft.common.device_ndarray`, which supports [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html#cuda-array-interface-version-2), making it interoperable with other libraries like CuPy, Numba, PyTorch, and RAPIDS cuDF that also support it. CuPy supports DLPack, which also enables zero-copy conversion from `raft.common.device_ndarray` to JAX and Tensorflow.
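For example, the DLPack route mentioned above might look like the following sketch (illustrative only, not part of this change; it assumes CuPy and JAX are installed and uses their public `toDlpack` and `jax.dlpack.from_dlpack` APIs):

```python
import cupy as cp
import jax.dlpack

# Zero-copy view of the RAFT output through __cuda_array_interface__
cupy_array = cp.asarray(output)

# Zero-copy hand-off to JAX through the DLPack protocol
jax_array = jax.dlpack.from_dlpack(cupy_array.toDlpack())
```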
Below is an example of converting the output `pylibraft.device_ndarray` to a CuPy array: ```python @@ -160,6 +160,12 @@ import torch torch_tensor = torch.as_tensor(output, device='cuda') ``` +Or converting to a RAPIDS cuDF dataframe: +```python +import cudf +cudf_dataframe = cudf.DataFrame(output) +``` + When the corresponding library has been installed and available in your environment, this conversion can also be done automatically by all RAFT compute APIs by setting a global configuration option: ```python import pylibraft.config @@ -198,7 +203,7 @@ RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https: The easiest way to install RAFT is through conda and several packages are provided. - `libraft-headers` RAFT headers -- `libraft` (optional) shared library of pre-compiled template specializations and runtime APIs. +- `libraft` (optional) shared library of pre-compiled template instantiations and runtime APIs. - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. @@ -231,11 +236,11 @@ You can find an [example RAFT](cpp/template/README.md) project template in the ` Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies. -| Component | Target | Description | Base Dependencies | -|-------------|---------------------|-----------------------------------------------------------|---------------------------------------| -| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | -| compiled | `raft::compiled` | Pre-compiled template specializations and runtime library | raft::raft | -| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | +| Component | Target | Description | Base Dependencies | +|-------------|---------------------|----------------------------------------------------------|----------------------------------------| +| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | +| compiled | `raft::compiled` | Pre-compiled template instantiations and runtime library | raft::raft | +| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | ### Source @@ -282,7 +287,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders: - `util`: Various reusable tools and utilities for accelerated algorithm development - `internal`: A private header-only component that hosts the code shared between benchmarks and tests. - `scripts`: Helpful scripts for development - - `src`: Compiled APIs and template specializations for the shared libraries + - `src`: Compiled APIs and template instantiations for the shared libraries - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT.
- `test`: Googletests source code - `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 5db6fa11be..e52beb22ea 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,7 +19,7 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER=$(rapids-get-rapids-version-from-git) +VERSION_NUMBER="23.06" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh index efc8f0c77c..fd6c2f929e 100755 --- a/ci/release/apply_wheel_modifications.sh +++ b/ci/release/apply_wheel_modifications.sh @@ -18,3 +18,8 @@ sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/pylibraft/pyproject.toml sed -i "s/^name = \"raft-dask\"/name = \"raft-dask${CUDA_SUFFIX}\"/g" python/raft-dask/pyproject.toml sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml sed -i "s/ucx-py/ucx-py${CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml + +if [[ $CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/pylibraft/pyproject.toml + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/pylibraft/pyproject.toml +fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index d8c22b4931..f6c6b08644 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -80,6 +80,7 @@ sed_runner "s/ucx-py.*\",/ucx-py==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\",/g" python for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done +sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 0e06076f1a..aae2aa3d15 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,13 +9,13 @@ channels: dependencies: - breathe - c-compiler -- clang-tools=11.1.0 -- clang=11.1.0 +- clang-tools=16.0.1 +- clang=16.0.1 - cmake>=3.23.1,!=3.25.0 - cuda-profiler-api=11.8.86 -- cuda-python >=11.7.1,<12.0 +- cuda-python>=11.7.1,<12.0 - cudatoolkit=11.8 -- cupy +- cupy>=12.0.0 - cxx-compiler - cython>=0.29,<0.30 - dask-core==2023.3.2 @@ -24,7 +24,9 @@ dependencies: - distributed==2023.3.2.1 - doxygen>=1.8.20 - gcc_linux-64=11.* +- gmock>=1.13.0 - graphviz +- gtest>=1.13.0 - ipython - joblib>=0.11 - libcublas-dev=11.11.3.6 @@ -45,7 +47,7 @@ dependencies: - pytest-cov - recommonmark - rmm==23.6.* -- scikit-build>=0.13.1 +- scikit-build>=0.13.1,<0.17.2 - scikit-learn - scipy - sphinx-copybutton diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 5965aaef8f..3ea560025e 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -8,8 +8,8 @@ channels: - nvidia dependencies: - c-compiler -- clang-tools=11.1.0 -- clang=11.1.0 +- clang-tools=16.0.1 +- clang=16.0.1 - cmake>=3.23.1,!=3.25.0 - cuda-profiler-api=11.8.86 - cudatoolkit=11.8 @@ -32,6 +32,6 @@ dependencies: - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 -- scikit-build>=0.13.1 +- 
scikit-build>=0.13.1,<0.17.2 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 2a66f213a7..bec773d26d 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -17,7 +17,7 @@ nccl_version: - ">=2.9.9" gtest_version: - - "=1.10.0" + - ">=1.13.0" glog_version: - ">=0.6.0" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 8ec9cc10c6..b89fcfb788 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -36,6 +36,7 @@ outputs: - SCCACHE_S3_KEY_PREFIX=libraft-aarch64 # [aarch64] - SCCACHE_S3_KEY_PREFIX=libraft-linux64 # [linux64] - SCCACHE_S3_USE_SSL + - SCCACHE_S3_NO_CREDENTIALS number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: @@ -52,6 +53,9 @@ outputs: host: - librmm ={{ minor_version }} - cudatoolkit {{ cuda_version }} + run: + - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + - librmm ={{ minor_version }} about: home: https://rapids.ai/ license: Apache-2.0 @@ -68,7 +72,6 @@ outputs: run: - {{ pin_subpackage('libraft-headers-only', exact=True) }} - cuda-profiler-api {{ cuda_profiler_api_run_version }} - - cudatoolkit {{ cuda_version }} - librmm ={{ minor_version }} - libcublas {{ libcublas_run_version }} - libcublas-dev {{ libcublas_run_version }} @@ -101,6 +104,7 @@ outputs: - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft-headers', exact=True) }} + - cudatoolkit {{ cuda_version }} - cuda-profiler-api {{ cuda_profiler_api_host_version }} - libcublas {{ libcublas_host_version }} - libcublas-dev {{ libcublas_host_version }} @@ -135,6 +139,7 @@ outputs: - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft', exact=True) }} + - cudatoolkit {{ cuda_version }} - cuda-profiler-api {{ cuda_profiler_api_host_version }} - gmock {{ gtest_version }} - gtest {{ gtest_version }} @@ -200,6 +205,7 @@ outputs: - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft', exact=True) }} + - cudatoolkit {{ cuda_version }} - libcublas {{ libcublas_host_version }} - libcublas-dev {{ libcublas_host_version }} - glog {{ glog_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6461492169..cddfa4b38d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -70,13 +70,11 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations" ${RAFT_COMPILE_LIBRARY_DEFAULT} ) - -# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs -# to have different values for the `Threads::Threads` target. Setting this flag ensures +# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to +# have different values for the `Threads::Threads` target. 
Setting this flag ensures # `Threads::Threads` is the same value across all builds so that cache hits occur set(THREADS_PREFER_PTHREAD_FLAG ON) - include(CMakeDependentOption) # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for # nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF ) @@ -265,180 +263,135 @@ set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled) if(RAFT_COMPILE_LIBRARY) add_library( raft_lib - src/distance/pairwise_distance.cu - src/distance/fused_l2_min_arg.cu - src/cluster/update_centroids_float.cu - src/cluster/update_centroids_double.cu - src/cluster/cluster_cost_float.cu - src/cluster/cluster_cost_double.cu - src/neighbors/refine_d_int64_t_float.cu - src/neighbors/refine_d_int64_t_int8_t.cu - src/neighbors/refine_d_int64_t_uint8_t.cu - src/neighbors/refine_h_int64_t_float.cu - src/neighbors/refine_h_int64_t_int8_t.cu - src/neighbors/refine_h_int64_t_uint8_t.cu - src/neighbors/specializations/refine_d_int64_t_float.cu - src/neighbors/specializations/refine_d_int64_t_int8_t.cu - src/neighbors/specializations/refine_d_int64_t_uint8_t.cu - src/neighbors/specializations/refine_h_int64_t_float.cu - src/neighbors/specializations/refine_h_int64_t_int8_t.cu - src/neighbors/specializations/refine_h_int64_t_uint8_t.cu - src/cluster/kmeans_fit_float.cu - src/cluster/kmeans_fit_double.cu - src/cluster/kmeans_init_plus_plus_double.cu - src/cluster/kmeans_init_plus_plus_float.cu - src/distance/specializations/detail/canberra_double_double_double_int.cu - src/distance/specializations/detail/canberra_float_float_float_int.cu - src/distance/specializations/detail/correlation_double_double_double_int.cu - src/distance/specializations/detail/correlation_float_float_float_int.cu - src/distance/specializations/detail/cosine_double_double_double_int.cu - src/distance/specializations/detail/cosine_float_float_float_int.cu - src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu - src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu - src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu - src/distance/specializations/detail/inner_product_float_float_float_int.cu - src/distance/specializations/detail/inner_product_double_double_double_int.cu - src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu - src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu - src/distance/specializations/detail/kernels/gram_matrix_base_double.cu - src/distance/specializations/detail/kernels/gram_matrix_base_float.cu - src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu - src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu - # These are somehow missing a kernel definition which is causing a compile error. 
- # src/distance/specializations/detail/kernels/rbf_kernel_double.cu - # src/distance/specializations/detail/kernels/rbf_kernel_float.cu - src/neighbors/brute_force_knn_int64_t_float.cu - src/distance/specializations/detail/kernels/tanh_kernel_double.cu - src/distance/specializations/detail/kernels/tanh_kernel_float.cu - src/distance/specializations/detail/kl_divergence_float_float_float_int.cu - src/distance/specializations/detail/kl_divergence_double_double_double_int.cu - src/distance/specializations/detail/l1_float_float_float_int.cu - src/distance/specializations/detail/l1_double_double_double_int.cu - src/distance/specializations/detail/l2_expanded_float_float_float_int.cu - src/distance/specializations/detail/l2_expanded_double_double_double_int.cu - src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu - src/distance/specializations/detail/l_inf_double_double_double_int.cu - src/distance/specializations/detail/l_inf_float_float_float_int.cu - src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu - src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu - src/distance/specializations/detail/russel_rao_double_double_double_int.cu - src/distance/specializations/detail/russel_rao_float_float_float_int.cu - src/distance/specializations/fused_l2_nn_double_int.cu - src/distance/specializations/fused_l2_nn_double_int64.cu - src/distance/specializations/fused_l2_nn_float_int.cu - src/distance/specializations/fused_l2_nn_float_int64.cu - src/matrix/specializations/detail/select_k_float_uint32_t.cu - src/matrix/specializations/detail/select_k_float_int64_t.cu - src/matrix/specializations/detail/select_k_half_uint32_t.cu - src/matrix/specializations/detail/select_k_half_int64_t.cu - src/neighbors/ivfpq_build.cu - src/neighbors/ivfpq_deserialize.cu - src/neighbors/ivfpq_serialize.cu + src/core/logger.cpp + src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu + 
src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_rbf.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu + src/distance/distance.cu + src/distance/fused_l2_nn.cu + src/linalg/detail/coalesced_reduction.cu + src/matrix/detail/select_k_double_int64_t.cu + src/matrix/detail/select_k_double_uint32_t.cu + src/matrix/detail/select_k_float_int64_t.cu + src/matrix/detail/select_k_float_uint32_t.cu + src/matrix/detail/select_k_half_int64_t.cu + src/matrix/detail/select_k_half_uint32_t.cu + src/neighbors/ball_cover.cu + src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_int64_t.cu + src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu + src/neighbors/brute_force_knn_int_float_int.cu + src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu + src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu + src/neighbors/detail/ivf_flat_search.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu + src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu + src/neighbors/detail/selection_faiss_int32_t_float.cu + src/neighbors/detail/selection_faiss_int_double.cu + src/neighbors/detail/selection_faiss_long_float.cu + src/neighbors/detail/selection_faiss_size_t_double.cu + src/neighbors/detail/selection_faiss_size_t_float.cu + src/neighbors/detail/selection_faiss_uint32_t_float.cu + src/neighbors/ivf_flat_build_float_int64_t.cu + src/neighbors/ivf_flat_build_int8_t_int64_t.cu + src/neighbors/ivf_flat_build_uint8_t_int64_t.cu + src/neighbors/ivf_flat_extend_float_int64_t.cu + src/neighbors/ivf_flat_extend_int8_t_int64_t.cu + src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu + src/neighbors/ivf_flat_search_float_int64_t.cu + src/neighbors/ivf_flat_search_int8_t_int64_t.cu + src/neighbors/ivf_flat_search_uint8_t_int64_t.cu + src/neighbors/ivfpq_build_float_int64_t.cu + src/neighbors/ivfpq_build_int8_t_int64_t.cu + src/neighbors/ivfpq_build_uint8_t_int64_t.cu + src/neighbors/ivfpq_extend_float_int64_t.cu + src/neighbors/ivfpq_extend_int8_t_int64_t.cu + src/neighbors/ivfpq_extend_uint8_t_int64_t.cu src/neighbors/ivfpq_search_float_int64_t.cu src/neighbors/ivfpq_search_int8_t_int64_t.cu src/neighbors/ivfpq_search_uint8_t_int64_t.cu - src/neighbors/specializations/ivfpq_build_float_int64_t.cu - src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu - 
src/neighbors/specializations/ivfpq_extend_float_int64_t.cu - src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu - src/neighbors/specializations/ivfpq_search_float_int64_t.cu - src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu - src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu - src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu - src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu - src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu - src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu - src/random/rmat_rectangular_generator_int_double.cu - src/random/rmat_rectangular_generator_int64_double.cu - src/random/rmat_rectangular_generator_int_float.cu - src/random/rmat_rectangular_generator_int64_float.cu - src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu - src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu - src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu - src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu - src/neighbors/specializations/ball_cover_all_knn_query.cu - src/neighbors/specializations/ball_cover_build_index.cu - src/neighbors/specializations/ball_cover_knn_query.cu - src/neighbors/specializations/fused_l2_knn_long_float_true.cu - src/neighbors/specializations/fused_l2_knn_long_float_false.cu - src/neighbors/specializations/fused_l2_knn_int_float_true.cu - src/neighbors/specializations/fused_l2_knn_int_float_false.cu - src/neighbors/ivf_flat_search.cu - src/neighbors/ivf_flat_build.cu - src/neighbors/specializations/ivfflat_build_float_int64_t.cu - src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu - 
src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu - src/neighbors/specializations/ivfflat_extend_float_int64_t.cu - src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu - src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu - src/neighbors/specializations/ivfflat_search_float_int64_t.cu - src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu - src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu - src/neighbors/ivfpq_build.cu - src/neighbors/ivfpq_deserialize.cu - src/neighbors/ivfpq_serialize.cu - src/neighbors/ivfpq_search_float_int64_t.cu - src/neighbors/ivfpq_search_int8_t_int64_t.cu - src/neighbors/ivfpq_search_uint8_t_int64_t.cu - src/neighbors/specializations/ivfpq_build_float_int64_t.cu - src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu - src/neighbors/specializations/ivfpq_extend_float_int64_t.cu - src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu - src/neighbors/specializations/ivfpq_search_float_int64_t.cu - src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu - src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu - src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu - src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu - src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu - src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu - src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu - src/random/rmat_rectangular_generator_int_double.cu - src/random/rmat_rectangular_generator_int64_double.cu - src/random/rmat_rectangular_generator_int_float.cu - src/random/rmat_rectangular_generator_int64_float.cu + src/neighbors/refine_float_float.cu + src/neighbors/refine_int8_t_float.cu + src/neighbors/refine_uint8_t_float.cu + src/raft_runtime/cluster/cluster_cost.cuh + src/raft_runtime/cluster/cluster_cost_double.cu + src/raft_runtime/cluster/cluster_cost_float.cu + src/raft_runtime/cluster/kmeans_fit_double.cu + src/raft_runtime/cluster/kmeans_fit_float.cu + 
src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu + src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu + src/raft_runtime/cluster/update_centroids.cuh + src/raft_runtime/cluster/update_centroids_double.cu + src/raft_runtime/cluster/update_centroids_float.cu + src/raft_runtime/distance/fused_l2_min_arg.cu + src/raft_runtime/distance/pairwise_distance.cu + src/raft_runtime/matrix/select_k_float_int64_t.cu + src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu + src/raft_runtime/neighbors/ivf_flat_build.cu + src/raft_runtime/neighbors/ivf_flat_search.cu + src/raft_runtime/neighbors/ivfpq_build.cu + src/raft_runtime/neighbors/ivfpq_deserialize.cu + src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu + src/raft_runtime/neighbors/ivfpq_serialize.cu + src/raft_runtime/neighbors/refine_d_int64_t_float.cu + src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_float.cu + src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu + src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu + src/raft_runtime/random/rmat_rectangular_generator_int_double.cu + src/raft_runtime/random/rmat_rectangular_generator_int_float.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu + src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu + src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu + src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu + src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu + src/util/memory_pool.cpp ) set_target_properties( raft_lib @@ -464,7 +417,13 @@ if(RAFT_COMPILE_LIBRARY) raft_lib PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" ) - target_compile_definitions(raft_lib INTERFACE "RAFT_COMPILED") + + # RAFT_COMPILED is set during compilation of libraft.so as well as downstream libraries (due to + # "PUBLIC") + target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED") + + # RAFT_EXPLICIT_INSTANTIATE_ONLY is set during compilation of libraft.so (due to "PRIVATE") + target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") # ensure CUDA symbols aren't relocated to the middle of the debug build binaries target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index b4d8fbeee3..c34b95010f 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ #ifdef NVTX -#include +#include #endif #include diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h index 1244935c99..46dd66d649 100644 --- a/cpp/bench/ann/src/common/dataset.h +++ b/cpp/bench/ann/src/common/dataset.h @@ -47,7 +47,7 @@ class BinFile { uint32_t subset_first_row = 0, uint32_t subset_size = 0); ~BinFile() { fclose(fp_); } - BinFile(const BinFile&) = delete; + BinFile(const BinFile&) = delete; BinFile& operator=(const BinFile&) = delete; void get_shape(size_t* nrows, int* ndims) @@ -219,7 +219,7 @@ class Dataset { Dataset(const std::string& name, const std::string& distance) : name_(name), distance_(distance) { } - Dataset(const Dataset&) = delete; + Dataset(const Dataset&) = delete; Dataset& operator=(const Dataset&) = delete; virtual ~Dataset(); diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index d8e98ce2a9..baff1b1c45 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -22,10 +22,6 @@ #include #include -#ifdef RAFT_COMPILED -#include -#endif - #include "../common/ann_types.hpp" #include "../common/benchmark_util.hpp" #undef WARP_SIZE diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu index ff108080b5..bcd23723a4 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu @@ -15,12 +15,8 @@ */ #include "raft_ivf_flat_wrapper.h" -#ifdef RAFT_COMPILED -#include -#endif - namespace raft::bench::ann { template class RaftIvfFlatGpu; template class RaftIvfFlatGpu; template class RaftIvfFlatGpu; -} // namespace raft::bench::ann \ No newline at end of file +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 8b2a7d329b..0a80eef1b5 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu index 338bc9a32f..2efe14631b 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq.cu +++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu @@ -15,10 +15,6 @@ */ #include "raft_ivf_pq_wrapper.h" -#ifdef RAFT_COMPILED -#include -#endif - namespace raft::bench::ann { template class RaftIvfPQ; template class RaftIvfPQ; diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 70dff81847..517272e6cf 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -42,6 +42,7 @@ template class RaftIvfPQ : public ANN { public: using typename ANN::AnnSearchParam; + using ANN::dim_; struct SearchParam : public AnnSearchParam { raft::neighbors::ivf_pq::search_params pq_param; @@ -118,7 +119,7 @@ void RaftIvfPQ::load(const std::string& file) template void RaftIvfPQ::build(const T* dataset, size_t nrow, cudaStream_t) { - auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), index_->dim()); + auto dataset_v = raft::make_device_matrix_view(dataset, IdxT(nrow), dim_); index_.emplace(raft::runtime::neighbors::ivf_pq::build(handle_, index_params_, dataset_v)); return; diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index f6499623dd..505ca32886 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -17,7 +17,7 @@ 
function(ConfigureBench) - set(options OPTIONAL LIB) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) set(oneValueArgs NAME) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -55,6 +55,10 @@ function(ConfigureBench) "$<$:${RAFT_CUDA_FLAGS}>" ) + if(ConfigureBench_EXPLICIT_INSTANTIATE_ONLY) + target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") + endif() + target_include_directories( ${BENCH_NAME} PUBLIC "$" ) @@ -71,7 +75,7 @@ endfunction() if(BUILD_PRIMS_BENCH) ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu - bench/prims/main.cpp OPTIONAL LIB + bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( @@ -93,6 +97,7 @@ if(BUILD_PRIMS_BENCH) bench/prims/main.cpp OPTIONAL LIB + EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( @@ -112,7 +117,7 @@ if(BUILD_PRIMS_BENCH) ConfigureBench( NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu - bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB + bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( @@ -139,5 +144,6 @@ if(BUILD_PRIMS_BENCH) bench/prims/main.cpp OPTIONAL LIB + EXPLICIT_INSTANTIATE_ONLY ) endif() diff --git a/cpp/bench/prims/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu index af7afb8037..3147960f72 100644 --- a/cpp/bench/prims/cluster/kmeans.cu +++ b/cpp/bench/prims/cluster/kmeans.cu @@ -18,10 +18,6 @@ #include #include -#if defined RAFT_COMPILED -#include -#endif - namespace raft::bench::cluster { struct KMeansBenchParams { diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu index 6bda43bdb2..42a8f7967c 100644 --- a/cpp/bench/prims/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -18,10 +18,6 @@ #include #include -#if defined RAFT_COMPILED -#include -#endif - namespace raft::bench::cluster { struct KMeansBalancedBenchParams { diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp index 4b6e1ba286..1e783eb338 100644 --- a/cpp/bench/prims/common/benchmark.hpp +++ b/cpp/bench/prims/common/benchmark.hpp @@ -113,8 +113,19 @@ class fixture { raft::device_resources handle; rmm::cuda_stream_view stream; - fixture() : stream{handle.get_stream()} + fixture(bool use_pool_memory_resource = false) : stream{handle.get_stream()} { + // Cache the memory pool between benchmark runs, since it is expensive to create. + // This speeds up the time required to run the select_k bench by over 3x. + // This is part of the fixture class here so that the pool will get cleaned + // up, rather than outliving the benchmarks that require it.
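+ // A benchmark opts in by passing `true` from its own constructor, as
+ // select_k.cu does later in this diff, e.g. (sketch only; the real
+ // constructor also initializes its device buffers):
+ //   explicit selection(const select::params& p) : fixture(true), params_(p) {...}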
+ static std::unique_ptr<using_pool_memory_res> memory_pool; + if (use_pool_memory_resource) { + if (!memory_pool) { memory_pool.reset(new using_pool_memory_res()); } + } else if (memory_pool) { + memory_pool.reset(); + } + int l2_cache_size = 0; int device_id = 0; RAFT_CUDA_TRY(cudaGetDevice(&device_id)); diff --git a/cpp/bench/prims/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh index 9b5d67a46f..dff3401b62 100644 --- a/cpp/bench/prims/distance/distance_common.cuh +++ b/cpp/bench/prims/distance/distance_common.cuh @@ -17,9 +17,6 @@ #include #include #include -#if defined RAFT_COMPILED -#include -#endif #include namespace raft::bench::distance { diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu index 1c45572782..24c0cbf8f9 100644 --- a/cpp/bench/prims/distance/fused_l2_nn.cu +++ b/cpp/bench/prims/distance/fused_l2_nn.cu @@ -16,10 +16,8 @@ #include #include +#include #include -#if defined RAFT_COMPILED -#include -#endif #include namespace raft::bench::distance { diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu index 4407bdcf83..53d97c1fc7 100644 --- a/cpp/bench/prims/distance/kernels.cu +++ b/cpp/bench/prims/distance/kernels.cu @@ -13,10 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#if defined RAFT_COMPILED -#include -#endif - #include #include #include diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu index f9f234187d..c804ecb3a1 100644 --- a/cpp/bench/prims/distance/masked_nn.cu +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -30,10 +30,6 @@ #include #include -#ifdef RAFT_COMPILED -#include -#endif - namespace raft::bench::distance::masked_nn { // Introduce various sparsity patterns diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu index 870119db52..d0bc993cc1 100644 --- a/cpp/bench/prims/matrix/select_k.cu +++ b/cpp/bench/prims/matrix/select_k.cu @@ -23,10 +23,6 @@ #include #include -#if defined RAFT_COMPILED -#include -#endif - #include #include #include @@ -46,7 +42,8 @@ using namespace raft::bench; // NOLINT template struct selection : public fixture { explicit selection(const select::params& p) - : params_(p), + : fixture(true), + params_(p), in_dists_(p.batch_size * p.len, stream), in_ids_(p.batch_size * p.len, stream), out_dists_(p.batch_size * p.k, stream), @@ -76,7 +73,6 @@ struct selection : public fixture { void run_benchmark(::benchmark::State& state) override // NOLINT { device_resources handle{stream}; - using_pool_memory_res res; try { std::ostringstream label_stream; label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; @@ -157,34 +153,33 @@ const std::vector kInputs{ {10, 1000000, 256, true, false, true}, }; -#define SELECTION_REGISTER(KeyT, IdxT, A) \ - namespace BENCHMARK_PRIVATE_NAME(selection) \ - { \ - using SelectK = selection; \ - RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace BENCHMARK_PRIVATE_NAME(selection) { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ } -SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT -SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT -SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT -SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT -SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT +SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass); // NOLINT SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT -SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT -SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT -SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT -SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT } // namespace raft::matrix diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index 8f0b1cb5d9..8239fa4f89 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -24,10 +24,6 @@ #include #include -#if defined RAFT_COMPILED -#include -#endif - #include #include @@ -226,7 +222,8 @@ struct brute_force_knn { template struct knn : public fixture { explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope) - : params_(p), + : fixture(true), + params_(p), strategy_(strategy), scope_(scope), dev_mem_res_(strategy == TransferStrategy::MANAGED), @@ -278,8 +275,6 @@ struct knn : public fixture { "device (TransferStrategy::NO_COPY)"); } - using_pool_memory_res default_resource; - try { std::ostringstream label_stream; label_stream << params_ << "#" << strategy_ << "#" << scope_; @@ -384,11 +379,10 @@ inline const std::vector kNoCopyOnly{TransferStrategy::NO_COPY inline const std::vector kScopeFull{Scope::BUILD_SEARCH}; inline const std::vector kAllScopes{Scope::BUILD_SEARCH, Scope::SEARCH, Scope::BUILD}; -#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope) \ - namespace BENCHMARK_PRIVATE_NAME(knn) \ - { \ - using KNN = knn>; \ - RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" #ImplT, inputs, strats, scope); \ +#define KNN_REGISTER(ValT, IdxT, ImplT, inputs, strats, scope) \ + namespace BENCHMARK_PRIVATE_NAME(knn) { \ + using KNN = knn>; \ + RAFT_BENCH_REGISTER(KNN, #ValT "/" #IdxT "/" #ImplT, inputs, strats, scope); \ } } // namespace 
raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/refine_float_int64_t.cu b/cpp/bench/prims/neighbors/refine_float_int64_t.cu index 43be330e9b..bbedc1ae64 100644 --- a/cpp/bench/prims/neighbors/refine_float_int64_t.cu +++ b/cpp/bench/prims/neighbors/refine_float_int64_t.cu @@ -17,11 +17,6 @@ #include "refine.cuh" #include -#if defined RAFT_COMPILED -#include -#include -#endif - using namespace raft::neighbors; namespace raft::bench::neighbors { diff --git a/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu index 1d7cb8c8aa..4952361f03 100644 --- a/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu +++ b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu @@ -17,10 +17,6 @@ #include "refine.cuh" #include -#if defined RAFT_COMPILED -#include -#endif - using namespace raft::neighbors; namespace raft::bench::neighbors { diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index c733d46985..ea8a077b0c 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -17,8 +17,16 @@ if(DISABLE_DEPRECATION_WARNINGS) list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() +# Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with +# clang) if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) + + # set warnings as errors + if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) + list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) + endif() endif() if(CUDA_LOG_COMPILE_TIME) @@ -31,12 +39,6 @@ list(APPEND RAFT_CUDA_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") # make sure we produce smallest binary size list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all) -# set warnings as errors -if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) - list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) -endif() -list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) - # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking if(CUDA_ENABLE_LINEINFO) diff --git a/cpp/cmake/thirdparty/get_glog.cmake b/cpp/cmake/thirdparty/get_glog.cmake index 9334224de5..35a9170f99 100644 --- a/cpp/cmake/thirdparty/get_glog.cmake +++ b/cpp/cmake/thirdparty/get_glog.cmake @@ -26,7 +26,6 @@ function(find_and_configure_glog) CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/glog.git GIT_TAG ${PKG_PINNED_TAG} - SOURCE_SUBDIR cpp EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} ) @@ -46,4 +45,4 @@ find_and_configure_glog(VERSION 0.6.0 FORK google PINNED_TAG v0.6.0 EXCLUDE_FROM_ALL ON - ) \ No newline at end of file + ) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 17a1e0caca..1948169c91 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -918,6 +918,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* +# TODO: remove specializations from exclude patterns when headers have been removed. 
EXCLUDE_PATTERNS = */detail/* \ */specializations/* \ */thirdparty/* diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index 3d23c809c3..eb89ebe402 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -436,7 +436,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY)) adjust_centers_kernel(MathT* centers, // [n_clusters, dim] IdxT n_clusters, IdxT dim, - const T* dataset, // [n_rows, dim] + const T* dataset, // [n_rows, dim] IdxT n_rows, const LabelT* labels, // [n_rows] const CounterT* cluster_sizes, // [n_clusters] @@ -976,7 +976,7 @@ void build_hierarchical(const raft::device_resources& handle, raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size)); if (pool_guard) { RAFT_LOG_DEBUG("build_hierarchical: using pool memory resource with initial size %zu bytes", - pool_guard->pool_size()); + mem_per_row * size_t(max_minibatch_size)); } // Precompute the L2 norm of the dataset if relevant. diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh index 76fc22e99e..cca1cbb6e9 100644 --- a/cpp/include/raft/cluster/detail/kmeans_common.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp index 9a4fcfef60..cd815622bf 100644 --- a/cpp/include/raft/cluster/single_linkage_types.hpp +++ b/cpp/include/raft/cluster/single_linkage_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,9 +77,7 @@ class linkage_output { } }; -class linkage_output_int : public linkage_output { -}; -class linkage_output_int64 : public linkage_output { -}; +class linkage_output_int : public linkage_output {}; +class linkage_output_int64 : public linkage_output {}; }; // namespace raft::cluster diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh index 9b68d7adc9..9588a7f329 100644 --- a/cpp/include/raft/cluster/specializations.cuh +++ b/cpp/include/raft/cluster/specializations.cuh @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __CLUSTER_SPECIALIZATIONS_H -#define __CLUSTER_SPECIALIZATIONS_H - #pragma once -#include -#include - -#endif \ No newline at end of file +#pragma message( \ + __FILE__ \ + " is deprecated and will be removed." \ + " Including specializations is not necessary any more." \ + " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index e80d7cccd9..dd8fc2d103 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,9 +24,9 @@ #pragma once -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please note that there is no equivalent in RAFT's public API" +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please note that there is no equivalent in RAFT's public API" " so this file will eventually be removed altogether.") #include diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index f3cfbd81cc..6c62cd70cc 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,8 @@ #pragma once -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please use the raft/util version instead.") +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/util version instead.") #include diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 0e83f9a5cd..72de79a596 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,8 @@ #pragma once -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please use the raft/matrix version instead.") +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/matrix version instead.") #include diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp index 633c8dd3e1..433b032b0f 100644 --- a/cpp/include/raft/common/seive.hpp +++ b/cpp/include/raft/common/seive.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,8 @@ #pragma once -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please use the raft/util version instead.") +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." \ + " Please use the raft/util version instead.") #include diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp index efab8a1601..a5f7c05493 100644 --- a/cpp/include/raft/core/coo_matrix.hpp +++ b/cpp/include/raft/core/coo_matrix.hpp @@ -71,12 +71,6 @@ class coordinate_structure_view { } - /** - * Create a view from this view. 
Note that this is for interface compatibility - * @return - */ - view_type view() { return view_type(rows_, cols_, this->get_n_rows(), this->get_n_cols()); } - /** * Return span containing underlying rows array * @return span containing underlying rows array @@ -209,6 +203,10 @@ class coo_matrix_view coordinate_structure_view, is_device> { public: + using element_type = ElementType; + using row_type = RowType; + using col_type = ColType; + using nnz_type = NZType; coo_matrix_view(raft::span element_span, coordinate_structure_view structure_view) : sparse_matrix_view { public: using element_type = ElementType; + using row_type = RowType; + using col_type = ColType; + using nnz_type = NZType; using structure_view_type = typename structure_type::view_type; using container_type = typename ContainerPolicy::container_type; using sparse_matrix_type = @@ -258,14 +259,9 @@ class coo_matrix // Constructor that owns the data but not the structure template > - coo_matrix(raft::resources const& handle, std::shared_ptr structure) noexcept( + coo_matrix(raft::resources const& handle, structure_type structure) noexcept( std::is_nothrow_default_constructible_v) : sparse_matrix_type(handle, structure){}; - /** - * Return a view of the structure underlying this matrix - * @return - */ - structure_view_type structure_view() { return this->structure_.get()->view(); } /** * Initialize the sparsity on this instance if it was not known upon construction @@ -277,7 +273,20 @@ class coo_matrix void initialize_sparsity(NZType nnz) { sparse_matrix_type::initialize_sparsity(nnz); - this->structure_.get()->initialize_sparsity(nnz); + this->structure_.initialize_sparsity(nnz); + } + + /** + * Return a view of the structure underlying this matrix + * @return + */ + structure_view_type structure_view() + { + if constexpr (get_sparsity_type() == SparsityType::OWNING) { + return this->structure_.view(); + } else { + return this->structure_; + } } }; } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/csr_matrix.hpp b/cpp/include/raft/core/csr_matrix.hpp index fac656b3f9..95d09d3eea 100644 --- a/cpp/include/raft/core/csr_matrix.hpp +++ b/cpp/include/raft/core/csr_matrix.hpp @@ -87,12 +87,6 @@ class compressed_structure_view */ span get_indices() override { return indices_; } - /** - * Create a view from this view. 
Note that this is for interface compatibility - * @return - */ - view_type view() { return view_type(indptr_, indices_, this->get_n_cols()); } - protected: raft::span indptr_; raft::span indices_; @@ -147,7 +141,7 @@ class compressed_structure constexpr auto operator=(compressed_structure const&) noexcept( std::is_nothrow_copy_assignable::value) -> compressed_structure& = default; - constexpr auto operator =(compressed_structure&&) noexcept( + constexpr auto operator=(compressed_structure&&) noexcept( std::is_nothrow_move_assignable::value) -> compressed_structure& = default; @@ -221,6 +215,10 @@ class csr_matrix_view compressed_structure_view, is_device> { public: + using element_type = ElementType; + using indptr_type = IndptrType; + using indices_type = IndicesType; + using nnz_type = NZType; csr_matrix_view( raft::span element_span, compressed_structure_view structure_view) @@ -249,6 +247,9 @@ class csr_matrix ContainerPolicy> { public: using element_type = ElementType; + using indptr_type = IndptrType; + using indices_type = IndicesType; + using nnz_type = NZType; using structure_view_type = typename structure_type::view_type; static constexpr auto get_sparsity_type() { return sparsity_type; } using sparse_matrix_type = @@ -271,7 +272,7 @@ class csr_matrix template > - csr_matrix(raft::resources const& handle, std::shared_ptr structure) noexcept( + csr_matrix(raft::resources const& handle, structure_type structure) noexcept( std::is_nothrow_default_constructible_v) : sparse_matrix_type(handle, structure){}; @@ -284,13 +285,20 @@ class csr_matrix void initialize_sparsity(NZType nnz) { sparse_matrix_type::initialize_sparsity(nnz); - this->structure_.get()->initialize_sparsity(nnz); + this->structure_.initialize_sparsity(nnz); } /** * Return a view of the structure underlying this matrix * @return */ - structure_view_type structure_view() { return this->structure_.get()->view(); } + structure_view_type structure_view() + { + if constexpr (get_sparsity_type() == SparsityType::OWNING) { + return this->structure_.view(); + } else { + return this->structure_; + } + } }; } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp index 855c1228f7..5c56240ccf 100644 --- a/cpp/include/raft/core/cublas_macros.hpp +++ b/cpp/include/raft/core/cublas_macros.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ #include ///@todo: enable this once we have logger enabled -//#include +// #include #include diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp index 8f7caf65f3..4477d32118 100644 --- a/cpp/include/raft/core/cusolver_macros.hpp +++ b/cpp/include/raft/core/cusolver_macros.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ #include #include ///@todo: enable this once logging is enabled -//#include +// #include #include #include diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp index 8a9aab55f7..21a25ae28c 100644 --- a/cpp/include/raft/core/cusparse_macros.hpp +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include ///@todo: enable this once logging is enabled -//#include +// #include #define _CUSPARSE_ERR_TO_STR(err) \ case err: return #err; diff --git a/cpp/include/raft/core/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp index 619fb89452..532aee4d90 100644 --- a/cpp/include/raft/core/detail/logger.hpp +++ b/cpp/include/raft/core/detail/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,8 @@ */ #pragma once -#pragma message(__FILE__ \ - " is deprecated and will be removed in future releases." \ - " Please use the version instead.") +#pragma message(__FILE__ \ + " is deprecated and will be removed in future releases." \ + " Please use the version instead.") #include diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index bfb47437ad..390acea697 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,40 @@ #define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE #endif +// The RAFT_INLINE_CONDITIONAL macro is a conditional inline specifier that removes +// the inline specification when RAFT_COMPILED is defined. +// +// When RAFT_COMPILED is not defined, functions may be defined in multiple +// translation units and we do not want that to lead to linker errors. +// +// When RAFT_COMPILED is defined, this serves two purposes: +// +// 1. It triggers a multiple definition error message when memory_pool-inl.hpp +// (for instance) is accidentally included in multiple translation units. +// +// 2. We want function definitions to be non-inline, because non-inline function +// symbols are always exported in the object symbol table. For inline functions, +// the compiler may elide the external symbol, which results in linker errors. +#ifdef RAFT_COMPILED +#define RAFT_INLINE_CONDITIONAL +#else +#define RAFT_INLINE_CONDITIONAL inline +#endif // RAFT_COMPILED + +// The RAFT_WEAK_FUNCTION specifies that: +// +// 1. A function may be defined in multiple translation units (like inline) +// +// 2. It must still emit an external symbol (unlike inline). This enables declaring +// a function signature in an `-ext` header and defining it in a source file. +// +// From +// https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes: +// +// "The weak attribute causes a declaration of an external symbol to be emitted +// as a weak symbol rather than a global."
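+//
+// A minimal sketch of the pattern this enables, with a hypothetical function `foo`:
+//
+//   // foo-ext.hpp: declaration only
+//   RAFT_WEAK_FUNCTION bool foo();
+//
+//   // foo-inl.hpp: definition; every translation unit that includes this emits
+//   // a weak symbol, and the linker keeps exactly one without raising a
+//   // multiple-definition error
+//   RAFT_WEAK_FUNCTION bool foo() { return true; }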
+#define RAFT_WEAK_FUNCTION __attribute__((weak)) + /** * Some macro magic to remove optional parentheses of a macro argument. * See https://stackoverflow.com/a/62984543 diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp index df89811636..d0aea4168e 100644 --- a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp +++ b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp @@ -74,7 +74,7 @@ namespace numpy_serializer { #if RAFT_SYSTEM_LITTLE_ENDIAN == 1 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR -#else // RAFT_SYSTEM_LITTLE_ENDIAN == 1 +#else // RAFT_SYSTEM_LITTLE_ENDIAN == 1 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR #endif // RAFT_SYSTEM_LITTLE_ENDIAN == 1 @@ -110,11 +110,9 @@ struct header_t { }; template -struct is_complex : std::false_type { -}; +struct is_complex : std::false_type {}; template -struct is_complex> : std::true_type { -}; +struct is_complex> : std::true_type {}; template , bool> = true> inline dtype_t get_numpy_dtype() diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp index 4a16ec81bd..e734c99029 100644 --- a/cpp/include/raft/core/detail/nvtx.hpp +++ b/cpp/include/raft/core/detail/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,18 +18,18 @@ #include -namespace raft::common::nvtx::detail { - #ifdef NVTX_ENABLED #include #include #include -#include +#include #include #include #include +namespace raft::common::nvtx::detail { + /** * @brief An internal struct to store associated state with the color * generator @@ -191,7 +191,11 @@ inline void pop_range() nvtxDomainRangePop(domain_store::value()); } -#else // NVTX_ENABLED +} // namespace raft::common::nvtx::detail + +#else // NVTX_ENABLED + +namespace raft::common::nvtx::detail { template inline void push_range(const char* format, Args... args) @@ -203,6 +207,6 @@ inline void pop_range() { } -#endif // NVTX_ENABLED - } // namespace raft::common::nvtx::detail + +#endif // NVTX_ENABLED diff --git a/cpp/include/raft/core/detail/span.hpp b/cpp/include/raft/core/detail/span.hpp index 20500d618b..e6ccb8535c 100644 --- a/cpp/include/raft/core/detail/span.hpp +++ b/cpp/include/raft/core/detail/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,7 @@ template struct extent_value_t : public std::integral_constant< std::size_t, - Count != dynamic_extent ? Count : (Extent != dynamic_extent ? Extent - Offset : Extent)> { -}; + Count != dynamic_extent ? Count : (Extent != dynamic_extent ? Extent - Offset : Extent)> {}; /*! * If N is dynamic_extent, the extent of the returned span E is also @@ -47,31 +46,25 @@ struct extent_value_t template struct extent_as_bytes_value_t : public std::integral_constant { -}; + Extent == dynamic_extent ? 
Extent : sizeof(T) * Extent> {}; template struct is_allowed_extent_conversion_t : public std::integral_constant { -}; + From == To || From == dynamic_extent || To == dynamic_extent> {}; template struct is_allowed_element_type_conversion_t - : public std::integral_constant::value> { -}; + : public std::integral_constant::value> {}; template -struct is_span_oracle_t : std::false_type { -}; +struct is_span_oracle_t : std::false_type {}; template -struct is_span_oracle_t> : std::true_type { -}; +struct is_span_oracle_t> : std::true_type {}; template -struct is_span_t : public is_span_oracle_t::type> { -}; +struct is_span_t : public is_span_oracle_t::type> {}; template _RAFT_HOST_DEVICE constexpr auto lexicographical_compare(InputIt1 first1, diff --git a/cpp/include/raft/core/device_coo_matrix.hpp b/cpp/include/raft/core/device_coo_matrix.hpp index b1e9ca30fc..ce016dd5e0 100644 --- a/cpp/include/raft/core/device_coo_matrix.hpp +++ b/cpp/include/raft/core/device_coo_matrix.hpp @@ -79,8 +79,7 @@ template using device_coordinate_structure_view = coordinate_structure_view; template -struct is_device_coo_matrix : std::false_type { -}; +struct is_device_coo_matrix : std::false_type {}; template struct is_device_coo_matrix< device_coo_matrix> - : std::true_type { -}; + : std::true_type {}; template constexpr bool is_device_coo_matrix_v = is_device_coo_matrix::value; @@ -174,16 +172,15 @@ auto make_device_coo_matrix(raft::resources const& handle, * @tparam ColType * @tparam NZType * @param[in] handle raft handle for managing expensive device resources - * @param[in] structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return a sparsity-preserving sparse matrix in coordinate (coo) format */ template auto make_device_coo_matrix(raft::resources const& handle, - device_coordinate_structure_view structure_) + device_coordinate_structure_view structure) { - return device_sparsity_preserving_coo_matrix( - handle, - std::make_shared>(structure_)); + return device_sparsity_preserving_coo_matrix(handle, + structure); } /** @@ -212,16 +209,15 @@ auto make_device_coo_matrix(raft::resources const& handle, * @tparam ColType * @tparam NZType * @param[in] ptr a pointer to array of nonzero matrix elements on device (size nnz) - * @param[in] structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return a sparsity-preserving sparse matrix in coordinate (coo) format */ template auto make_device_coo_matrix_view( - ElementType* ptr, device_coordinate_structure_view structure_) + ElementType* ptr, device_coordinate_structure_view structure) { return device_coo_matrix_view( - raft::device_span(ptr, structure_.get_nnz()), - std::make_shared>(structure_)); + raft::device_span(ptr, structure.get_nnz()), structure); } /** @@ -251,19 +247,17 @@ auto make_device_coo_matrix_view( * @tparam ColType * @tparam NZType * @param[in] elements a device span containing nonzero matrix elements (size nnz) - * @param[in] structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return */ template auto make_device_coo_matrix_view( raft::device_span elements, - device_coordinate_structure_view structure_) + device_coordinate_structure_view structure) { - RAFT_EXPECTS(elements.size() == structure_.get_nnz(), + RAFT_EXPECTS(elements.size() == structure.get_nnz(), "Size of elements must be equal to the nnz 
from the structure"); - return device_coo_matrix_view( - elements, - std::make_shared>(structure_)); + return device_coo_matrix_view(elements, structure); } /** @@ -338,7 +332,7 @@ auto make_device_coordinate_structure(raft::resources const& handle, * @return a sparsity-preserving coordinate structural view */ template -auto make_device_coo_structure_view( +auto make_device_coordinate_structure_view( RowType* rows, ColType* cols, RowType n_rows, ColType n_cols, NZType nnz) { return device_coordinate_structure_view( @@ -376,10 +370,10 @@ auto make_device_coo_structure_view( * @return a sparsity-preserving coordinate structural view */ template -auto make_device_coo_structure_view(raft::device_span rows, - raft::device_span cols, - RowType n_rows, - ColType n_cols) +auto make_device_coordinate_structure_view(raft::device_span rows, + raft::device_span cols, + RowType n_rows, + ColType n_cols) { return device_coordinate_structure_view(rows, cols, n_rows, n_cols); } diff --git a/cpp/include/raft/core/device_csr_matrix.hpp b/cpp/include/raft/core/device_csr_matrix.hpp index 59cabacf6d..869034e925 100644 --- a/cpp/include/raft/core/device_csr_matrix.hpp +++ b/cpp/include/raft/core/device_csr_matrix.hpp @@ -46,8 +46,7 @@ using device_sparsity_owning_csr_matrix = csr_matrix; template -struct is_device_csr_matrix : std::false_type { -}; +struct is_device_csr_matrix : std::false_type {}; template struct is_device_csr_matrix< device_csr_matrix> - : std::true_type { -}; + : std::true_type {}; template constexpr bool is_device_csr_matrix_v = is_device_csr_matrix::value; @@ -189,7 +187,7 @@ auto make_device_csr_matrix(raft::device_resources const& handle, * @tparam IndicesType * @tparam NZType * @param[in] handle raft handle for managing expensive device resources - * @param[in] structure_ a sparsity-preserving compressed structural view + * @param[in] structure a sparsity-preserving compressed structural view * @return a sparsity-preserving sparse matrix in compressed (csr) format */ template auto make_device_csr_matrix( raft::device_resources const& handle, - device_compressed_structure_view structure_) + device_compressed_structure_view structure) { return device_sparsity_preserving_csr_matrix( - handle, - std::make_shared>( - structure_)); + handle, structure); } /** @@ -232,7 +228,7 @@ auto make_device_csr_matrix( * @tparam IndicesType * @tparam NZType * @param[in] ptr a pointer to array of nonzero matrix elements on device (size nnz) - * @param[in] structure_ a sparsity-preserving compressed sparse structural view + * @param[in] structure a sparsity-preserving compressed sparse structural view * @return a sparsity-preserving csr matrix view */ template auto make_device_csr_matrix_view( - ElementType* ptr, device_compressed_structure_view structure_) + ElementType* ptr, device_compressed_structure_view structure) { return device_csr_matrix_view( - raft::device_span(ptr, structure_.get_nnz()), std::make_shared(structure_)); + raft::device_span(ptr, structure.get_nnz()), structure); } /** @@ -273,7 +269,7 @@ auto make_device_csr_matrix_view( * @tparam IndicesType * @tparam NZType * @param[in] elements device span containing array of matrix elements (size nnz) - * @param[in] structure_ a sparsity-preserving structural view + * @param[in] structure a sparsity-preserving structural view * @return a sparsity-preserving csr matrix view */ template auto make_device_csr_matrix_view( raft::device_span elements, - device_compressed_structure_view structure_) + device_compressed_structure_view structure) { - 
RAFT_EXPECTS(elements.size() == structure_.get_nnz(), + RAFT_EXPECTS(elements.size() == structure.get_nnz(), "Size of elements must be equal to the nnz from the structure"); - return device_csr_matrix_view( - elements, std::make_shared(structure_)); + return device_csr_matrix_view(elements, structure); } /** @@ -365,7 +360,7 @@ auto make_device_compressed_structure(raft::device_resources const& handle, * @return a sparsity-preserving compressed structural view */ template -auto make_device_csr_structure_view( +auto make_device_compressed_structure_view( IndptrType* indptr, IndicesType* indices, IndptrType n_rows, IndicesType n_cols, NZType nnz) { return device_compressed_structure_view( @@ -408,9 +403,9 @@ auto make_device_csr_structure_view( * */ template -auto make_device_csr_structure_view(raft::device_span indptr, - raft::device_span indices, - IndicesType n_cols) +auto make_device_compressed_structure_view(raft::device_span indptr, + raft::device_span indices, + IndicesType n_cols) { return device_compressed_structure_view(indptr, indices, n_cols); } diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index f72ae36d64..c1898a3f09 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -45,11 +45,9 @@ template >; template -struct is_device_mdspan : std::false_type { -}; +struct is_device_mdspan : std::false_type {}; template -struct is_device_mdspan : std::bool_constant { -}; +struct is_device_mdspan : std::bool_constant {}; /** * @\brief Boolean to determine if template type T is either raft::device_mdspan or a derived type @@ -64,11 +62,9 @@ template using is_output_device_mdspan_t = is_device_mdspan>; template -struct is_managed_mdspan : std::false_type { -}; +struct is_managed_mdspan : std::false_type {}; template -struct is_managed_mdspan : std::bool_constant { -}; +struct is_managed_mdspan : std::bool_constant {}; /** * @\brief Boolean to determine if template type T is either raft::managed_mdspan or a derived type @@ -259,6 +255,36 @@ auto make_device_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_col return device_matrix_view{ptr, extents}; } +/** + * @brief Create a 2-dim mdspan instance for device pointer with a strided layout + * that is restricted to stride 1 in the trailing dimension. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy policy for strides and layout ordering + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + * @param[in] stride leading dimension / stride of data + */ +template +auto make_device_strided_matrix_view(ElementType* ptr, + IndexType n_rows, + IndexType n_cols, + IndexType stride) +{ + constexpr auto is_row_major = std::is_same_v; + IndexType stride0 = is_row_major ? (stride > 0 ? stride : n_cols) : 1; + IndexType stride1 = is_row_major ? 1 : (stride > 0 ? stride : n_rows); + + assert(is_row_major ? stride0 >= n_cols : stride1 >= n_rows); + matrix_extent extents{n_rows, n_cols}; + + auto layout = make_strided_layout(extents, std::array{stride0, stride1}); + return device_matrix_view{ptr, layout}; +} + /** * @brief Create a 1-dim mdspan instance for device pointer. 
* @tparam ElementType the data type of the vector elements diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index df6b39a368..1cab36561a 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -69,7 +69,7 @@ class device_resources : public resources { } device_resources(const device_resources& handle) : resources{handle} {} - device_resources(device_resources&&) = delete; + device_resources(device_resources&&) = delete; device_resources& operator=(device_resources&&) = delete; /** @@ -246,7 +246,7 @@ class stream_syncer { handle_.sync_stream_pool(); } - stream_syncer(const stream_syncer& other) = delete; + stream_syncer(const stream_syncer& other) = delete; stream_syncer& operator=(const stream_syncer& other) = delete; private: diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 02efebec9e..2a6b5657e2 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -39,7 +39,7 @@ class handle_t : public raft::device_resources { handle_t(const handle_t& handle) : device_resources{handle} {} - handle_t(handle_t&&) = delete; + handle_t(handle_t&&) = delete; handle_t& operator=(handle_t&&) = delete; /** diff --git a/cpp/include/raft/core/host_coo_matrix.hpp b/cpp/include/raft/core/host_coo_matrix.hpp index 45ec278a7d..32e7a9e3c4 100644 --- a/cpp/include/raft/core/host_coo_matrix.hpp +++ b/cpp/include/raft/core/host_coo_matrix.hpp @@ -78,8 +78,7 @@ template using host_coordinate_structure_view = coordinate_structure_view; template -struct is_host_coo_matrix : std::false_type { -}; +struct is_host_coo_matrix : std::false_type {}; template struct is_host_coo_matrix< host_coo_matrix> - : std::true_type { -}; + : std::true_type {}; template constexpr bool is_host_coo_matrix_v = is_host_coo_matrix::value; @@ -173,15 +171,15 @@ auto make_host_coo_matrix(raft::resources const& handle, * @tparam ColType * @tparam NZType * @param[in] handle raft handle for managing expensive resources - * @param[in] structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return a sparsity-preserving sparse matrix in coordinate (coo) format */ template auto make_host_coo_matrix(raft::resources const& handle, - host_coordinate_structure_view structure_) + host_coordinate_structure_view structure) { - return host_sparsity_preserving_coo_matrix( - handle, std::make_shared>(structure_)); + return host_sparsity_preserving_coo_matrix(handle, + structure); } /** @@ -210,15 +208,15 @@ auto make_host_coo_matrix(raft::resources const& handle, * @tparam ColType * @tparam NZType * @param[in] ptr a pointer to array of nonzero matrix elements on host (size nnz) - * @param[in] structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return a sparsity-preserving sparse matrix in coordinate (coo) format */ template auto make_host_coo_matrix_view(ElementType* ptr, - host_coordinate_structure_view structure_) + host_coordinate_structure_view structure) { return host_coo_matrix_view( - raft::host_span(ptr, structure_.get_nnz()), std::make_shared(structure_)); + raft::host_span(ptr, structure.get_nnz()), structure); } /** @@ -248,17 +246,16 @@ auto make_host_coo_matrix_view(ElementType* ptr, * @tparam ColType * @tparam NZType * @param[in] elements a host span containing nonzero matrix elements (size nnz) - * @param[in] 
structure_ a sparsity-preserving coordinate structural view + * @param[in] structure a sparsity-preserving coordinate structural view * @return */ template auto make_host_coo_matrix_view(raft::host_span elements, - host_coordinate_structure_view structure_) + host_coordinate_structure_view structure) { - RAFT_EXPECTS(elements.size() == structure_.get_nnz(), + RAFT_EXPECTS(elements.size() == structure.get_nnz(), "Size of elements must be equal to the nnz from the structure"); - return host_coo_matrix_view(elements, - std::make_shared(structure_)); + return host_coo_matrix_view(elements, structure); } /** @@ -333,7 +330,7 @@ auto make_host_coordinate_structure(raft::resources const& handle, * @return a sparsity-preserving coordinate structural view */ template -auto make_host_coo_structure_view( +auto make_host_coordinate_structure_view( RowType* rows, ColType* cols, RowType n_rows, ColType n_cols, NZType nnz) { return host_coordinate_structure_view( @@ -371,10 +368,10 @@ auto make_host_coo_structure_view( * @return a sparsity-preserving coordinate structural view */ template -auto make_host_coo_structure_view(raft::host_span rows, - raft::host_span cols, - RowType n_rows, - ColType n_cols) +auto make_host_coordinate_structure_view(raft::host_span rows, + raft::host_span cols, + RowType n_rows, + ColType n_cols) { return host_coordinate_structure_view(rows, cols, n_rows, n_cols); } diff --git a/cpp/include/raft/core/host_csr_matrix.hpp b/cpp/include/raft/core/host_csr_matrix.hpp index 437f60814e..86199335f2 100644 --- a/cpp/include/raft/core/host_csr_matrix.hpp +++ b/cpp/include/raft/core/host_csr_matrix.hpp @@ -45,8 +45,7 @@ using host_sparsity_owning_csr_matrix = csr_matrix; template -struct is_host_csr_matrix : std::false_type { -}; +struct is_host_csr_matrix : std::false_type {}; template struct is_host_csr_matrix< host_csr_matrix> - : std::true_type { -}; + : std::true_type {}; template constexpr bool is_host_csr_matrix_v = is_host_csr_matrix::value; @@ -189,20 +187,18 @@ auto make_host_csr_matrix(raft::resources const& handle, * @tparam IndicesType * @tparam NZType * @param[in] handle raft handle for managing expensive resources - * @param[in] structure_ a sparsity-preserving compressed structural view + * @param[in] structure a sparsity-preserving compressed structural view * @return a sparsity-preserving sparse matrix in compressed (csr) format */ template -auto make_host_csr_matrix( - raft::resources const& handle, - host_compressed_structure_view structure_) +auto make_host_csr_matrix(raft::resources const& handle, + host_compressed_structure_view structure) { return host_sparsity_preserving_csr_matrix( - handle, - std::make_shared>(structure_)); + handle, structure); } /** @@ -231,7 +227,7 @@ auto make_host_csr_matrix( * @tparam IndicesType * @tparam NZType * @param[in] ptr a pointer to array of nonzero matrix elements on host (size nnz) - * @param[in] structure_ a sparsity-preserving compressed sparse structural view + * @param[in] structure a sparsity-preserving compressed sparse structural view * @return a sparsity-preserving csr matrix view */ template auto make_host_csr_matrix_view( - ElementType* ptr, host_compressed_structure_view structure_) + ElementType* ptr, host_compressed_structure_view structure) { return host_csr_matrix_view( - raft::host_span(ptr, structure_.get_nnz()), std::make_shared(structure_)); + raft::host_span(ptr, structure.get_nnz()), structure); } /** @@ -272,7 +268,7 @@ auto make_host_csr_matrix_view( * @tparam IndicesType * @tparam NZType * @param[in] 
elements host span containing array of matrix elements (size nnz) - * @param[in] structure_ a sparsity-preserving structural view + * @param[in] structure a sparsity-preserving structural view * @return a sparsity-preserving csr matrix view */ template auto make_host_csr_matrix_view( raft::host_span elements, - host_compressed_structure_view structure_) + host_compressed_structure_view structure) { - RAFT_EXPECTS(elements.size() == structure_.get_nnz(), + RAFT_EXPECTS(elements.size() == structure.get_nnz(), "Size of elements must be equal to the nnz from the structure"); - return host_csr_matrix_view( - elements, std::make_shared(structure_)); + return host_csr_matrix_view(elements, structure); } /** @@ -365,7 +360,7 @@ auto make_host_compressed_structure(raft::resources const& handle, * @return a sparsity-preserving compressed structural view */ template -auto make_host_csr_structure_view( +auto make_host_compressed_structure_view( IndptrType* indptr, IndicesType* indices, IndptrType n_rows, IndicesType n_cols, NZType nnz) { return host_compressed_structure_view( @@ -408,9 +403,9 @@ auto make_host_csr_structure_view( * */ template -auto make_host_csr_structure_view(raft::host_span indptr, - raft::host_span indices, - IndicesType n_cols) +auto make_host_compressed_structure_view(raft::host_span indptr, + raft::host_span indices, + IndicesType n_cols) { return host_compressed_structure_view(indptr, indices, n_cols); } diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index a6cdec7a84..9a675680ac 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -37,11 +37,9 @@ template >; template -struct is_host_mdspan : std::false_type { -}; +struct is_host_mdspan : std::false_type {}; template -struct is_host_mdspan : std::bool_constant { -}; +struct is_host_mdspan : std::bool_constant {}; /** * @\brief Boolean to determine if template type T is either raft::host_mdspan or a derived type diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp index 0cc4af2bbf..62e481a801 100644 --- a/cpp/include/raft/core/interruptible.hpp +++ b/cpp/include/raft/core/interruptible.hpp @@ -172,10 +172,10 @@ class interruptible { inline void cancel() noexcept { continue_.clear(std::memory_order_relaxed); } // don't allow the token to leave the shared_ptr - interruptible(interruptible const&) = delete; - interruptible(interruptible&&) = delete; + interruptible(interruptible const&) = delete; + interruptible(interruptible&&) = delete; auto operator=(interruptible const&) -> interruptible& = delete; - auto operator=(interruptible&&) -> interruptible& = delete; + auto operator=(interruptible&&) -> interruptible& = delete; private: /** Global registry of thread-local cancellation stores. 
*/ diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp index 192d160d45..2e0d1117a1 100644 --- a/cpp/include/raft/core/kvp.hpp +++ b/cpp/include/raft/core/kvp.hpp @@ -32,8 +32,8 @@ struct KeyValuePair { typedef _Key Key; ///< Key data type typedef _Value Value; ///< Value data type - Key key; ///< Item key - Value value; ///< Item value + Key key; ///< Item key + Value value; ///< Item value /// Constructor RAFT_INLINE_FUNCTION KeyValuePair() {} diff --git a/cpp/include/raft/core/logger-ext.hpp b/cpp/include/raft/core/logger-ext.hpp new file mode 100644 index 0000000000..8fd29cf1d6 --- /dev/null +++ b/cpp/include/raft/core/logger-ext.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include // std::unique_ptr +#include // RAFT_INLINE_CONDITIONAL +#include // std::string +#include // std::unordered_map + +namespace raft { + +static const std::string RAFT_NAME = "raft"; +static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); + +namespace detail { +RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...); +} +/** + * @brief The main Logging class for raft library. + * + * This class acts as a thin wrapper over the underlying `spdlog` interface. The + * design is done in this way in order to avoid us having to also ship `spdlog` + * header files in our installation. + * + * @todo This currently only supports logging to stdout. Need to add support in + * future to add custom loggers as well [Issue #2046] + */ +class logger { + public: + // @todo setting the logger once per process with + logger(std::string const& name_ = ""); + /** + * @brief Singleton method to get the underlying logger object + * + * @return the singleton logger object + */ + static logger& get(std::string const& name = ""); + + /** + * @brief Set the logging level. + * + * Only messages with level equal or above this will be printed + * + * @param[in] level logging level + * + * @note The log level will actually be set only if the input is within the + * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll + * be ignored. See documentation of decisiontree for how this gets used + */ + void set_level(int level); + + /** + * @brief Set the logging pattern + * + * @param[in] pattern the pattern to be set. 
Refer this link + * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting + * to know the right syntax of this pattern + */ + void set_pattern(const std::string& pattern); + + /** + * @brief Register a callback function to be run in place of usual log call + * + * @param[in] callback the function to be run on all logged messages + */ + void set_callback(void (*callback)(int lvl, const char* msg)); + + /** + * @brief Register a flush function compatible with the registered callback + * + * @param[in] flush the function to use when flushing logs + */ + void set_flush(void (*flush)()); + + /** + * @brief Tells whether messages will be logged for the given log level + * + * @param[in] level log level to be checked for + * @return true if messages will be logged for this level, else false + */ + bool should_log_for(int level) const; + /** + * @brief Query for the current log level + * + * @return the current log level + */ + int get_level() const; + + /** + * @brief Get the current logging pattern + * @return the pattern + */ + std::string get_pattern() const; + + /** + * @brief Main logging method + * + * @param[in] level logging level of this message + * @param[in] fmt C-like format string, followed by respective params + */ + void log(int level, const char* fmt, ...); + + /** + * @brief Flush logs by calling flush on underlying logger + */ + void flush(); + + ~logger(); + + private: + logger(); + // pimpl pattern: + // https://learn.microsoft.com/en-us/cpp/cpp/pimpl-for-compile-time-encapsulation-modern-cpp?view=msvc-170 + class impl; + std::unique_ptr pimpl; + static inline std::unordered_map> log_map; +}; // class logger + +}; // namespace raft diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp new file mode 100644 index 0000000000..fcfa1f1333 --- /dev/null +++ b/cpp/include/raft/core/logger-inl.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include "logger-macros.hpp" +// The logger-ext.hpp file contains the class declaration of the logger class. +// In this case, it is okay to include the logger-ext.hpp file because it +// contains no RAFT_EXPLICIT template instantiations. +#include "logger-ext.hpp" + +#define SPDLOG_HEADER_ONLY +#include +#include // RAFT_INLINE_CONDITIONAL +#include // NOLINT +#include // NOLINT + +namespace raft { + +namespace detail { + +inline std::string format(const char* fmt, va_list& vl) +{ + va_list vl_copy; + va_copy(vl_copy, vl); + int length = std::vsnprintf(nullptr, 0, fmt, vl_copy); + assert(length >= 0); + std::vector buf(length + 1); + std::vsnprintf(buf.data(), length + 1, fmt, vl); + return std::string(buf.data()); +} + +RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...) 
+{ + va_list vl; + va_start(vl, fmt); + std::string str = format(fmt, vl); + va_end(vl); + return str; +} + +inline int convert_level_to_spdlog(int level) +{ + level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); + return RAFT_LEVEL_TRACE - level; +} + +} // namespace detail + +class logger::impl { // defined privately here + // ... all private data and functions: all of these + // can now change without recompiling callers ... + public: + std::shared_ptr sink; + std::shared_ptr spdlogger; + std::string cur_pattern; + int cur_level; + + impl(std::string const& name_ = "") + : sink{std::make_shared()}, + spdlogger{std::make_shared(name_, sink)}, + cur_pattern() + { + } +}; // class logger::impl + +RAFT_INLINE_CONDITIONAL logger::logger(std::string const& name_) : pimpl(new impl(name_)) +{ + set_pattern(default_log_pattern); + set_level(RAFT_ACTIVE_LEVEL); +} + +RAFT_INLINE_CONDITIONAL logger& logger::get(std::string const& name) +{ + if (log_map.find(name) == log_map.end()) { log_map[name] = std::make_shared(name); } + return *log_map[name]; +} + +RAFT_INLINE_CONDITIONAL void logger::set_level(int level) +{ + level = raft::detail::convert_level_to_spdlog(level); + pimpl->spdlogger->set_level(static_cast(level)); +} + +RAFT_INLINE_CONDITIONAL void logger::set_pattern(const std::string& pattern) +{ + pimpl->cur_pattern = pattern; + pimpl->spdlogger->set_pattern(pattern); +} + +RAFT_INLINE_CONDITIONAL void logger::set_callback(void (*callback)(int lvl, const char* msg)) +{ + pimpl->sink->set_callback(callback); +} + +RAFT_INLINE_CONDITIONAL void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); } + +RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const +{ + level = raft::detail::convert_level_to_spdlog(level); + auto level_e = static_cast(level); + return pimpl->spdlogger->should_log(level_e); +} + +RAFT_INLINE_CONDITIONAL int logger::get_level() const +{ + auto level_e = pimpl->spdlogger->level(); + return RAFT_LEVEL_TRACE - static_cast(level_e); +} + +RAFT_INLINE_CONDITIONAL std::string logger::get_pattern() const { return pimpl->cur_pattern; } + +RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...) +{ + level = raft::detail::convert_level_to_spdlog(level); + auto level_e = static_cast(level); + // explicit check to make sure that we only expand messages when required + if (pimpl->spdlogger->should_log(level_e)) { + va_list vl; + va_start(vl, fmt); + auto msg = raft::detail::format(fmt, vl); + va_end(vl); + pimpl->spdlogger->log(level_e, msg); + } +} + +RAFT_INLINE_CONDITIONAL void logger::flush() { pimpl->spdlogger->flush(); } + +RAFT_INLINE_CONDITIONAL logger::~logger() {} + +}; // namespace raft diff --git a/cpp/include/raft/core/logger-macros.hpp b/cpp/include/raft/core/logger-macros.hpp new file mode 100644 index 0000000000..5ddb072067 --- /dev/null +++ b/cpp/include/raft/core/logger-macros.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +/** + * @defgroup logging levels used in raft + * + * @note exactly match the corresponding ones (but reverse in terms of value) + * in spdlog for wrapping purposes + * + * @{ + */ +#define RAFT_LEVEL_TRACE 6 +#define RAFT_LEVEL_DEBUG 5 +#define RAFT_LEVEL_INFO 4 +#define RAFT_LEVEL_WARN 3 +#define RAFT_LEVEL_ERROR 2 +#define RAFT_LEVEL_CRITICAL 1 +#define RAFT_LEVEL_OFF 0 +/** @} */ + +#if !defined(RAFT_ACTIVE_LEVEL) +#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO +#endif + +/** + * @defgroup loggerMacros Helper macros for dealing with logging + * @{ + */ +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) +#define RAFT_LOG_TRACE(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_TRACE(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) +#define RAFT_LOG_TRACE_VEC(ptr, len) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + print_vector(#ptr, ptr, len, ss); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_TRACE_VEC(ptr, len) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) +#define RAFT_LOG_DEBUG(fmt, ...) \ + do { \ + std::stringstream ss; \ + ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ + ss << raft::detail::format(fmt, ##__VA_ARGS__); \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ + } while (0) +#else +#define RAFT_LOG_DEBUG(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) +#define RAFT_LOG_INFO(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_INFO(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) +#define RAFT_LOG_WARN(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_WARN(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) +#define RAFT_LOG_ERROR(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_ERROR(fmt, ...) void(0) +#endif + +#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) +#define RAFT_LOG_CRITICAL(fmt, ...) \ + raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) +#else +#define RAFT_LOG_CRITICAL(fmt, ...) void(0) +#endif +/** @} */ diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp index 3984ec042a..59968ff5e5 100644 --- a/cpp/include/raft/core/logger.hpp +++ b/cpp/include/raft/core/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
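// ---------------------------------------------------------------------------
// Editor's example (not part of the diff): a minimal usage sketch for the
// RAFT_LOG_* macros declared in logger-macros.hpp above. It assumes only what
// the new headers show: the RAFT_LEVEL_* constants, raft::logger::get(), and
// the fact that RAFT_LOG_DEBUG / RAFT_LOG_TRACE compile to `void(0)` unless
// the translation unit defines RAFT_ACTIVE_LEVEL at or above that level.
#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_DEBUG  // must precede the logger include
#include <raft/core/logger.hpp>

using raft::RAFT_NAME;  // the macros reference RAFT_NAME unqualified

void logging_example()
{
  // the runtime level must also allow the message through
  raft::logger::get(RAFT_NAME).set_level(RAFT_LEVEL_DEBUG);
  RAFT_LOG_INFO("processing %d rows", 1024);  // printf-style formatting
  RAFT_LOG_DEBUG("rebuilt in %f ms", 12.5);   // compiled in because of RAFT_ACTIVE_LEVEL
}
// ---------------------------------------------------------------------------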
@@ -15,310 +15,10 @@ */ #pragma once -#ifndef __RAFT_RT_LOGGER -#define __RAFT_RT_LOGGER +#include "logger-macros.hpp" -#include +#include "logger-ext.hpp" -#include - -#include -#include -#include -#include -#include - -#include - -#define SPDLOG_HEADER_ONLY -#include -#include -#include // NOLINT -#include // NOLINT - -/** - * @defgroup logging levels used in raft - * - * @note exactly match the corresponding ones (but reverse in terms of value) - * in spdlog for wrapping purposes - * - * @{ - */ -#define RAFT_LEVEL_TRACE 6 -#define RAFT_LEVEL_DEBUG 5 -#define RAFT_LEVEL_INFO 4 -#define RAFT_LEVEL_WARN 3 -#define RAFT_LEVEL_ERROR 2 -#define RAFT_LEVEL_CRITICAL 1 -#define RAFT_LEVEL_OFF 0 -/** @} */ - -#if !defined(RAFT_ACTIVE_LEVEL) -#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO +#if !defined(RAFT_COMPILED) +#include "logger-inl.hpp" #endif - -namespace raft { - -static const std::string RAFT_NAME = "raft"; -static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v"); - -namespace detail { - -/** - * @defgroup CStringFormat Expand a C-style format string - * - * @brief Expands C-style formatted string into std::string - * - * @param[in] fmt format string - * @param[in] vl respective values for each of format modifiers in the string - * - * @return the expanded `std::string` - * - * @{ - */ -inline std::string format(const char* fmt, va_list& vl) -{ - va_list vl_copy; - va_copy(vl_copy, vl); - int length = std::vsnprintf(nullptr, 0, fmt, vl_copy); - assert(length >= 0); - std::vector buf(length + 1); - std::vsnprintf(buf.data(), length + 1, fmt, vl); - return std::string(buf.data()); -} - -inline std::string format(const char* fmt, ...) -{ - va_list vl; - va_start(vl, fmt); - std::string str = format(fmt, vl); - va_end(vl); - return str; -} -/** @} */ - -inline int convert_level_to_spdlog(int level) -{ - level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level)); - return RAFT_LEVEL_TRACE - level; -} - -} // namespace detail - -/** - * @brief The main Logging class for raft library. - * - * This class acts as a thin wrapper over the underlying `spdlog` interface. The - * design is done in this way in order to avoid us having to also ship `spdlog` - * header files in our installation. - * - * @todo This currently only supports logging to stdout. Need to add support in - * future to add custom loggers as well [Issue #2046] - */ -class logger { - public: - // @todo setting the logger once per process with - logger(std::string const& name_ = "") - : sink{std::make_shared()}, - spdlogger{std::make_shared(name_, sink)}, - cur_pattern() - { - set_pattern(default_log_pattern); - set_level(RAFT_ACTIVE_LEVEL); - } - /** - * @brief Singleton method to get the underlying logger object - * - * @return the singleton logger object - */ - static logger& get(std::string const& name = "") - { - if (log_map.find(name) == log_map.end()) { - log_map[name] = std::make_shared(name); - } - return *log_map[name]; - } - - /** - * @brief Set the logging level. - * - * Only messages with level equal or above this will be printed - * - * @param[in] level logging level - * - * @note The log level will actually be set only if the input is within the - * range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll - * be ignored. 
See documentation of decisiontree for how this gets used - */ - void set_level(int level) - { - level = raft::detail::convert_level_to_spdlog(level); - spdlogger->set_level(static_cast(level)); - } - - /** - * @brief Set the logging pattern - * - * @param[in] pattern the pattern to be set. Refer this link - * https://github.com/gabime/spdlog/wiki/3.-Custom-formatting - * to know the right syntax of this pattern - */ - void set_pattern(const std::string& pattern) - { - cur_pattern = pattern; - spdlogger->set_pattern(pattern); - } - - /** - * @brief Register a callback function to be run in place of usual log call - * - * @param[in] callback the function to be run on all logged messages - */ - void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); } - - /** - * @brief Register a flush function compatible with the registered callback - * - * @param[in] flush the function to use when flushing logs - */ - void set_flush(void (*flush)()) { sink->set_flush(flush); } - - /** - * @brief Tells whether messages will be logged for the given log level - * - * @param[in] level log level to be checked for - * @return true if messages will be logged for this level, else false - */ - bool should_log_for(int level) const - { - level = raft::detail::convert_level_to_spdlog(level); - auto level_e = static_cast(level); - return spdlogger->should_log(level_e); - } - - /** - * @brief Query for the current log level - * - * @return the current log level - */ - int get_level() const - { - auto level_e = spdlogger->level(); - return RAFT_LEVEL_TRACE - static_cast(level_e); - } - - /** - * @brief Get the current logging pattern - * @return the pattern - */ - std::string get_pattern() const { return cur_pattern; } - - /** - * @brief Main logging method - * - * @param[in] level logging level of this message - * @param[in] fmt C-like format string, followed by respective params - */ - void log(int level, const char* fmt, ...) - { - level = raft::detail::convert_level_to_spdlog(level); - auto level_e = static_cast(level); - // explicit check to make sure that we only expand messages when required - if (spdlogger->should_log(level_e)) { - va_list vl; - va_start(vl, fmt); - auto msg = raft::detail::format(fmt, vl); - va_end(vl); - spdlogger->log(level_e, msg); - } - } - - /** - * @brief Flush logs by calling flush on underlying logger - */ - void flush() { spdlogger->flush(); } - - ~logger() {} - - private: - logger(); - - static inline std::unordered_map> log_map; - std::shared_ptr sink; - std::shared_ptr spdlogger; - std::string cur_pattern; - int cur_level; -}; // class logger - -}; // namespace raft - -/** - * @defgroup loggerMacros Helper macros for dealing with logging - * @{ - */ -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) -#define RAFT_LOG_TRACE(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::detail::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_TRACE(fmt, ...) 
void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE) -#define RAFT_LOG_TRACE_VEC(ptr, len) \ - do { \ - std::stringstream ss; \ - ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ - print_vector(#ptr, ptr, len, ss); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_TRACE_VEC(ptr, len) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG) -#define RAFT_LOG_DEBUG(fmt, ...) \ - do { \ - std::stringstream ss; \ - ss << raft::detail::format("%s:%d ", __FILE__, __LINE__); \ - ss << raft::detail::format(fmt, ##__VA_ARGS__); \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \ - } while (0) -#else -#define RAFT_LOG_DEBUG(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO) -#define RAFT_LOG_INFO(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_INFO(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN) -#define RAFT_LOG_WARN(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_WARN(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR) -#define RAFT_LOG_ERROR(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_ERROR(fmt, ...) void(0) -#endif - -#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL) -#define RAFT_LOG_CRITICAL(fmt, ...) \ - raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__) -#else -#define RAFT_LOG_CRITICAL(fmt, ...) void(0) -#endif -/** @} */ - -#endif \ No newline at end of file diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 61c1b500e6..c7350a978c 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -25,11 +25,11 @@ #include #include +#include #include #include #include #include -#include namespace raft { /** @@ -45,23 +45,21 @@ namespace raft { template class array_interface { /** - * @brief Get a mdspan that can be passed down to CUDA kernels. + * @brief Get an mdspan */ auto view() noexcept { return static_cast(this)->view(); } /** - * @brief Get a mdspan that can be passed down to CUDA kernels. + * @brief Get an mdspan */ auto view() const noexcept { return static_cast(this)->view(); } }; namespace detail { template -struct is_array_interface : std::false_type { -}; +struct is_array_interface : std::false_type {}; template struct is_array_interface().view())>> - : std::bool_constant().view())>> { -}; + : std::bool_constant().view())>> {}; template using is_array_interface_t = is_array_interface>; @@ -76,16 +74,13 @@ inline constexpr bool is_array_interface_v = is_array_interface -struct is_array_interface : std::true_type { -}; +struct is_array_interface : std::true_type {}; template -struct is_array_interface : detail::is_array_interface_t { -}; +struct is_array_interface : detail::is_array_interface_t {}; template struct is_array_interface : std::conditional_t, is_array_interface, - std::false_type> { -}; + std::false_type> {}; /** * @\brief Boolean to determine if variadic template types Tn are raft::array_interface * or derived type or any type that has a member function `view()` that returns either @@ -108,7 +103,8 @@ inline constexpr bool is_array_interface_v = is_array_interface::value; * template. * * - Most of the constructors from the reference implementation is removed to make sure - * CUDA stream is honorred. 
+ * CUDA stream is honored. Note that this class is not coupled to CUDA and therefore + * will only be used in the case where the device variant is used. * * - unique_size is not implemented, which is still working in progress in the proposal * @@ -177,9 +173,9 @@ class mdarray constexpr mdarray(mdarray&&) noexcept(std::is_nothrow_move_constructible::value) = default; - constexpr auto operator =(mdarray const&) noexcept( + constexpr auto operator=(mdarray const&) noexcept( std::is_nothrow_copy_assignable::value) -> mdarray& = default; - constexpr auto operator =(mdarray&&) noexcept( + constexpr auto operator=(mdarray&&) noexcept( std::is_nothrow_move_assignable::value) -> mdarray& = default; ~mdarray() noexcept(std::is_nothrow_destructible::value) = default; @@ -201,8 +197,9 @@ class mdarray #endif // RAFT_MDARRAY_CTOR_CONSTEXPR /** - * @brief The only constructor that can create storage, this is to make sure CUDA stream is being - * used. + * @brief The only constructor that can create storage, raft::resources is accepted + * so that the device implementation can make sure the relevant CUDA stream is + * being used for allocation. */ RAFT_MDARRAY_CTOR_CONSTEXPR mdarray(raft::resources const& handle, mapping_type const& m, @@ -220,11 +217,11 @@ class mdarray #undef RAFT_MDARRAY_CTOR_CONSTEXPR /** - * @brief Get a mdspan that can be passed down to CUDA kernels. + * @brief Get an mdspan */ auto view() noexcept { return view_type(c_.data(), map_, cp_.make_accessor_policy()); } /** - * @brief Get a mdspan that can be passed down to CUDA kernels. + * @brief Get an mdspan */ auto view() const noexcept { diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index 1c69cdd973..cd9ca26ed9 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -85,28 +85,22 @@ template *); template -struct is_mdspan : std::false_type { -}; +struct is_mdspan : std::false_type {}; template struct is_mdspan()))>> - : std::true_type { -}; + : std::true_type {}; template -struct is_input_mdspan : std::false_type { -}; +struct is_input_mdspan : std::false_type {}; template struct is_input_mdspan()))>> - : std::bool_constant> { -}; + : std::bool_constant> {}; template -struct is_output_mdspan : std::false_type { -}; +struct is_output_mdspan : std::false_type {}; template struct is_output_mdspan()))>> - : std::bool_constant> { -}; + : std::bool_constant> {}; template using is_mdspan_t = is_mdspan>; diff --git a/cpp/include/raft/core/nvtx.hpp b/cpp/include/raft/core/nvtx.hpp index 09a41f10a6..57338c32c7 100644 --- a/cpp/include/raft/core/nvtx.hpp +++ b/cpp/include/raft/core/nvtx.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,9 +144,9 @@ class range { ~range() { pop_range(); } /* This object is not meant to be touched. 
*/ - range(const range&) = delete; - range(range&&) = delete; - auto operator=(const range&) -> range& = delete; + range(const range&) = delete; + range(range&&) = delete; + auto operator=(const range&) -> range& = delete; auto operator=(range&&) -> range& = delete; static auto operator new(std::size_t) -> void* = delete; static auto operator new[](std::size_t) -> void* = delete; diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 35ae3d715f..ebc41e0f8e 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace raft::resource { class device_memory_resource : public resource { @@ -72,4 +73,4 @@ inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_ { res.add_resource_factory(std::make_shared(mr)); }; -} // namespace raft::resource \ No newline at end of file +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index cf302e25f9..2dc4eb1f9d 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -42,7 +42,7 @@ enum resource_type { THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource - LAST_KEY // reserved for the last key + LAST_KEY // reserved for the last key }; /** @@ -83,6 +83,8 @@ class resource_factory { * @return resource instance */ virtual resource* make_resource() = 0; + + virtual ~resource_factory() {} }; /** diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp index 64e281e934..e0f51b61b4 100644 --- a/cpp/include/raft/core/resources.hpp +++ b/cpp/include/raft/core/resources.hpp @@ -18,6 +18,7 @@ #include "resource/resource_types.hpp" #include #include +#include // RAFT_EXPECTS #include #include #include @@ -67,7 +68,7 @@ class resources { * Note that this does not create any new resources. */ resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {} - resources(resources&&) = delete; + resources(resources&&) = delete; resources& operator=(resources&&) = delete; /** @@ -128,4 +129,4 @@ class resources { mutable std::vector factories_; mutable std::vector resources_; }; -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp index 188d58c896..a896ba1977 100644 --- a/cpp/include/raft/core/span.hpp +++ b/cpp/include/raft/core/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
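// ---------------------------------------------------------------------------
// Editor's note on the `virtual ~resource_factory() {}` added above: factories
// are held and destroyed through base-class pointers, and deleting a derived
// object via a base pointer without a virtual destructor is undefined behavior
// (the derived destructor never runs). A standalone sketch with hypothetical
// names -- `counting_factory` is not a raft type:
#include <memory>

struct factory_base {
  virtual ~factory_base() {}         // without `virtual`, the delete below is UB
  virtual int make_resource() = 0;
};

struct counting_factory : factory_base {
  int* count_ = new int(0);
  ~counting_factory() override { delete count_; }  // reached only via virtual dispatch
  int make_resource() override { return ++(*count_); }
};

int main()
{
  std::unique_ptr<factory_base> f = std::make_unique<counting_factory>();
  return f->make_resource();  // f is destroyed through factory_base*: needs the virtual dtor
}
// ---------------------------------------------------------------------------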
@@ -104,7 +104,7 @@ class span { constexpr span(span&& other) noexcept = default; constexpr auto operator=(span const& other) noexcept -> span& = default; - constexpr auto operator=(span&& other) noexcept -> span& = default; + constexpr auto operator=(span&& other) noexcept -> span& = default; constexpr auto begin() const noexcept -> iterator { return data(); } diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp index 207cc944d2..a14944ed5b 100644 --- a/cpp/include/raft/core/sparse_types.hpp +++ b/cpp/include/raft/core/sparse_types.hpp @@ -109,7 +109,7 @@ class sparse_matrix_view { * Return a view of the structure underlying this matrix * @return */ - structure_view_type get_structure() { return structure_view_; } + structure_view_type structure_view() { return structure_view_; } /** * Return a span of the nonzero elements of the matrix @@ -158,18 +158,19 @@ class sparse_matrix { using container_policy_type = ContainerPolicy; using container_type = typename container_policy_type::container_type; + // constructor that owns the data and the structure sparse_matrix(raft::resources const& handle, row_type n_rows, col_type n_cols, nnz_type nnz = 0) noexcept(std::is_nothrow_default_constructible_v) - : structure_{std::make_shared(handle, n_rows, n_cols, nnz)}, - cp_{}, - c_elements_{cp_.create(handle, 0)} {}; + : structure_{handle, n_rows, n_cols, nnz}, cp_{}, c_elements_{cp_.create(handle, 0)} {}; // Constructor that owns the data but not the structure - sparse_matrix(raft::resources const& handle, std::shared_ptr structure) noexcept( + // This constructor is only callable with a `structure_type == *_structure_view` + // which makes it okay to copy + sparse_matrix(raft::resources const& handle, structure_type structure) noexcept( std::is_nothrow_default_constructible_v) - : structure_{structure}, cp_{}, c_elements_{cp_.create(handle, structure.get()->get_nnz())} {}; + : structure_{structure}, cp_{}, c_elements_{cp_.create(handle, structure_.get_nnz())} {}; constexpr sparse_matrix(sparse_matrix const&) noexcept( std::is_nothrow_copy_constructible_v) = default; @@ -187,7 +188,7 @@ class sparse_matrix { raft::span get_elements() { - return raft::span(c_elements_.data(), structure_view().get_nnz()); + return raft::span(c_elements_.data(), structure_.get_nnz()); } /** @@ -209,7 +210,7 @@ class sparse_matrix { } protected: - std::shared_ptr structure_; + structure_type structure_; container_policy_type cp_; container_type c_elements_; }; diff --git a/cpp/include/raft/core/temporary_device_buffer.hpp b/cpp/include/raft/core/temporary_device_buffer.hpp index 194471c5de..4baa7e9597 100644 --- a/cpp/include/raft/core/temporary_device_buffer.hpp +++ b/cpp/include/raft/core/temporary_device_buffer.hpp @@ -55,10 +55,10 @@ class temporary_device_buffer { static constexpr bool is_const_pointer_ = std::is_const_v; public: - temporary_device_buffer(temporary_device_buffer const&) = delete; + temporary_device_buffer(temporary_device_buffer const&) = delete; temporary_device_buffer& operator=(temporary_device_buffer const&) = delete; - constexpr temporary_device_buffer(temporary_device_buffer&&) = default; + constexpr temporary_device_buffer(temporary_device_buffer&&) = default; constexpr temporary_device_buffer& operator=(temporary_device_buffer&&) = default; /** diff --git a/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh b/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh index 7a4fe0ce83..68e843c6f5 100644 --- 
a/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cutlass.cuh @@ -30,13 +30,11 @@ namespace raft::distance::detail::ops { // This pattern is described in: // https://en.cppreference.com/w/cpp/types/void_t template -struct has_cutlass_op : std::false_type { -}; +struct has_cutlass_op : std::false_type {}; // Specialization recognizes types that do support CUTLASS template struct has_cutlass_op().get_cutlass_op())>> - : std::true_type { -}; + : std::true_type {}; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index aaf3052892..2154aa560c 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -16,13 +16,26 @@ #pragma once +#include +#include #include +#include +// #include +#include +#include #include #include namespace raft::distance::kernels::detail { +template +using dense_input_matrix_view_t = raft::device_matrix_view; +template +using dense_output_matrix_view_t = raft::device_matrix_view; +template +using csr_input_matrix_view_t = raft::device_csr_matrix_view; + /** * Base class for general Gram matrices * A Gram matrix is the Hermitian matrix of inner probucts G_ik = @@ -37,14 +50,135 @@ namespace raft::distance::kernels::detail { */ template class GramMatrixBase { + protected: cublasHandle_t cublas_handle; + bool legacy_interface; public: - GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){}; + GramMatrixBase() : legacy_interface(false){}; + [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) + : cublas_handle(cublas_handle), legacy_interface(true){}; virtual ~GramMatrixBase(){}; /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) + { + evaluate(handle, x1, x2, out, norm_x1, norm_x2); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) + { + evaluate(handle, x1, x2, out, norm_x1, norm_x2); + } + + /** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr) + { + evaluate(handle, x1, x2, out, norm_x1, norm_x2); + } + + // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + linear(handle, x1, x2, out); + } + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + linear(handle, x1, x2, out); + } + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + linear(handle, x1, x2, out); + } + + /** Evaluate the Gram matrix for two vector sets using simple dot product. 
* * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number vectors in x1 @@ -55,29 +189,26 @@ class GramMatrixBase { * @param [in] is_row_major whether the input and output matrices are in row * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) */ - virtual void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) + [[deprecated]] virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } - /** Evaluate the Gram matrix for two vector sets using simple dot product. + /** Convenience function to evaluate the Gram matrix for two vector sets. * * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number vectors in x1 @@ -88,30 +219,30 @@ class GramMatrixBase { * @param [in] is_row_major whether the input and output matrices are in row * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out */ - virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + [[deprecated]] void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0) { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); } - // private: - // The following methods should be private, they are kept public to avoid: - // "error: The enclosing parent function ("distance") for an extended - // __device__ lambda cannot have private or protected access within its class" - + protected: /** Calculates the Gram matrix using simple dot product between vector sets. 
* * out = x1 * x2 @@ -131,17 +262,17 @@ class GramMatrixBase { * @param ld2 leading dimension of x2 * @param ld_out leading dimension of out */ - void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + [[deprecated]] void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { math_t alpha = 1.0; math_t beta = 0.0; @@ -182,37 +313,198 @@ class GramMatrixBase { } } - /** Calculates the Gram matrix using Euclidean distance. + protected: + bool get_is_row_major(dense_output_matrix_view_t matrix) + { + return (matrix.stride(1) == 1); + } + + bool get_is_row_major(dense_input_matrix_view_t matrix) + { + return (matrix.stride(1) == 1); + } + + bool get_is_col_major(dense_output_matrix_view_t matrix) + { + return (matrix.stride(0) == 1); + } + + bool get_is_col_major(dense_input_matrix_view_t matrix) + { + return (matrix.stride(0) == 1); + } + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 * * Can be used as a building block for more complex kernel functions. * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] */ - virtual void distance(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - raft::distance::distance( - raft::device_resources(stream), x1, x2, out, n1, n2, n_cols, is_row_major); + void linear(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) + { + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + handle.get_stream()); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + handle.get_stream()); + } + } + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) + { + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); + } + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) + { + // check is_row_major consistency + bool is_row_major = get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + int minor_out = is_row_major ? 
out.extent(1) : out.extent(0); + ASSERT(ld_out == minor_out, "Sparse linear Kernel distance does not support ld_out parameter"); + + auto x1_structure = x1.structure_view(); + auto x2_structure = x2.structure_view(); + raft::sparse::distance::distances_config_t dist_config(handle); + + // switch a,b based on is_row_major + if (!is_row_major) { + dist_config.a_nrows = x2_structure.get_n_rows(); + dist_config.a_ncols = x2_structure.get_n_cols(); + dist_config.a_nnz = x2_structure.get_nnz(); + dist_config.a_indptr = const_cast(x2_structure.get_indptr().data()); + dist_config.a_indices = const_cast(x2_structure.get_indices().data()); + dist_config.a_data = const_cast(x2.get_elements().data()); + dist_config.b_nrows = x1_structure.get_n_rows(); + dist_config.b_ncols = x1_structure.get_n_cols(); + dist_config.b_nnz = x1_structure.get_nnz(); + dist_config.b_indptr = const_cast(x1_structure.get_indptr().data()); + dist_config.b_indices = const_cast(x1_structure.get_indices().data()); + dist_config.b_data = const_cast(x1.get_elements().data()); + } else { + dist_config.a_nrows = x1_structure.get_n_rows(); + dist_config.a_ncols = x1_structure.get_n_cols(); + dist_config.a_nnz = x1_structure.get_nnz(); + dist_config.a_indptr = const_cast(x1_structure.get_indptr().data()); + dist_config.a_indices = const_cast(x1_structure.get_indices().data()); + dist_config.a_data = const_cast(x1.get_elements().data()); + dist_config.b_nrows = x2_structure.get_n_rows(); + dist_config.b_ncols = x2_structure.get_n_cols(); + dist_config.b_nnz = x2_structure.get_nnz(); + dist_config.b_indptr = const_cast(x2_structure.get_indptr().data()); + dist_config.b_indices = const_cast(x2_structure.get_indices().data()); + dist_config.b_data = const_cast(x2.get_elements().data()); + } + + raft::sparse::distance::pairwiseDistance( + out.data_handle(), dist_config, raft::distance::DistanceType::InnerProduct, 0.0); } }; + }; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh index 1aa6809bcd..bb3ff1c2f5 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
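// ---------------------------------------------------------------------------
// Editor's note on the layout checks above: with the strided mdspan views,
// `get_is_row_major` and `get_is_col_major` reduce to one rule -- a view is
// row-major when its innermost stride is 1 (stride(1) == 1) and column-major
// when stride(0) == 1; a fully contiguous view can satisfy both, which is why
// `linear` only requires x1, x2 and out to agree on one of the two. The same
// rule in a standalone, hypothetical form (`strides2d` is not a raft type):
struct strides2d {
  long s0, s1;  // stride between consecutive rows / consecutive columns
};
constexpr bool is_row_major(strides2d s) { return s.s1 == 1; }
constexpr bool is_col_major(strides2d s) { return s.s0 == 1; }
// e.g. a 4x3 row-major view padded to a leading dimension of 8 has strides {8, 1}:
static_assert(is_row_major(strides2d{8, 1}) && !is_col_major(strides2d{8, 1}),
              "padded row-major layout");
// ---------------------------------------------------------------------------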
@@ -26,19 +26,35 @@ namespace raft::distance::kernels::detail { template class KernelFactory { public: - static GramMatrixBase* create(KernelParams params, cublasHandle_t cublas_handle) + static GramMatrixBase* create(KernelParams params) { GramMatrixBase* res; // KernelParams is not templated, we convert the parameters to math_t here: math_t coef0 = params.coef0; math_t gamma = params.gamma; switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(cublas_handle); break; + case LINEAR: res = new GramMatrixBase(); break; + case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; + case TANH: res = new TanhKernel(gamma, coef0); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; + } + + [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) + { + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(handle); break; case POLYNOMIAL: - res = new PolynomialKernel(params.degree, gamma, coef0, cublas_handle); + res = new PolynomialKernel(params.degree, gamma, coef0, handle); break; - case TANH: res = new TanhKernel(gamma, coef0, cublas_handle); break; - case RBF: res = new RBFKernel(gamma); break; + case TANH: res = new TanhKernel(gamma, coef0, handle); break; + case RBF: res = new RBFKernel(gamma, handle); break; default: throw raft::exception("Kernel not implemented"); } return res; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index d1465efdb0..7ff886c677 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -17,10 +17,12 @@ #pragma once #include "gram_matrix.cuh" -#include +#include #include #include +#include +#include namespace raft::distance::kernels::detail { @@ -100,6 +102,38 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga } } +/** Epiloge function for rbf kernel using expansion. + * + * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); + * + * Intended usage + * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk + * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X + * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y + * + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param norm_x l2-norm of X's rows + * @param norm_y l2-norm of Y's rows + * @param gain + */ +template +__global__ void rbf_kernel_expanded( + math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) { + math_t norm_y_val = norm_y[tidy]; + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = + exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); + } + } +} + /** * Create a kernel matrix using polynomial kernel function. 
*/ @@ -138,11 +172,42 @@ class PolynomialKernel : public GramMatrixBase { * @param exponent * @param gain * @param offset - * @param cublas_handle */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle) - : GramMatrixBase(cublas_handle), exponent(exponent), gain(gain), offset(offset) + PolynomialKernel(exp_t exponent, math_t gain, math_t offset) + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) + { + } + + [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) + { + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); } /** Evaluate kernel matrix using polynomial kernel. @@ -150,32 +215,84 @@ class PolynomialKernel : public GramMatrixBase { * output[i,k] = (gain* + offset)^exponent, * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate the Gram matrix using the legacy interface. * * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of features in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*cols] + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] * @param [in] n2 number vectors in x2 * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] is_row_major whether the input and output matrices are in row * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); GramMatrixBase::linear( x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); @@ -216,10 +333,11 @@ class TanhKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain * @param offset - * @param cublas_handle */ - TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle) - : GramMatrixBase(cublas_handle), gain(gain), offset(offset) + TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} + + [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain), offset(offset) { } @@ -229,12 +347,87 @@ class TanhKernel : public GramMatrixBase { * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector * in the x2 set, and < , > denotes dot product. * - * @param [in] x1 device array of vectors, - * size [n1*n_cols] + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel( + out.data_handle(), ld_out, out.extent(0), out.extent(1), is_row_major, handle.get_stream()); + } + + /** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of features in x1 and x2 - * @param [in] x2 device array of vectors, - * size [n2*n_cols] + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] * @param [in] n2 number vectors in x2 * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] is_row_major whether the input and output matrices are in row @@ -244,18 +437,20 @@ class TanhKernel : public GramMatrixBase { * @param ld2 leading dimension of x2 (usually it is n2) * @param ld_out leading dimension of out (usually it is n1) */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); GramMatrixBase::linear( x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); applyKernel(out, ld_out, n1, n2, is_row_major, stream); @@ -269,21 +464,23 @@ template class RBFKernel : public GramMatrixBase { math_t gain; - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + void applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream) { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - rbf_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - rbf_kernel<<>>(inout, ld, n1, n2, gain); - } + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + rbf_kernel_expanded<<>>(inout, ld, n1, n2, norm_n1, norm_n2, gain); } public: @@ -295,65 +492,234 @@ class RBFKernel : public GramMatrixBase { * @tparam math_t floating point type * @param gain */ - RBFKernel(math_t gain) : GramMatrixBase(NULL), gain(gain) {} + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} + + [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain) + { + } + + void matrixRowNormL2(raft::device_resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target) + { + bool is_row_major = GramMatrixBase::get_is_row_major(matrix); + int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); + int ld = is_row_major ? 
matrix.stride(0) : matrix.stride(1); + ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + matrix.data_handle(), + matrix.extent(1), + matrix.extent(0), + raft::linalg::NormType::L2Norm, + is_row_major, + handle.get_stream()); + } + + void matrixRowNormL2(raft::device_resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target) + { + auto matrix_structure = matrix.structure_view(); + raft::sparse::linalg::rowNormCsr(handle, + matrix_structure.get_indptr().data(), + matrix.get_elements().data(), + matrix_structure.get_nnz(), + matrix_structure.get_n_rows(), + target, + raft::linalg::NormType::L2Norm); + } + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, x2_k is the k-th vector + * in the x2 set, and | | denotes the Euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::device_resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + cudaStream_t stream = handle.get_stream(); + + // lazy compute norms if not given + rmm::device_uvector<math_t> tmp_norm_x1(0, stream); + rmm::device_uvector<math_t> tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.extent(0), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase<math_t>::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); + } /** Evaluate kernel matrix using RBF kernel. * * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), * where x1_i is the i-th vector from the x1 set, x2_k is the k-th vector * in the x2 set, and | | denotes the Euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + cudaStream_t stream = handle.get_stream(); + + // lazy compute norms if not given + rmm::device_uvector<math_t> tmp_norm_x1(0, stream); + rmm::device_uvector<math_t> tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase<math_t>::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); + } + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, x2_k is the k-th vector + * in the x2 set, and | | denotes the Euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::device_resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) + { + cudaStream_t stream = handle.get_stream(); + + // lazy compute norms if not given + rmm::device_uvector<math_t> tmp_norm_x1(0, stream); + rmm::device_uvector<math_t> tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase<math_t>::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + handle.get_stream()); + } + + /** Evaluate the Gram matrix using the legacy interface.
* * @param [in] x1 device array of vectors, size [n1*n_cols] * @param [in] n1 number of vectors in x1 - * @param [in] n_cols number of features in x1 and x2 + * @param [in] n_cols number of columns (features) in x1 and x2 * @param [in] x2 device array of vectors, size [n2*n_cols] * @param [in] n2 number of vectors in x2 * @param [out] out device buffer to store the Gram matrix, size [n1*n2] * @param [in] is_row_major whether the input and output matrices are in row * major format * @param [in] stream cuda stream - * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported - * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported - * @param ld_out leading dimension of out, only ld_out == n1 is supported + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) */ - void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) { + ASSERT(GramMatrixBase<math_t>::legacy_interface, + "Legacy interface can only be used with legacy ctor."); int minor1 = is_row_major ? n_cols : n1; int minor2 = is_row_major ? n_cols : n2; int minor_out = is_row_major ? n2 : n1; ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - /** Customize distance function withe RBF epilogue */ - void distance(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { math_t gain = this->gain; using index_t = int64_t; - auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); }; + rbf_fin_op fin_op{gain}; raft::distance::distance diff --git a/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh new file mode 100644 --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh +#pragma once + +#include // raft::exp +#include // HD + +namespace raft::distance::kernels::detail { + +/** @brief: Final op for Gram matrix with RBF kernel. + * + * Calculates output = e^(-gain * in) + * + */ +template <typename OutT> +struct rbf_fin_op { + OutT gain; + + explicit HD rbf_fin_op(OutT gain_) noexcept : gain(gain_) {} + + template <typename... Args> + HDI OutT operator()(OutT d_val, Args...
unused_args) + { + return raft::exp(-gain * d_val); + } +}; // struct rbf_fin_op + +} // namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/detail/masked_distance_base.cuh b/cpp/include/raft/distance/detail/masked_distance_base.cuh index 55da634145..5a33c9ce4a 100644 --- a/cpp/include/raft/distance/detail/masked_distance_base.cuh +++ b/cpp/include/raft/distance/detail/masked_distance_base.cuh @@ -217,7 +217,7 @@ struct MaskedDistances : public BaseClass { } // tile_idx_n } // idx_g rowEpilog_op(tile_idx_m); - } // tile_idx_m + } // tile_idx_m } private: diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index c6b09be31e..58b5daa8ca 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -18,7 +18,7 @@ #include // ceildiv #include // RAFT_CUDA_TRY -#include // size_t +#include // size_t namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh new file mode 100644 index 0000000000..dd58ab4328 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include // raft::identity_op +#include // ops::* +#include // ops::has_cutlass_op +#include // rbf_fin_op +#include // pairwise_matrix_params +#include // RAFT_EXPLICIT + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +namespace raft::distance::detail { + +template <typename OpT, typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT> +void pairwise_matrix_dispatch(OpT distance_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) RAFT_EXPLICIT; + +}; // namespace raft::distance::detail + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ OpT, DataT, AccT, OutT, FinOpT, IdxT) \ extern template void raft::distance::detail:: \ pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \ OpT<DataT, AccT, IdxT> distance_op, \ IdxT m, \ IdxT n, \ IdxT k, \ const DataT* x, \ const DataT* y, \ const DataT* x_norm, \ const DataT* y_norm, \ OutT* out, \ FinOpT fin_op, \ cudaStream_t stream, \ bool is_row_major) + +/* + * Hierarchy of instantiations: + * + * This file defines extern template instantiations of the distance kernels. The + * instantiation of the public API is handled in raft/distance/distance-ext.cuh. + * + * After adding an instance here, make sure to also add the instance there. + */ + +// The following two instances are used in the RBF kernel object. Note the use of int64_t for the +// index type.
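An aside before those two instances: the Gram matrix path first produces the squared distance d = |x1_i - x2_k|^2 (via the unexpanded L2 distance, or via the expanded form |x1_i|^2 + |x2_k|^2 - 2<x1_i, x2_k> in applyKernel), and rbf_fin_op then maps d to exp(-gain * d). A standalone host-side sketch of that mapping (not part of this diff; the struct name and values are illustrative):

#include <cmath>
#include <cstdio>

// Host-only stand-in for the device-side rbf_fin_op: maps a squared L2
// distance d_val to the RBF Gram value exp(-gain * d_val).
struct rbf_fin_op_host {
  float gain;
  float operator()(float d_val) const { return std::exp(-gain * d_val); }
};

int main()
{
  rbf_fin_op_host op{0.5f};
  // |x1_i - x2_k|^2 == 4.0  ->  exp(-0.5 * 4.0) == exp(-2.0) ~= 0.135335
  std::printf("%f\n", op(4.0f));
  return 0;
}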
+instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_unexp_distance_op, + float, + float, + float, + raft::distance::kernels::detail::rbf_fin_op, + int64_t); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_unexp_distance_op, + double, + double, + double, + raft::distance::kernels::detail::rbf_fin_op, + int64_t); + +// Rest of instances +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::canberra_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::correlation_distance_op, + float, + float, + float, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::correlation_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::hamming_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::hellinger_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::hellinger_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::jensen_shannon_distance_op, + float, + float, + float, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::jensen_shannon_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::kl_divergence_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::kl_divergence_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l1_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l1_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_exp_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_exp_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_unexp_distance_op, float, float, float, raft::identity_op, int); 
+instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l2_unexp_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l_inf_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::l_inf_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::lp_unexp_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::lp_unexp_distance_op, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::russel_rao_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::russel_rao_distance_op, + double, + double, + double, + raft::identity_op, + int); + +#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh new file mode 100644 index 0000000000..bb4422735b --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +/* This file has two responsibilities: + * + * 1. Dispatch to the correct implementation of a kernel based on the + * architecture of the device on which the kernel will be launched. For + * instance, the cosine distance has a CUTLASS-based implementation that can + * be used on SM80+ and the normal implementation that is used on older + * architectures. + * + * 2. Provide concise function templates that can be instantiated in + * src/distance/detail/pairwise_matrix/. Previously, + * raft::distance::detail::distance was instantiated. The function + * necessarily required a large set of include files, which slowed down the + * build. The raft::distance::detail::pairwise_matrix_arch_dispatch functions + * do not require as large an include files set, which speeds up the build. + */ + +#include // ops::has_cutlass_op +#include // dispatch_sm60 +#include // pairwise_matrix_params +#include // raft::util::arch::SM_* + +// NOTE: to minimize compile times, we do not include dispatch_sm80.cuh. +// Including dispatch_sm80.cuh can slow down compile times (due to CUTLASS). +// Therefore, it is the including file's responsibility to include the correct +// dispatch_smXX.cuh headers, as is done in raft/distance/detail/distance.cuh +// and src/distance/detail/pairwise_matrix/dispatch_*.cu. 
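To make the include contract in the NOTE above concrete: a translation unit that wants the CUTLASS path must include the SM80 dispatch header before dispatch-inl.cuh, because dispatch-inl.cuh only forward-declares pairwise_matrix_sm80_dispatch. A minimal sketch of such an instantiating .cu file (the file name, the chosen distance op, and its header are illustrative assumptions, not part of this diff):

// hypothetical file: src/distance/detail/pairwise_matrix/dispatch_l2_expanded.cu
#include <raft/core/operators.hpp>                             // raft::identity_op
#include <raft/distance/detail/distance_ops/l2_exp.cuh>        // ops::l2_exp_distance_op
// Include order matters: the CUTLASS-backed SM80 dispatch comes first, so the
// forward declaration in dispatch-inl.cuh is satisfied at instantiation time.
#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>

// Explicit instantiation matching the extern template declared in dispatch-ext.cuh.
template void raft::distance::detail::pairwise_matrix_dispatch<
  raft::distance::detail::ops::l2_exp_distance_op<float, float, int>,
  float, float, float, raft::identity_op, int>(
  raft::distance::detail::ops::l2_exp_distance_op<float, float, int>, int, int, int,
  const float*, const float*, const float*, const float*, float*, raft::identity_op,
  cudaStream_t, bool);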
+ +namespace raft::distance::detail { + +// This forward-declaration ensures that we do not need to include +// dispatch_sm80.cuh if we are not calling it in practice. This makes compiling +// all the non-CUTLASS based distance instantiations faster. For CUTLASS-based +// distances, dispatch_sm80.cuh has to be included by the file including this +// file. +template <typename OpT, typename IdxT, typename DataT, typename OutT, typename FinOpT, typename SM_compat_t> +void pairwise_matrix_sm80_dispatch(OpT, + pairwise_matrix_params<IdxT, DataT, OutT, FinOpT>, + SM_compat_t, + cudaStream_t); + +template <typename OpT, typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT> +void pairwise_matrix_dispatch(OpT distance_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Create kernel parameter struct. Flip x and y if column major. + IdxT ldx = is_row_major ? k : m; + IdxT ldy = is_row_major ? k : n; + IdxT ld_out = is_row_major ? n : m; + + pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params{ + m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major}; + + if (!params.is_row_major) { params.flip_x_and_y(); } + + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel below SM_80 + namespace arch = raft::util::arch; + + constexpr bool is_ctk_12 = __CUDACC_VER_MAJOR__ == 12; + constexpr bool cutlass_op_unavailable = !ops::has_cutlass_op<OpT>(); + + if constexpr (is_ctk_12 || cutlass_op_unavailable) { + // Always execute legacy kernels on CUDA 12 + auto any_range = arch::SM_range(arch::SM_min(), arch::SM_future()); + pairwise_matrix_sm60_dispatch(distance_op, params, any_range, stream); + } else { + auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future()); + auto legacy_range = arch::SM_range(arch::SM_min(), arch::SM_80()); + + // Get pointer to SM60 kernel to determine the runtime architecture of the + // current system. Other methods to determine the architecture (that do not + // require a pointer) can be error prone. See: + // https://github.com/NVIDIA/cub/issues/545 + auto sm60_wrapper = pairwise_matrix_sm60_get_wrapper(distance_op, params, legacy_range); + void* kernel_ptr = reinterpret_cast<void*>(sm60_wrapper.kernel_ptr); + auto runtime_arch = arch::kernel_runtime_arch(kernel_ptr); + + if (cutlass_range.contains(runtime_arch)) { + // If device is SM_80 or later, use CUTLASS-based kernel. + pairwise_matrix_sm80_dispatch(distance_op, params, cutlass_range, stream); + } else { + // Reuse kernel wrapper that we obtained above. This avoids performing the + // dispatch twice. + sm60_wrapper.launch(distance_op, params, stream); + } + } +} + +}; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index e04b56ee8a..4a52b7ebe7 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -15,123 +15,10 @@ */ #pragma once -/* This file has two responsibilities: - * - * 1. Dispatch to the correct implementation of a kernel based on the - * architecture of the device on which the kernel will be launched. For - * instance, the cosine distance has a CUTLASS-based implementation that can - * be used on SM80+ and the normal implementation that is used on older - * architectures. - * - * 2. Provide concise function templates that can be instantiated in - * src/distance/distance/specializations/detail/.
Previously, - * raft::distance::detail::distance was instantiated. The function - * necessarily required a large set of include files, which slowed down the - * build. The raft::distance::detail::pairwise_matrix_arch_dispatch functions - * do not require as large an include files set, which speeds up the build. - */ - -#include // ops::has_cutlass_op -#include // dispatch_sm60 -#include // pairwise_matrix_params -#include // raft::util::arch::SM_* - -// NOTE: to minimize compile times, we do not include dispatch_sm80.cuh. -// Including dispatch_sm80.cuh can slow down compile times (due to CUTLASS). -// Therefore, it is the including file's responsibility to include the correct -// dispatch_smXX.cuh headers, as is done in raft/distance/detail/distance.cuh -// and the specializations in src/distance/distance/specializations/detail/. - -namespace raft::distance::detail { - -// This forward-declaration ensures that we do not need to include -// dispatch_sm80.cuh if we are not calling it in practice. This makes compiling -// all the non-CUTLASS based distance specializations faster. For CUTLASS-based -// distances, dispatch_sm80.cuh has to be included by the file including this -// file. -template -void pairwise_matrix_sm80_dispatch(OpT, - pairwise_matrix_params, - SM_compat_t, - cudaStream_t); - -template -void pairwise_matrix_instantiation_point(OpT distance_op, - pairwise_matrix_params params, - cudaStream_t stream) -{ - // On CUDA 12: - // - always execute normal kernel - // - // On CUDA 11 and below: - // - execute CUTLASS-based kernel on SM_80 and above - // - execute normal kernel below SM_80 - namespace arch = raft::util::arch; - - constexpr bool is_ctk_12 = __CUDACC_VER_MAJOR__ == 12; - constexpr bool cutlass_op_unavailable = !ops::has_cutlass_op(); - - if constexpr (is_ctk_12 || cutlass_op_unavailable) { - // Always execute legacy kernels on CUDA 12 - auto any_range = arch::SM_range(arch::SM_min(), arch::SM_future()); - pairwise_matrix_sm60_dispatch(distance_op, params, any_range, stream); - } else { - auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future()); - auto legacy_range = arch::SM_range(arch::SM_min(), arch::SM_80()); - - // Get pointer to SM60 kernel to determine the runtime architecture of the - // current system. Other methods to determine the architecture (that do not - // require a pointer) can be error prone. See: - // https://github.com/NVIDIA/cub/issues/545 - auto sm60_wrapper = pairwise_matrix_sm60_get_wrapper(distance_op, params, legacy_range); - void* kernel_ptr = reinterpret_cast(sm60_wrapper.kernel_ptr); - auto runtime_arch = arch::kernel_runtime_arch(kernel_ptr); - - if (cutlass_range.contains(runtime_arch)) { - // If device is SM_80 or later, use CUTLASS-based kernel. - pairwise_matrix_sm80_dispatch(distance_op, params, cutlass_range, stream); - } else { - // Reuse kernel wrapper that we obtained above. This avoids performing the - // dispatch twice. - sm60_wrapper.launch(distance_op, params, stream); - } - } -} - -template -void pairwise_matrix_dispatch(OpT distance_op, - IdxT m, - IdxT n, - IdxT k, - const DataT* x, - const DataT* y, - const DataT* x_norm, - const DataT* y_norm, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // Create kernel parameter struct. Flip x and y if column major. - IdxT ldx = is_row_major ? k : m; - IdxT ldy = is_row_major ? k : n; - IdxT ld_out = is_row_major ? 
n : m; - - pairwise_matrix_params params{ - m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major}; - - if (!params.is_row_major) { params.flip_x_and_y(); } - pairwise_matrix_instantiation_point(distance_op, params, stream); -} +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "dispatch-inl.cuh" +#endif -}; // namespace raft::distance::detail +#ifdef RAFT_COMPILED +#include "dispatch-ext.cuh" +#endif diff --git a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h index 67c01448dc..ebe6d0c80a 100644 --- a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h +++ b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,8 +57,8 @@ namespace threadblock { /// /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator /// -template diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh new file mode 100644 index 0000000000..3f7f2b0a23 --- /dev/null +++ b/cpp/include/raft/distance/distance-ext.cuh @@ -0,0 +1,1065 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include // raft::device_matrix_view +#include // raft::identity_op +#include // raft::resources +#include // rbf_fin_op +#include // raft::distance::DistanceType +#include // RAFT_EXPLICIT +#include // rmm::device_uvector + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +namespace raft { +namespace distance { + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename FinalLambda, typename IdxT = int> +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename IdxT = int> +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename IdxT = int> +size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT; + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename IdxT, typename layout> +size_t getWorkspaceSize(raft::device_matrix_view<DataT, IdxT, layout> const& x, + raft::device_matrix_view<DataT, IdxT, layout> const& y) RAFT_EXPLICIT; + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename IdxT = int> +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + bool isRowMajor = true, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; + +template <typename Type, typename IdxT = int> +void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + rmm::device_uvector<Type>& workspace, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) RAFT_EXPLICIT; + +template <typename Type, typename IdxT = int> +void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) RAFT_EXPLICIT; + +template <raft::distance::DistanceType DistT, typename DataT, typename AccT, typename OutT, typename layout = raft::layout_c_contiguous, typename IdxT = int> +void distance(raft::resources const& handle, + raft::device_matrix_view<DataT, IdxT, layout> const x, + raft::device_matrix_view<DataT, IdxT, layout> const y, + raft::device_matrix_view<OutT, IdxT, layout> dist, + DataT metric_arg = 2.0f) RAFT_EXPLICIT; + +template <typename Type, typename layout = layout_c_contiguous, typename IdxT = int> +void pairwise_distance(raft::resources const& handle, + device_matrix_view<Type, IdxT, layout> const x, + device_matrix_view<Type, IdxT, layout> const y, + device_matrix_view<Type, IdxT, layout> dist, + raft::distance::DistanceType metric, + Type metric_arg = 2.0f) RAFT_EXPLICIT; + +}; // namespace distance +}; // namespace raft + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +/* + * Hierarchy of instantiations: + * + * This file defines the extern template instantiations for the public API of + * raft::distance. To improve compile times, the extern template instantiation + * of the distance kernels is handled in + * distance/detail/pairwise_matrix/dispatch-ext.cuh. + * + * After adding an instance here, make sure to also add the instance to + * dispatch-ext.cuh and the corresponding .cu files. + */ + +#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT) \ + extern template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>( \ + raft::resources const& handle, \ + const DataT* x, \ + const DataT* y, \ + OutT* dist, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + void* workspace, \ + size_t worksize, \ + FinalLambda fin_op, \ + bool isRowMajor, \ + DataT metric_arg) + +// The following two instances are used in test/distance/gram.cu. Note the use +// of int64_t for the index type.
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded, + float, + float, + float, + raft::distance::kernels::detail::rbf_fin_op, + int64_t); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded, + double, + double, + double, + raft::distance::kernels::detail::rbf_fin_op, + int64_t); + +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CorrelationExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded, + double, + double, + double, + raft::identity_op, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + 
raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, double, double, double, raft::identity_op, int); + +#undef instantiate_raft_distance_distance + +// Same, but without raft::identity_op +#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \ + extern template void raft::distance::distance( \ + raft::resources const& handle, \ + const DataT* x, \ + const DataT* y, \ + OutT* dist, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + void* workspace, \ + size_t worksize, \ + bool isRowMajor, \ + DataT metric_arg) + +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CorrelationExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CorrelationExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, double, double, double, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, float, float, float, int); 
+instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, double, double, double, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int); + +#undef instantiate_raft_distance_distance + +// Same, but without workspace +#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \ + extern template void raft::distance::distance( \ + raft::resources const& handle, \ + const DataT* x, \ + const DataT* y, \ + OutT* dist, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + bool isRowMajor, \ + DataT metric_arg) + +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CorrelationExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CorrelationExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HammingUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::HellingerExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, float, float, float, int); 
+instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, double, double, double, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, double, double, double, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int); + +#undef instantiate_raft_distance_distance + +#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT) \ + extern template size_t raft::distance::getWorkspaceSize( \ + const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) + +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::CorrelationExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::CorrelationExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::CosineExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::HammingUnexpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::HellingerExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::HellingerExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::InnerProduct, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::InnerProduct, 
double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::JensenShannon, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::JensenShannon, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::KLDivergence, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::KLDivergence, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Unexpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Unexpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Linf, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Linf, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::LpUnexpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::LpUnexpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int); + +#undef instantiate_raft_distance_getWorkspaceSize + +#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT, layout) \ + extern template size_t raft::distance::getWorkspaceSize( \ + raft::device_matrix_view const& x, \ + raft::device_matrix_view const& y) + +// We could consider not taking template parameters for this function. The +// number of instantiations seems a bit excessive.. 
+instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded, + float, + float, + float, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded, + float, + float, + float, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, + float, + float, + float, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded, + float, + float, + float, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_f_contiguous); 
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, double, double, double, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L1, double, double, double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded, + float, + float, + float, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded, + float, + float, + float, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded, + float, + float, + float, + int, + 
raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2Unexpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_f_contiguous); + +#undef instantiate_raft_distance_getWorkspaceSize + +#define instantiate_raft_distance_pairwise_distance(DataT, IdxT) \ + extern template void raft::distance::pairwise_distance<DataT, IdxT>(raft::resources const& handle, \ + const DataT* x, \ + const DataT* y, \ + DataT* dist, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + rmm::device_uvector<DataT>& workspace, \ + raft::distance::DistanceType metric, \ + bool isRowMajor, \ + DataT metric_arg) + +instantiate_raft_distance_pairwise_distance(float, int); +instantiate_raft_distance_pairwise_distance(double, int); + +#undef instantiate_raft_distance_pairwise_distance + +// Same, but without workspace +#define instantiate_raft_distance_pairwise_distance(DataT, IdxT) \ + extern template void raft::distance::pairwise_distance<DataT, IdxT>(raft::resources const& handle, \ + const DataT* x, \ + const DataT* y, \ + DataT* dist, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + raft::distance::DistanceType metric, \ + bool isRowMajor, \ + DataT metric_arg) + +instantiate_raft_distance_pairwise_distance(float, int); +instantiate_raft_distance_pairwise_distance(double, int); + +#undef instantiate_raft_distance_pairwise_distance + +// Version with mdspan +#define instantiate_raft_distance_distance(DistT, DataT, AccT, OutT, layout, IdxT) \ + extern template void raft::distance::distance<DistT, DataT, AccT, OutT, layout, IdxT>( \ + raft::resources const& handle, \ + raft::device_matrix_view<DataT, IdxT, layout> const x, \ + raft::device_matrix_view<DataT, IdxT, layout> const y, \ + raft::device_matrix_view<OutT, IdxT, layout> dist, \ + DataT metric_arg) + +// Again, we might want to consider reining in the number of instantiations...
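An aside before those instances: the mdspan overload instantiated by this macro is typically called like the following standalone sketch (the metric and the sizes are made up; this is a hedged usage illustration, not part of the diff):

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/distance/distance.cuh>

int main()
{
  raft::device_resources handle;
  int m = 4, n = 3, k = 8;
  auto x    = raft::make_device_matrix<float, int>(handle, m, k);
  auto y    = raft::make_device_matrix<float, int>(handle, n, k);
  auto dist = raft::make_device_matrix<float, int>(handle, m, n);
  // Matches the extern template instance (L2Expanded, float, float, float,
  // layout_c_contiguous, int) declared by the macro above.
  raft::distance::distance<raft::distance::DistanceType::L2Expanded,
                           float, float, float, raft::layout_c_contiguous, int>(
    handle, x.view(), y.view(), dist.view());
  handle.sync_stream();
  return 0;
}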
+instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, double, double, double, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Canberra, double, double, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + 
raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, double, double, double, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L1, double, double, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_c_contiguous, int); 
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, double, double, double, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::Linf, double, double, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded, + float, + float, + float, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded, + float, + float, + float, + raft::layout_f_contiguous, + int); +instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); + +#undef instantiate_raft_distance_distance + +#define instantiate_raft_distance_pairwise_distance(DataT, layout, IdxT) \ + extern template void raft::distance::pairwise_distance( \ + raft::resources const& handle, \ + raft::device_matrix_view const x, \ + raft::device_matrix_view const y, \ + raft::device_matrix_view dist, \ + raft::distance::DistanceType metric, \ + DataT metric_arg) + +instantiate_raft_distance_pairwise_distance(float, raft::layout_c_contiguous, int); +instantiate_raft_distance_pairwise_distance(float, raft::layout_f_contiguous, int); +instantiate_raft_distance_pairwise_distance(double, raft::layout_c_contiguous, int); +instantiate_raft_distance_pairwise_distance(double, raft::layout_f_contiguous, int); + +#undef instantiate_raft_distance_pairwise_distance diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh new file mode 100644 index 0000000000..3399443765 --- /dev/null +++ b/cpp/include/raft/distance/distance-inl.cuh @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft { +namespace distance { + +/** + * @defgroup pairwise_distance pointer-based pairwise distance prims + * @{ + */ + +/** + * @brief Evaluate pairwise distances with the user epilogue lamba allowed + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam FinalLambda user-defined epilogue lamba + * @tparam IdxT Index type + * @param handle raft handle for managing expensive resources + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param fin_op the final gemm epilogue lambda + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note fin_op: This is a device lambda which is supposed to operate upon the + * input which is AccT and returns the output in OutT. It's signature is + * as follows:
<pre>OutT fin_op(AccT in, int g_idx);</pre>
. If one needs + * any other parameters, feel free to pass them via closure. + */ +template +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + bool isRowMajor = true, + DataT metric_arg = 2.0f) +{ + detail::distance( + handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); +} + +/** + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam IdxT Index type + * @param handle raft handle for managing expensive resources + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + size_t worksize, + bool isRowMajor = true, + DataT metric_arg = 2.0f) +{ + detail::distance( + handle, x, y, dist, m, n, k, workspace, worksize, isRowMajor, metric_arg); +} + +/** + * @brief Return the exact workspace size to compute the distance + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam IdxT Index type + * @param x first set of points + * @param y second set of points + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * + * @note If the specified DistT doesn't need the workspace at all, it + * returns 0. + */ +template +size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) +{ + return detail::getWorkspaceSize(x, y, m, n, k); +} + +/** + * @brief Return the exact workspace size to compute the distance + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam IdxT Index type + * @param x first set of points (size m*k) + * @param y second set of points (size n*k) + * @return number of bytes needed in workspace + * + * @note If the specified DistT doesn't need the workspace at all, it + * returns 0. 
+ */ +template +size_t getWorkspaceSize(raft::device_matrix_view const& x, + raft::device_matrix_view const& y) +{ + RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); + + return getWorkspaceSize( + x.data_handle(), y.data_handle(), x.extent(0), y.extent(0), x.extent(1)); +} + +/** + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam IdxT Index type + * @param handle raft handle for managing expensive resources + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void distance(raft::resources const& handle, + const DataT* x, + const DataT* y, + OutT* dist, + IdxT m, + IdxT n, + IdxT k, + bool isRowMajor = true, + DataT metric_arg = 2.0f) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector workspace(0, stream); + auto worksize = getWorkspaceSize(x, y, m, n, k); + workspace.resize(worksize, stream); + detail::distance( + handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); +} + +/** + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam IdxT indexing type + * @param handle raft handle for managing expensive resources + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace buffer which can get resized as per the + * needed workspace size + * @param metric distance metric + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + rmm::device_uvector& workspace, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + auto dispatch = [&](auto distance_type) { + auto worksize = getWorkspaceSize(x, y, m, n, k); + workspace.resize(worksize, stream); + detail::distance( + handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); + }; + + switch (metric) { + case DistanceType::Canberra: + dispatch(std::integral_constant{}); + break; + case DistanceType::CorrelationExpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::CosineExpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::HammingUnexpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::HellingerExpanded: + dispatch(std::integral_constant{}); + break; + case raft::distance::DistanceType::InnerProduct: + dispatch(std::integral_constant{}); + break; + case DistanceType::JensenShannon: + dispatch(std::integral_constant{}); + break; + case DistanceType::KLDivergence: + dispatch(std::integral_constant{}); + break; + case DistanceType::L1: + 
dispatch(std::integral_constant{}); + break; + case DistanceType::L2Expanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::L2SqrtExpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::L2SqrtUnexpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::L2Unexpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::Linf: + dispatch(std::integral_constant{}); + break; + case DistanceType::LpUnexpanded: + dispatch(std::integral_constant{}); + break; + case DistanceType::RusselRaoExpanded: + dispatch(std::integral_constant{}); + break; + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + }; +} + +/** + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam IdxT indexing type + * @param handle raft handle for managing expensive resources + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param metric distance metric + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwise_distance(raft::resources const& handle, + const Type* x, + const Type* y, + Type* dist, + IdxT m, + IdxT n, + IdxT k, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector workspace(0, stream); + pairwise_distance( + handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); +} + +/** @} */ + +/** + * \defgroup distance_mdspan Pairwise distance functions + * @{ + */ + +/** + * @brief Evaluate pairwise distances for the simple use case. + * + * Note: Only contiguous row- or column-major layouts supported currently. 
+ * + * Usage example: + * @code{.cpp} + * #include + * #include + * #include + * #include + * + * raft::raft::device_resources handle; + * int n_samples = 5000; + * int n_features = 50; + * + * auto input = raft::make_device_matrix(handle, n_samples, n_features); + * auto labels = raft::make_device_vector(handle, n_samples); + * auto output = raft::make_device_matrix(handle, n_samples, n_samples); + * + * raft::random::make_blobs(handle, input.view(), labels.view()); + * auto metric = raft::distance::DistanceType::L2SqrtExpanded; + * raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); + * @endcode + * + * @tparam DistanceType which distance to evaluate + * @tparam DataT input argument type + * @tparam AccT accumulation type + * @tparam OutT output type + * @tparam IdxT Index type + * @param handle raft handle for managing expensive resources + * @param x first set of points (size n*k) + * @param y second set of points (size m*k) + * @param dist output distance matrix (size n*m) + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void distance(raft::resources const& handle, + raft::device_matrix_view const x, + raft::device_matrix_view const y, + raft::device_matrix_view dist, + DataT metric_arg = 2.0f) +{ + RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); + RAFT_EXPECTS(dist.extent(0) == x.extent(0), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y.extent(0), + "Number of columns in output must be equal to " + "number of rows in Y"); + + RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); + RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); + + constexpr auto is_rowmajor = std::is_same_v; + + distance(handle, + x.data_handle(), + y.data_handle(), + dist.data_handle(), + x.extent(0), + y.extent(0), + x.extent(1), + is_rowmajor, + metric_arg); +} + +/** + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam IdxT indexing type + * @param handle raft handle for managing expensive resources + * @param x first matrix of points (size mxk) + * @param y second matrix of points (size nxk) + * @param dist output distance matrix (size mxn) + * @param metric distance metric + * @param metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwise_distance(raft::resources const& handle, + device_matrix_view const x, + device_matrix_view const y, + device_matrix_view dist, + raft::distance::DistanceType metric, + Type metric_arg = 2.0f) +{ + RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); + RAFT_EXPECTS(dist.extent(0) == x.extent(0), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y.extent(0), + "Number of columns in output must be equal to " + "number of rows in Y"); + + RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); + RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); + RAFT_EXPECTS(dist.is_exhaustive(), "Output must be contiguous."); + + constexpr auto rowmajor = std::is_same_v; + + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector workspace(0, stream); + + pairwise_distance(handle, + x.data_handle(), + y.data_handle(), + dist.data_handle(), + x.extent(0), + y.extent(0), + x.extent(1), + metric, + rowmajor, + 
metric_arg); +} + +/** @} */ + +}; // namespace distance +}; // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 5216902635..de70cd4691 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -13,470 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef __DISTANCE_H -#define __DISTANCE_H - #pragma once -#include -#include -#include -#include -#include -#include - -#include - -namespace raft { -namespace distance { - -/** - * @defgroup pairwise_distance pointer-based pairwise distance prims - * @{ - */ - -/** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - * - * @note fin_op: This is a device lambda which is supposed to operate upon the - * input which is AccType and returns the output in OutType. It's signature is - * as follows:
<pre>OutType fin_op(AccType in, int g_idx);</pre>
. If one needs - * any other parameters, feel free to pass them via closure. - */ -template -void distance(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - detail::distance( - handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - detail::distance( - handle, x, y, dist, m, n, k, workspace, worksize, isRowMajor, metric_arg); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * - * @note If the specified distanceType doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) -{ - return detail::getWorkspaceSize(x, y, m, n, k); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points (size m*k) - * @param y second set of points (size n*k) - * @return number of bytes needed in workspace - * - * @note If the specified distanceType doesn't need the workspace at all, it - * returns 0. 
- */ -template -size_t getWorkspaceSize(const raft::device_matrix_view x, - const raft::device_matrix_view y) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - - return getWorkspaceSize( - x.data(), y.data(), x.extent(0), y.extent(0), x.extent(1)); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - detail::distance( - handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - rmm::device_uvector& workspace, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - auto dispatch = [&](auto distance_type) { - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - detail::distance( - handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); - }; - - switch (metric) { - case DistanceType::Canberra: - dispatch(std::integral_constant{}); - break; - case DistanceType::CorrelationExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::CosineExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::HammingUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::HellingerExpanded: - dispatch(std::integral_constant{}); - break; - case raft::distance::DistanceType::InnerProduct: - dispatch(std::integral_constant{}); - break; - case DistanceType::JensenShannon: - dispatch(std::integral_constant{}); - break; - case DistanceType::KLDivergence: - dispatch(std::integral_constant{}); - break; - case DistanceType::L1: 
- dispatch(std::integral_constant{}); - break; - case DistanceType::L2Expanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2SqrtExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2SqrtUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2Unexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::Linf: - dispatch(std::integral_constant{}); - break; - case DistanceType::LpUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::RusselRaoExpanded: - dispatch(std::integral_constant{}); - break; - default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); - }; -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - raft::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - pairwise_distance( - handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); -} - -/** @} */ - -/** - * \defgroup distance_mdspan Pairwise distance functions - * @{ - */ - -/** - * @brief Evaluate pairwise distances for the simple use case. - * - * Note: Only contiguous row- or column-major layouts supported currently. 
- * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * raft::raft::device_resources handle; - * int n_samples = 5000; - * int n_features = 50; - * - * auto input = raft::make_device_matrix(handle, n_samples, n_features); - * auto labels = raft::make_device_vector(handle, n_samples); - * auto output = raft::make_device_matrix(handle, n_samples, n_samples); - * - * raft::random::make_blobs(handle, input.view(), labels.view()); - * auto metric = raft::distance::DistanceType::L2SqrtExpanded; - * raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); - * @endcode - * - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points (size n*k) - * @param y second set of points (size m*k) - * @param dist output distance matrix (size n*m) - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - InType metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); - - constexpr auto is_rowmajor = std::is_same_v; - - distance(handle, - x.data_handle(), - y.data_handle(), - dist.data_handle(), - x.extent(0), - y.extent(0), - x.extent(1), - is_rowmajor, - metric_arg); -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param handle raft handle for managing expensive resources - * @param x first matrix of points (size mxk) - * @param y second matrix of points (size nxk) - * @param dist output distance matrix (size mxn) - * @param metric distance metric - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - device_matrix_view const x, - device_matrix_view const y, - device_matrix_view dist, - raft::distance::DistanceType metric, - Type metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); - RAFT_EXPECTS(dist.is_exhaustive(), "Output must be contiguous."); - - constexpr auto rowmajor = std::is_same_v; - - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - - pairwise_distance(handle, - x.data_handle(), - y.data_handle(), - dist.data_handle(), - x.extent(0), - y.extent(0), - x.extent(1), - metric, - 
rowmajor, - metric_arg); -} - -/** @} */ - -}; // namespace distance -}; // namespace raft +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "distance-inl.cuh" +#endif +#ifdef RAFT_COMPILED +#include "distance-ext.cuh" #endif diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp index 4060147f1d..d17ef358ee 100644 --- a/cpp/include/raft/distance/distance_types.hpp +++ b/cpp/include/raft/distance/distance_types.hpp @@ -74,8 +74,6 @@ inline bool is_min_close(DistanceType metric) bool select_min; switch (metric) { case DistanceType::InnerProduct: - case DistanceType::CosineExpanded: - case DistanceType::CorrelationExpanded: // Similarity metrics have the opposite meaning, i.e. nearest neighbors are those with larger // similarity (See the same logic at cpp/include/raft/sparse/spatial/detail/knn.cuh:362 // {perform_k_selection}) diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh new file mode 100644 index 0000000000..05732c1f3f --- /dev/null +++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include // int64_t +#include // raft::device_resources +#include // raft::KeyValuePair +#include // include initialize and reduce operations +#include // RAFT_EXPLICIT + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +namespace raft { +namespace distance { + +template +void fusedL2NNMinReduce(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) RAFT_EXPLICIT; + +} // namespace distance +} // namespace raft + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_raft_distance_fusedL2NNMinReduce(DataT, OutT, IdxT) \ + extern template void raft::distance::fusedL2NNMinReduce(OutT * min, \ + const DataT* x, \ + const DataT* y, \ + const DataT* xn, \ + const DataT* yn, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + void* workspace, \ + bool sqrt, \ + bool initOutBuffer, \ + cudaStream_t stream) + +instantiate_raft_distance_fusedL2NNMinReduce(double, double, int); +instantiate_raft_distance_fusedL2NNMinReduce(double, double, int64_t); +instantiate_raft_distance_fusedL2NNMinReduce(float, float, int); +instantiate_raft_distance_fusedL2NNMinReduce(float, float, int64_t); + +// We can't have comma's in the macro expansion, so we use the COMMA macro: +#define COMMA , + +instantiate_raft_distance_fusedL2NNMinReduce(double, raft::KeyValuePair, int); +instantiate_raft_distance_fusedL2NNMinReduce(double, + raft::KeyValuePair, + int64_t); +instantiate_raft_distance_fusedL2NNMinReduce(float, raft::KeyValuePair, int); +instantiate_raft_distance_fusedL2NNMinReduce(float, + raft::KeyValuePair, + int64_t); + +#undef COMMA + +#undef instantiate_raft_distance_fusedL2NNMinReduce diff --git a/cpp/include/raft/distance/fused_l2_nn-inl.cuh 
b/cpp/include/raft/distance/fused_l2_nn-inl.cuh new file mode 100644 index 0000000000..698d287f87 --- /dev/null +++ b/cpp/include/raft/distance/fused_l2_nn-inl.cuh @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FUSED_L2_NN_H +#define __FUSED_L2_NN_H + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace distance { + +/** + * \ingroup fused_l2_nn + * @{ + */ +/** + * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call. + * + * The benefits of such a call are 2-fold: 1) eliminate the need for an + * intermediate buffer to store the output of gemm 2) reduce the memory read + * traffic on this intermediate buffer, otherwise needed during the reduction + * phase for 1-NN. + * + * @tparam DataT data type + * @tparam OutT output type to either store 1-NN indices and their minimum + * distances or store only the min distances. Accordingly, one + * has to pass an appropriate `ReduceOpT` + * @tparam IdxT indexing arithmetic type + * @tparam ReduceOpT A struct to perform the final needed reduction operation + * and also to initialize the output array elements with the + * appropriate initial value needed for reduction. + * + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) + * @param[in] redOp reduction operator in the epilogue + * @param[in] pairRedOp reduction operation on key value pairs + * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt + * @param[in] initOutBuffer whether to initialize the output buffer before the + * main kernel launch + * @param[in] stream cuda stream + */ +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ + // When k is smaller than 32, the Policy4x4 results in redundant calculations + // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead + // that uses tiles with a smaller value of k. 
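+  //
+  // Beyond the tile policy, the branches below also pick the widest
+  // vectorized load (16-byte, 8-byte, or element-wise) that sizeof(DataT),
+  // the row size in bytes, and the alignment of the x and y pointers all
+  // allow, falling back to the next narrower width when any check fails.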
+ bool is_skinny = k < 32; + + size_t bytes = sizeof(DataT) * k; + auto px = reinterpret_cast(x); + auto py = reinterpret_cast(y); + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0 && px % 16 == 0 && py % 16 == 0) { + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0 && px % 8 == 0 && py % 8 == 0) { + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } + } else { + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } + } +} + +/** + * @brief Wrapper around fusedL2NN with minimum reduction operators. + * + * fusedL2NN cannot be compiled in the distance library due to the lambda + * operators, so this wrapper covers the most common case (minimum). + * This should be preferred to the more generic API when possible, in order to + * reduce compilation times for users of the shared library. + * + * @tparam DataT data type + * @tparam OutT output type to either store 1-NN indices and their minimum + * distances (e.g. raft::KeyValuePair) or store only the min + * distances. + * @tparam IdxT indexing arithmetic type + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) + * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt + * @param[in] initOutBuffer whether to initialize the output buffer before the + * main kernel launch + * @param[in] stream cuda stream + */ +template +void fusedL2NNMinReduce(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ + MinAndDistanceReduceOp redOp; + KVPMinReduce pairRedOp; + + fusedL2NN( + min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); +} + +/** @} */ + +} // namespace distance +} // namespace raft + +#endif diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index e832bcb020..b1a3551323 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -13,218 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#ifndef __FUSED_L2_NN_H -#define __FUSED_L2_NN_H - #pragma once -#include -#include -#include -#include -#include -#include -#include -#include - -namespace raft { -namespace distance { -/** - * \defgroup fused_l2_nn Fused 1-nearest neighbors - * @{ - */ - -template -using KVPMinReduce = detail::KVPMinReduceImpl; - -template -using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; - -template -using MinReduceOp = detail::MinReduceOpImpl; - -/** @} */ - -/** - * Initialize array using init value from reduction op - */ -template -void initialize( - raft::device_resources const& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) -{ - detail::initialize(min, m, maxVal, redOp, handle.get_stream()); -} - -/** - * \ingroup fused_l2_nn - * @{ - */ -/** - * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call. - * - * The benefits of such a call are 2-fold: 1) eliminate the need for an - * intermediate buffer to store the output of gemm 2) reduce the memory read - * traffic on this intermediate buffer, otherwise needed during the reduction - * phase for 1-NN. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances or store only the min distances. Accordingly, one - * has to pass an appropriate `ReduceOpT` - * @tparam IdxT indexing arithmetic type - * @tparam ReduceOpT A struct to perform the final needed reduction operation - * and also to initialize the output array elements with the - * appropriate initial value needed for reduction. - * - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) - * @param[in] redOp reduction operator in the epilogue - * @param[in] pairRedOp reduction operation on key value pairs - * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt - * @param[in] initOutBuffer whether to initialize the output buffer before the - * main kernel launch - * @param[in] stream cuda stream - */ -template -void fusedL2NN(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - // When k is smaller than 32, the Policy4x4 results in redundant calculations - // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead - // that uses tiles with a smaller value of k. 
- bool is_skinny = k < 32; - - size_t bytes = sizeof(DataT) * k; - auto px = reinterpret_cast(x); - auto py = reinterpret_cast(y); - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0 && px % 16 == 0 && py % 16 == 0) { - if (is_skinny) { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0 && px % 8 == 0 && py % 8 == 0) { - if (is_skinny) { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } else { - if (is_skinny) { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } -} - -/** - * @brief Wrapper around fusedL2NN with minimum reduction operators. - * - * fusedL2NN cannot be compiled in the distance library due to the lambda - * operators, so this wrapper covers the most common case (minimum). - * This should be preferred to the more generic API when possible, in order to - * reduce compilation times for users of the shared library. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances (e.g. raft::KeyValuePair) or store only the min - * distances. - * @tparam IdxT indexing arithmetic type - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) - * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt - * @param[in] initOutBuffer whether to initialize the output buffer before the - * main kernel launch - * @param[in] stream cuda stream - */ -template -void fusedL2NNMinReduce(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - MinAndDistanceReduceOp redOp; - KVPMinReduce pairRedOp; - - fusedL2NN( - min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); -} - -/** @} */ - -} // namespace distance -} // namespace raft +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "fused_l2_nn-inl.cuh" +#endif +#ifdef RAFT_COMPILED +#include "fused_l2_nn-ext.cuh" #endif diff --git a/cpp/include/raft/distance/fused_l2_nn_helpers.cuh b/cpp/include/raft/distance/fused_l2_nn_helpers.cuh new file mode 100644 index 0000000000..1bcd7d8dba --- /dev/null +++ b/cpp/include/raft/distance/fused_l2_nn_helpers.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance { + +/** + * \defgroup fused_l2_nn Fused 1-nearest neighbors + * @{ + */ + +template +using KVPMinReduce = detail::KVPMinReduceImpl; + +template +using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; + +template +using MinReduceOp = detail::MinReduceOpImpl; + +/** @} */ + +/** + * Initialize array using init value from reduction op + */ +template +void initialize( + raft::device_resources const& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ + detail::initialize(min, m, maxVal, redOp, handle.get_stream()); +} + +} // namespace raft::distance diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh index 5944534be7..ed0b6848ae 100644 --- a/cpp/include/raft/distance/specializations.cuh +++ b/cpp/include/raft/distance/specializations.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#ifndef __DISTANCE_SPECIALIZATIONS_H -#define __DISTANCE_SPECIALIZATIONS_H - #pragma once -#include - -#endif \ No newline at end of file +#pragma message( \ + __FILE__ \ + " is deprecated and will be removed." \ + " Including specializations is not necessary any more." \ + " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") diff --git a/cpp/include/raft/distance/specializations/detail/00_write_template.py b/cpp/include/raft/distance/specializations/detail/00_write_template.py deleted file mode 100644 index 63ae6580b4..0000000000 --- a/cpp/include/raft/distance/specializations/detail/00_write_template.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python3 - -# This template manages all files in this directory, apart from -# inner_product.cuh and kernels.cuh. - - -# NOTE: this template is not perfectly formatted. Use pre-commit to get -# everything in shape again. -start_template = """/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
diff --git a/cpp/include/raft/distance/specializations/detail/00_write_template.py b/cpp/include/raft/distance/specializations/detail/00_write_template.py
deleted file mode 100644
index 63ae6580b4..0000000000
--- a/cpp/include/raft/distance/specializations/detail/00_write_template.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-
-# This template manages all files in this directory, apart from
-# inner_product.cuh and kernels.cuh.
-
-
-# NOTE: this template is not perfectly formatted. Use pre-commit to get
-# everything in shape again.
-start_template = """/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>
-
-namespace raft::distance::detail {
-
-"""
-
-extern_template = """
-extern template void pairwise_matrix_instantiation_point<OpT, IdxT, DataT, OutT, FinopT>(
-  OpT,
-  pairwise_matrix_params<IdxT, DataT, OutT, FinopT>,
-  cudaStream_t);
-"""
-
-end_template = """}  // namespace raft::distance::detail
-"""
-
-data_type_instances = [
-    dict(
-        DataT="float",
-        AccT="float",
-        OutT="float",
-        IdxT="int",
-    ),
-    dict(
-        DataT="double",
-        AccT="double",
-        OutT="double",
-        IdxT="int",
-    ),
-]
-
-
-
-
-op_instances = [
-    dict(
-        path_prefix="canberra",
-        OpT="ops::canberra_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="correlation",
-        OpT="ops::correlation_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="cosine",
-        OpT="ops::cosine_distance_op<DataT, AccT, IdxT>",
-        # cosine uses CUTLASS for SM80+
-    ),
-    dict(
-        path_prefix="hamming_unexpanded",
-        OpT="ops::hamming_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="hellinger_expanded",
-        OpT="ops::hellinger_distance_op<DataT, AccT, IdxT>",
-    ),
-    # inner product is handled by cublas.
-    dict(
-        path_prefix="jensen_shannon",
-        OpT="ops::jensen_shannon_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="kl_divergence",
-        OpT="ops::kl_divergence_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l1",
-        OpT="ops::l1_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l2_expanded",
-        OpT="ops::l2_exp_distance_op<DataT, AccT, IdxT>",
-        # L2 expanded uses CUTLASS for SM80+
-    ),
-    dict(
-        path_prefix="l2_unexpanded",
-        OpT="ops::l2_unexp_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l_inf",
-        OpT="ops::l_inf_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="lp_unexpanded",
-        OpT="ops::lp_unexp_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="russel_rao",
-        OpT="ops::russel_rao_distance_op<DataT, AccT, IdxT>",
-    ),
-]
-
-def fill_in(s, template):
-    for k, v in template.items():
-        s = s.replace(k, v)
-    return s
-
-for op_instance in op_instances:
-    path = fill_in("path_prefix.cuh", op_instance)
-    with open(path, "w") as f:
-        f.write(start_template)
-
-        for data_type_instance in data_type_instances:
-            op_data_instance = {
-                k: fill_in(v, data_type_instance)
-                for k, v in op_instance.items()
-            }
-            instance = {
-                **op_data_instance,
-                **data_type_instance,
-                "FinopT": "raft::identity_op",
-            }
-
-            text = fill_in(extern_template, instance)
-
-            f.write(text)
-
-        f.write(end_template)
diff --git a/cpp/include/raft/distance/specializations/detail/canberra.cuh b/cpp/include/raft/distance/specializations/detail/canberra.cuh
deleted file mode 100644
index 276c85e5f6..0000000000
--- a/cpp/include/raft/distance/specializations/detail/canberra.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::canberra_distance_op, - int, - float, - float, - raft::identity_op>(ops::canberra_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::canberra_distance_op, - int, - double, - double, - raft::identity_op>(ops::canberra_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/correlation.cuh b/cpp/include/raft/distance/specializations/detail/correlation.cuh deleted file mode 100644 index f019f678df..0000000000 --- a/cpp/include/raft/distance/specializations/detail/correlation.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::correlation_distance_op, - int, - float, - float, - raft::identity_op>(ops::correlation_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::correlation_distance_op, - int, - double, - double, - raft::identity_op>(ops::correlation_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/cosine.cuh b/cpp/include/raft/distance/specializations/detail/cosine.cuh deleted file mode 100644 index dcde4ec286..0000000000 --- a/cpp/include/raft/distance/specializations/detail/cosine.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point, - int, - float, - float, - raft::identity_op>( - ops::cosine_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::cosine_distance_op, - int, - double, - double, - raft::identity_op>(ops::cosine_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh deleted file mode 100644 index 1d6964fbce..0000000000 --- a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::hamming_distance_op, - int, - float, - float, - raft::identity_op>(ops::hamming_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::hamming_distance_op, - int, - double, - double, - raft::identity_op>(ops::hamming_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh deleted file mode 100644 index f96a06f919..0000000000 --- a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::hellinger_distance_op, - int, - float, - float, - raft::identity_op>(ops::hellinger_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::hellinger_distance_op, - int, - double, - double, - raft::identity_op>(ops::hellinger_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/inner_product.cuh b/cpp/include/raft/distance/specializations/detail/inner_product.cuh deleted file mode 100644 index d97d678928..0000000000 --- a/cpp/include/raft/distance/specializations/detail/inner_product.cuh +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft { -namespace distance { -namespace detail { -extern template void distance( - raft::resources const& handle, - const float* x, - const float* y, - float* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - float metric_arg); - -extern template void -distance( - raft::resources const& handle, - const double* x, - const double* y, - double* dist, - int m, - int n, - int k, - void* workspace, - size_t worksize, - bool isRowMajor, - double metric_arg); -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh deleted file mode 100644 index 0b58646582..0000000000 --- a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::jensen_shannon_distance_op, - int, - float, - float, - raft::identity_op>(ops::jensen_shannon_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::jensen_shannon_distance_op, - int, - double, - double, - raft::identity_op>(ops::jensen_shannon_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh deleted file mode 100644 index 75c9c023e8..0000000000 --- a/cpp/include/raft/distance/specializations/detail/kernels.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -extern template class raft::distance::kernels::detail::GramMatrixBase; -extern template class raft::distance::kernels::detail::GramMatrixBase; - -extern template class raft::distance::kernels::detail::PolynomialKernel; -extern template class raft::distance::kernels::detail::PolynomialKernel; - -extern template class raft::distance::kernels::detail::TanhKernel; -extern template class raft::distance::kernels::detail::TanhKernel; - -// These are somehow missing a kernel definition which is causing a compile error -// extern template class raft::distance::kernels::detail::RBFKernel; -// extern template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh deleted file mode 100644 index 5c164e0fd4..0000000000 --- a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point, - int, - float, - float, - raft::identity_op>( - ops::kl_divergence_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point, - int, - double, - double, - raft::identity_op>( - ops::kl_divergence_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l1.cuh b/cpp/include/raft/distance/specializations/detail/l1.cuh deleted file mode 100644 index 870627d909..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l1.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point, - int, - float, - float, - raft::identity_op>( - ops::l1_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point, - int, - double, - double, - raft::identity_op>( - ops::l1_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh deleted file mode 100644 index ee3207bcce..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point, - int, - float, - float, - raft::identity_op>( - ops::l2_exp_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::l2_exp_distance_op, - int, - double, - double, - raft::identity_op>(ops::l2_exp_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh deleted file mode 100644 index 1fbf57632b..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::l2_unexp_distance_op, - int, - float, - float, - raft::identity_op>(ops::l2_unexp_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::l2_unexp_distance_op, - int, - double, - double, - raft::identity_op>(ops::l2_unexp_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/l_inf.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh deleted file mode 100644 index 388d3bf439..0000000000 --- a/cpp/include/raft/distance/specializations/detail/l_inf.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point, - int, - float, - float, - raft::identity_op>( - ops::l_inf_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::l_inf_distance_op, - int, - double, - double, - raft::identity_op>(ops::l_inf_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh deleted file mode 100644 index d8e86ce6f2..0000000000 --- a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::lp_unexp_distance_op, - int, - float, - float, - raft::identity_op>(ops::lp_unexp_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::lp_unexp_distance_op, - int, - double, - double, - raft::identity_op>(ops::lp_unexp_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh deleted file mode 100644 index 4803fb8ab0..0000000000 --- a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -namespace raft::distance::detail { - -extern template void pairwise_matrix_instantiation_point< - ops::russel_rao_distance_op, - int, - float, - float, - raft::identity_op>(ops::russel_rao_distance_op, - pairwise_matrix_params, - cudaStream_t); - -extern template void pairwise_matrix_instantiation_point< - ops::russel_rao_distance_op, - int, - double, - double, - raft::identity_op>(ops::russel_rao_distance_op, - pairwise_matrix_params, - cudaStream_t); -} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index a34f696e9e..ed0b6848ae 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -13,22 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#pragma message( \ + __FILE__ \ + " is deprecated and will be removed." \ + " Including specializations is not necessary any more." \ + " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh index 88e1216635..9588a7f329 100644 --- a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh +++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,115 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
 */
-
 #pragma once
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-
-namespace raft {
-namespace distance {
-
-extern template void fusedL2NNMinReduce<float, raft::KeyValuePair<int, float>, int>(
-  raft::KeyValuePair<int, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, raft::KeyValuePair<int64_t, float>, int64_t>(
-  raft::KeyValuePair<int64_t, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, raft::KeyValuePair<int, double>, int>(
-  raft::KeyValuePair<int, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, raft::KeyValuePair<int64_t, double>, int64_t>(
-  raft::KeyValuePair<int64_t, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, float, int>(float* min,
-                                                           const float* x,
-                                                           const float* y,
-                                                           const float* xn,
-                                                           const float* yn,
-                                                           int m,
-                                                           int n,
-                                                           int k,
-                                                           void* workspace,
-                                                           bool sqrt,
-                                                           bool initOutBuffer,
-                                                           cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, float, int64_t>(float* min,
-                                                               const float* x,
-                                                               const float* y,
-                                                               const float* xn,
-                                                               const float* yn,
-                                                               int64_t m,
-                                                               int64_t n,
-                                                               int64_t k,
-                                                               void* workspace,
-                                                               bool sqrt,
-                                                               bool initOutBuffer,
-                                                               cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, double, int>(double* min,
-                                                             const double* x,
-                                                             const double* y,
-                                                             const double* xn,
-                                                             const double* yn,
-                                                             int m,
-                                                             int n,
-                                                             int k,
-                                                             void* workspace,
-                                                             bool sqrt,
-                                                             bool initOutBuffer,
-                                                             cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, double, int64_t>(double* min,
-                                                                 const double* x,
-                                                                 const double* y,
-                                                                 const double* xn,
-                                                                 const double* yn,
-                                                                 int64_t m,
-                                                                 int64_t n,
-                                                                 int64_t k,
-                                                                 void* workspace,
-                                                                 bool sqrt,
-                                                                 bool initOutBuffer,
-                                                                 cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
\ No newline at end of file
+#pragma message(                                                  \
+  __FILE__                                                        \
+  " is deprecated and will be removed."                           \
+  " Including specializations is not necessary any more."         \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index ca7d5e96a9..f7828294cd 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,9 +24,9 @@
 
 #pragma once
 
-#pragma message(__FILE__ \
-                " is deprecated and will be removed in a future release." \
-                " Please use the raft/solver version instead.")
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the raft/solver version instead.")
 
 #include <raft/solver/linear_assignment.cuh>
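The deprecation shims above pair with the dispatch-header layout introduced at the top of this patch. As a compact restatement (the foo* file names are hypothetical; the two macros are the real ones used throughout this change), a public RAFT header now reads:

// foo-inl.cuh carries the full template definitions; foo-ext.cuh carries only
// extern template declarations whose definitions are compiled into libraft.
// The public foo.cuh selects one or both, so no separate specializations
// header needs to be included any more:

#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
#include "foo-inl.cuh"  // header-only mode: each TU may instantiate templates itself
#endif
#ifdef RAFT_COMPILED
#include "foo-ext.cuh"  // linked against libraft: skip local instantiation of common types
#endif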
diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp
index 30f2b53e52..5472422053 100644
--- a/cpp/include/raft/lap/lap.hpp
+++ b/cpp/include/raft/lap/lap.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,8 +24,8 @@
 
 #pragma once
 
-#pragma message(__FILE__ \
-                " is deprecated and will be removed in a future release." \
-                " Please use the cuh version instead.")
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the cuh version instead.")
 
 #include <raft/lap/lap.cuh>
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 608c63e1a9..c19f491319 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -216,7 +216,7 @@ void add_scalar(raft::device_resources const& handle,
 
 /** @} */  // end of group add
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index ed083a1590..88c49d1f42 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -82,7 +82,7 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out
 
 /** @} */  // end of group binary_op
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index 674be207d8..48c121c359 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -159,7 +159,7 @@ void coalesced_reduction(raft::device_resources const& handle,
 
 /** @} */  // end of group coalesced_reduction
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index 4321e13d95..3b1e8c41c4 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -100,7 +100,7 @@ struct KernelPolicy {
     SmemSize = 2 * SmemPage * sizeof(DataT),
   };  // enum
 
-}; // struct KernelPolicy
+};  // struct KernelPolicy
 
 template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct ColKernelPolicy {
@@ -151,8 +151,7 @@ struct ColKernelPolicy {
  * @{
  */
 template <typename DataT, int _veclen>
-struct Policy4x4 {
-};
+struct Policy4x4 {};
 
 template <int _veclen>
 struct Policy4x4<float, _veclen> {
@@ -174,8 +173,7 @@ struct Policy4x4<double, _veclen> {
  *
 */
 template <typename DataT, int _veclen>
-struct Policy4x4Skinny {
-};
+struct Policy4x4Skinny {};
 
 template <int _veclen>
 struct Policy4x4Skinny<float, _veclen> {
@@ -194,8 +192,7 @@ struct Policy4x4Skinny<double, _veclen> {
  * @{
 */
 template <typename DataT, int _veclen>
-struct Policy2x8 {
-};
+struct Policy2x8 {};
 
 template <int _veclen>
 struct Policy2x8<float, _veclen> {
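For readers unfamiliar with the mechanism the new -ext header below relies on, here is a self-contained sketch (all names hypothetical, not RAFT API) of how an extern template declaration in a header pairs with a single explicit instantiation in a library translation unit:

// mylib.hpp -- template definition visible to every includer,
// just as coalescedReduction's definition is visible via the -inl header.
template <typename T>
T triple(T x)
{
  return x + x + x;
}

// Still in mylib.hpp: promise that the int instantiation already exists in the
// compiled library, so including translation units do not emit their own copy.
extern template int triple<int>(int);

// mylib.cpp -- compiled once into the library; this line emits the actual code.
template int triple<int>(int);

The macro in the header below stamps out exactly such extern declarations for the type and operator combinations that libraft pre-compiles.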
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
new file mode 100644
index 0000000000..4800f2e3cf
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/operators.hpp>
+
+// The explicit instantiation of raft::linalg::detail::coalescedReduction is not
+// forced because there would be too many instances. Instead, we cover the most
+// common instantiations with extern template instantiations below.
+
+#define instantiate_raft_linalg_detail_coalescedReduction(                              \
+  InType, OutType, IdxType, MainLambda, ReduceLambda, FinalLambda)                      \
+  extern template void raft::linalg::detail::coalescedReduction(OutType* dots,          \
+                                                                const InType* data,     \
+                                                                IdxType D,              \
+                                                                IdxType N,              \
+                                                                OutType init,           \
+                                                                cudaStream_t stream,    \
+                                                                bool inplace,           \
+                                                                MainLambda main_op,     \
+                                                                ReduceLambda reduce_op, \
+                                                                FinalLambda final_op)
+
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::max_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, long, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::max_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, unsigned int, raft::sq_op, raft::add_op, raft::identity_op);
+
+#undef instantiate_raft_linalg_detail_coalescedReduction
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
new file mode 100644
index 0000000000..5b01196cf4
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c)
2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +struct ReductionThinPolicy { + static constexpr int LogicalWarpSize = warpSize; + static constexpr int RowsPerBlock = rpb; + static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + coalescedReductionThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) +{ + IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i >= N) return; + + OutType acc = init; + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { + acc = reduce_op(acc, main_op(data[j + (D * i)], j)); + } + acc = raft::logicalWarpReduce(acc, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[i] = final_op(reduce_op(dots[i], acc)); + } else { + dots[i] = final_op(acc); + } + } +} + +template +void coalescedReductionThin(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) +{ + common::nvtx::range fun_scope( + "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); + dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); + coalescedReductionThinKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void coalescedReductionThinDispatcher(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) +{ + if (D <= IdxType(2)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(4)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(8)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(16)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } +} + +template +__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool 
inplace = false) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); + } else { + dots[blockIdx.x] = final_op(acc); + } + } +} + +template +void coalescedReductionMedium(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) +{ + common::nvtx::range fun_scope("coalescedReductionMedium<%d>", TPB); + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void coalescedReductionMediumDispatcher(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) +{ + // Note: for now, this kernel is only used when D > 256. If this changes in the future, use + // smaller block sizes when relevant. + coalescedReductionMedium<256>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +template +struct ReductionThickPolicy { + static constexpr int ThreadsPerBlock = tpb; + static constexpr int BlocksPerRow = bpr; + static constexpr int BlockStride = tpb * bpr; +}; + +template +__global__ void __launch_bounds__(Policy::ThreadsPerBlock) + coalescedReductionThickKernel(OutType* buffer, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + OutType thread_data = init; + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + thread_data = reduce_op(thread_data, main_op(data[idx], i)); + } + OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); + if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } +} + +template +void coalescedReductionThick(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::identity_op(), + ReduceLambda reduce_op = raft::add_op(), + FinalLambda final_op = raft::identity_op()) +{ + common::nvtx::range fun_scope( + "coalescedReductionThick<%d,%d>", ThickPolicy::ThreadsPerBlock, ThickPolicy::BlocksPerRow); + + dim3 threads(ThickPolicy::ThreadsPerBlock, 1, 1); + dim3 blocks(N, ThickPolicy::BlocksPerRow, 1); + + rmm::device_uvector buffer(N * ThickPolicy::BlocksPerRow, stream); + + /* We apply a two-step reduction: + * 1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It + * applies the main_op but not the final op. + * 2. coalescedReductionThinKernel reduces [N x BlocksPerRow] to [N x 1]. It doesn't apply any + * main_op but applies final_op. 
If in-place, the existing and new values are reduced.
+   */
+
+  coalescedReductionThickKernel<ThickPolicy>
+    <<<blocks, threads, 0, stream>>>(buffer.data(), data, D, N, init, main_op, reduce_op);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  coalescedReductionThin<ThinPolicy>(dots,
+                                     buffer.data(),
+                                     static_cast<IdxType>(ThickPolicy::BlocksPerRow),
+                                     N,
+                                     init,
+                                     stream,
+                                     inplace,
+                                     raft::identity_op(),
+                                     reduce_op,
+                                     final_op);
+}
+
+template <typename InType,
+          typename OutType,
+          typename IdxType,
+          typename MainLambda,
+          typename ReduceLambda,
+          typename FinalLambda>
+void coalescedReductionThickDispatcher(OutType* dots,
+                                       const InType* data,
+                                       IdxType D,
+                                       IdxType N,
+                                       OutType init,
+                                       cudaStream_t stream,
+                                       bool inplace = false,
+                                       MainLambda main_op = raft::identity_op(),
+                                       ReduceLambda reduce_op = raft::add_op(),
+                                       FinalLambda final_op = raft::identity_op())
+{
+  // Note: multiple elements per thread to take advantage of the sequential reduction and loop
+  // unrolling
+  if (D < IdxType(32768)) {
+    coalescedReductionThick<ReductionThickPolicy<256, 32>, ReductionThinPolicy<32, 4>>(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  } else {
+    coalescedReductionThick<ReductionThickPolicy<256, 64>, ReductionThinPolicy<32, 4>>(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  }
+}
+
+// Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along
+// rows for row major or reduce along columns for column major layout. Can do an inplace reduction
+// adding to original values of dots if requested.
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::identity_op,
+          typename ReduceLambda = raft::add_op,
+          typename FinalLambda  = raft::identity_op>
+void coalescedReduction(OutType* dots,
+                        const InType* data,
+                        IdxType D,
+                        IdxType N,
+                        OutType init,
+                        cudaStream_t stream,
+                        bool inplace = false,
+                        MainLambda main_op = raft::identity_op(),
+                        ReduceLambda reduce_op = raft::add_op(),
+                        FinalLambda final_op = raft::identity_op())
+{
+  /* The primitive selects one of three implementations based on heuristics:
+   * - Thin: very efficient when D is small and/or N is large
+   * - Thick: used when N is very small and D very large
+   * - Medium: used when N is too small to fill the GPU with the thin kernel
+   */
+  const IdxType numSMs = raft::getMultiProcessorCount();
+  if (D <= IdxType(256) || N >= IdxType(4) * numSMs) {
+    coalescedReductionThinDispatcher(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  } else if (N < numSMs && D >= IdxType(16384)) {
+    coalescedReductionThickDispatcher(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  } else {
+    coalescedReductionMediumDispatcher(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  }
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index 238e17fa56..3e6b17978b 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,353 +16,11 @@
 
 #pragma once
 
-#include <cub/cub.cuh>
-#include <raft/common/nvtx.hpp>
-#include <raft/core/operators.hpp>
-#include <raft/util/cuda_utils.cuh>
-#include <rmm/device_uvector.hpp>
+// Always include inline definitions of coalesced reduction, because we do not
+// force explicit instantiation.
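+// (Here the "-inl" suffix marks the header carrying the full template
+// definitions, while the "-ext" header pulled in further below carries only
+// extern template declarations whose definitions are compiled into libraft.)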
+#include "coalesced_reduction-inl.cuh" -namespace raft { -namespace linalg { -namespace detail { - -template -struct ReductionThinPolicy { - static constexpr int LogicalWarpSize = warpSize; - static constexpr int RowsPerBlock = rpb; - static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; -}; - -template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) - coalescedReductionThinKernel(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) -{ - IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); - if (i >= N) return; - - OutType acc = init; - for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { - acc = reduce_op(acc, main_op(data[j + (D * i)], j)); - } - acc = raft::logicalWarpReduce(acc, reduce_op); - if (threadIdx.x == 0) { - if (inplace) { - dots[i] = final_op(reduce_op(dots[i], acc)); - } else { - dots[i] = final_op(acc); - } - } -} - -template -void coalescedReductionThin(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - common::nvtx::range fun_scope( - "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); - dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); - dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - coalescedReductionThinKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void coalescedReductionThinDispatcher(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - if (D <= IdxType(2)) { - coalescedReductionThin>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (D <= IdxType(4)) { - coalescedReductionThin>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (D <= IdxType(8)) { - coalescedReductionThin>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (D <= IdxType(16)) { - coalescedReductionThin>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThin>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } -} - -template -__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; - for (IdxType i = threadIdx.x; i < D; i += TPB) { - IdxType idx = rowStart + i; - thread_data = reduce_op(thread_data, main_op(data[idx], i)); - } - OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); - if (threadIdx.x == 0) { - if (inplace) { - dots[blockIdx.x] = final_op(reduce_op(dots[blockIdx.x], acc)); - } else { - dots[blockIdx.x] = final_op(acc); - } - } -} - -template -void coalescedReductionMedium(OutType* dots, - 
const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - common::nvtx::range fun_scope("coalescedReductionMedium<%d>", TPB); - coalescedReductionMediumKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void coalescedReductionMediumDispatcher(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - // Note: for now, this kernel is only used when D > 256. If this changes in the future, use - // smaller block sizes when relevant. - coalescedReductionMedium<256>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); -} - -template -struct ReductionThickPolicy { - static constexpr int ThreadsPerBlock = tpb; - static constexpr int BlocksPerRow = bpr; - static constexpr int BlockStride = tpb * bpr; -}; - -template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) - coalescedReductionThickKernel(OutType* buffer, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; - for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; - i += Policy::BlockStride) { - IdxType idx = rowStart + i; - thread_data = reduce_op(thread_data, main_op(data[idx], i)); - } - OutType acc = BlockReduce(temp_storage).Reduce(thread_data, reduce_op); - if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } -} - -template -void coalescedReductionThick(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - common::nvtx::range fun_scope( - "coalescedReductionThick<%d,%d>", ThickPolicy::ThreadsPerBlock, ThickPolicy::BlocksPerRow); - - dim3 threads(ThickPolicy::ThreadsPerBlock, 1, 1); - dim3 blocks(N, ThickPolicy::BlocksPerRow, 1); - - rmm::device_uvector buffer(N * ThickPolicy::BlocksPerRow, stream); - - /* We apply a two-step reduction: - * 1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It - * applies the main_op but not the final op. - * 2. coalescedReductionThinKernel reduces [N x BlocksPerRow] to [N x 1]. It doesn't apply any - * main_op but applies final_op. If in-place, the existing and new values are reduced. 
- */ - - coalescedReductionThickKernel - <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - coalescedReductionThin(dots, - buffer.data(), - static_cast(ThickPolicy::BlocksPerRow), - N, - init, - stream, - inplace, - raft::identity_op(), - reduce_op, - final_op); -} - -template -void coalescedReductionThickDispatcher(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - // Note: multiple elements per thread to take advantage of the sequential reduction and loop - // unrolling - if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } -} - -// Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along -// rows for row major or reduce along columns for column major layout. Can do an inplace reduction -// adding to original values of dots if requested. -template -void coalescedReduction(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::identity_op(), - ReduceLambda reduce_op = raft::add_op(), - FinalLambda final_op = raft::identity_op()) -{ - /* The primitive selects one of three implementations based on heuristics: - * - Thin: very efficient when D is small and/or N is large - * - Thick: used when N is very small and D very large - * - Medium: used when N is too small to fill the GPU with the thin kernel - */ - const IdxType numSMs = raft::getMultiProcessorCount(); - if (D <= IdxType(256) || N >= IdxType(4) * numSMs) { - coalescedReductionThinDispatcher( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (N < numSMs && D >= IdxType(16384)) { - coalescedReductionThickDispatcher( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionMediumDispatcher( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } -} - -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file +// Do include the extern template instantiations when possible. 
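+// (RAFT_COMPILED is defined when building against, or inside, the pre-compiled
+// libraft binary, so the extern template declarations in the -ext header can
+// suppress redundant local instantiation of the common type combinations.)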
+#ifdef RAFT_COMPILED +#include "coalesced_reduction-ext.cuh" +#endif diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp index 87a195757c..5a7356a4c2 100644 --- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp @@ -41,9 +41,9 @@ class cublas_device_pointer_mode { } } auto operator=(const cublas_device_pointer_mode&) -> cublas_device_pointer_mode& = delete; - auto operator=(cublas_device_pointer_mode&&) -> cublas_device_pointer_mode& = delete; - static auto operator new(std::size_t) -> void* = delete; - static auto operator new[](std::size_t) -> void* = delete; + auto operator=(cublas_device_pointer_mode&&) -> cublas_device_pointer_mode& = delete; + static auto operator new(std::size_t) -> void* = delete; + static auto operator new[](std::size_t) -> void* = delete; ~cublas_device_pointer_mode() { @@ -550,7 +550,7 @@ cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, template <> inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int n, - float* const A[], // NOLINT + float* const A[], // NOLINT int lda, int* P, int* info, @@ -564,7 +564,7 @@ inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT template <> inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT int n, - double* const A[], // NOLINT + double* const A[], // NOLINT int lda, int* P, int* info, diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 70bb2df4f5..c22ef09809 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -25,8 +25,7 @@ namespace raft { namespace linalg { namespace detail { -struct sum_tag { -}; +struct sum_tag {}; template __device__ void reduce(OutType* out, const InType acc, sum_tag) diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 4cba028d87..bc7c551d89 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -18,6 +18,8 @@ #include "cublas_wrappers.hpp" #include "cusolver_wrappers.hpp" +#include +#include #include #include #include @@ -42,10 +44,10 @@ namespace detail { */ template void qrGetQ_inplace( - raft::device_resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream) + raft::resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream) { RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols."); - cusolverDnHandle_t cusolver = handle.get_cusolver_dn_handle(); + cusolverDnHandle_t cusolver = resource::get_cusolver_dn_handle(handle); rmm::device_uvector tau(n_cols, stream); RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream)); @@ -83,7 +85,7 @@ void qrGetQ_inplace( } template -void qrGetQ(raft::device_resources const& handle, +void qrGetQ(raft::resources const& handle, const math_t* M, math_t* Q, int n_rows, @@ -95,7 +97,7 @@ void qrGetQ(raft::device_resources const& handle, } template -void qrGetQR(raft::device_resources const& handle, +void qrGetQR(raft::resources const& handle, math_t* M, math_t* Q, math_t* R, @@ -103,7 +105,7 @@ void qrGetQR(raft::device_resources const& handle, int n_cols, cudaStream_t stream) { - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); int m = n_rows, n = n_cols; rmm::device_uvector 
diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh
index 4cba028d87..bc7c551d89 100644
--- a/cpp/include/raft/linalg/detail/qr.cuh
+++ b/cpp/include/raft/linalg/detail/qr.cuh
@@ -18,6 +18,8 @@
 
 #include "cublas_wrappers.hpp"
 #include "cusolver_wrappers.hpp"
+#include <raft/core/resource/cusolver_dn_handle.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
@@ -42,10 +44,10 @@ namespace detail {
  */
 template <typename math_t>
 void qrGetQ_inplace(
-  raft::device_resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream)
+  raft::resources const& handle, math_t* Q, int n_rows, int n_cols, cudaStream_t stream)
 {
   RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols.");
-  cusolverDnHandle_t cusolver = handle.get_cusolver_dn_handle();
+  cusolverDnHandle_t cusolver = resource::get_cusolver_dn_handle(handle);
 
   rmm::device_uvector<math_t> tau(n_cols, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream));
@@ -83,7 +85,7 @@ void qrGetQ_inplace(
 }
 
 template <typename math_t>
-void qrGetQ(raft::device_resources const& handle,
+void qrGetQ(raft::resources const& handle,
             const math_t* M,
             math_t* Q,
             int n_rows,
@@ -95,7 +97,7 @@ void qrGetQ(raft::device_resources const& handle,
 }
 
 template <typename math_t>
-void qrGetQR(raft::device_resources const& handle,
+void qrGetQR(raft::resources const& handle,
              math_t* M,
              math_t* Q,
              math_t* R,
@@ -103,7 +105,7 @@ void qrGetQR(raft::device_resources const& handle,
              int n_cols,
              cudaStream_t stream)
 {
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle);
 
   int m = n_rows, n = n_cols;
   rmm::device_uvector<math_t> R_full(m * n, stream);
diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh
index 05588bda9c..bbd71a4cf1 100644
--- a/cpp/include/raft/linalg/detail/transpose.cuh
+++ b/cpp/include/raft/linalg/detail/transpose.cuh
@@ -19,7 +19,9 @@
 #include "cublas_wrappers.hpp"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cublas_handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/exec_policy.hpp>
@@ -29,14 +31,14 @@ namespace linalg {
 namespace detail {
 
 template <typename math_t>
-void transpose(raft::device_resources const& handle,
+void transpose(raft::resources const& handle,
                math_t* in,
                math_t* out,
                int n_rows,
                int n_cols,
                cudaStream_t stream)
 {
-  cublasHandle_t cublas_h = handle.get_cublas_handle();
+  cublasHandle_t cublas_h = resource::get_cublas_handle(handle);
   RAFT_CUBLAS_TRY(cublasSetStream(cublas_h, stream));
 
   int out_n_rows = n_cols;
@@ -83,7 +85,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream)
 
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
 void transpose_row_major_impl(
-  raft::device_resources const& handle,
+  raft::resources const& handle,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
@@ -92,7 +94,7 @@ void transpose_row_major_impl(
   T constexpr kOne  = 1;
   T constexpr kZero = 0;
 
-  CUBLAS_TRY(cublasgeam(handle.get_cublas_handle(),
+  CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(handle),
                         CUBLAS_OP_T,
                         CUBLAS_OP_N,
                         out_n_cols,
@@ -105,12 +107,12 @@ void transpose_row_major_impl(
                         out.stride(0),
                         out.data_handle(),
                         out.stride(0),
-                        handle.get_stream()));
+                        resource::get_cuda_stream(handle)));
 }
 
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
 void transpose_col_major_impl(
-  raft::device_resources const& handle,
+  raft::resources const& handle,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
   raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
 {
@@ -119,7 +121,7 @@ void transpose_col_major_impl(
   T constexpr kOne  = 1;
   T constexpr kZero = 0;
 
-  CUBLAS_TRY(cublasgeam(handle.get_cublas_handle(),
+  CUBLAS_TRY(cublasgeam(resource::get_cublas_handle(handle),
                         CUBLAS_OP_T,
                         CUBLAS_OP_N,
                         out_n_rows,
@@ -132,7 +134,7 @@ void transpose_col_major_impl(
                         out.stride(1),
                         out.data_handle(),
                         out.stride(1),
-                        handle.get_stream()));
+                        resource::get_cuda_stream(handle)));
 }
 };  // end namespace detail
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index 0b18e6175c..428b9ba618 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -95,7 +95,7 @@ void divide_scalar(raft::device_resources const& handle,
 
 /** @} */  // end of group add
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 03e94a10b1..7829f8e49f 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -219,7 +219,7 @@ void eig_jacobi(raft::device_resources const& handle,
 
 /** @} */  // end of eig
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh
index 96846003f6..019ec9f7ac 100644
--- a/cpp/include/raft/linalg/gemv.cuh
+++ b/cpp/include/raft/linalg/gemv.cuh
@@ -304,6 +304,6 @@ void gemv(raft::device_resources const& handle,
 }
 
 /** @} */  // end of gemv
 
-}; // namespace linalg
-}; // namespace raft
+};  // namespace linalg
+};  // namespace raft
 
 #endif
\ No newline at end of file
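The qr.cuh and transpose.cuh edits above typify the recurring change in the rest of this diff: raft::device_resources parameters widen to the opaque raft::resources base, and member accessors such as handle.get_cublas_handle() and handle.get_stream() become free functions under raft::resource. A before/after sketch with a toy helper (scale_inplace is our example, not a RAFT API):

    #include <cublas_v2.h>
    #include <raft/core/resource/cublas_handle.hpp>
    #include <raft/core/resource/cuda_stream.hpp>
    #include <raft/core/resources.hpp>

    // x *= alpha, executed on the handle's stream.
    inline void scale_inplace(raft::resources const& handle, float* x, int n, float alpha)
    {
      // Before: handle.get_cublas_handle() / handle.get_stream() on device_resources.
      cublasHandle_t cublas_h = raft::resource::get_cublas_handle(handle);
      cudaStream_t stream     = raft::resource::get_cuda_stream(handle);
      cublasSetStream(cublas_h, stream);
      cublasSscal(cublas_h, n, &alpha, x, 1);
    }

Taking the base class keeps the public headers decoupled from the full device_resources bundle: a caller only pays for the resources a function actually pulls out.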
diff --git a/cpp/include/raft/linalg/lanczos.cuh b/cpp/include/raft/linalg/lanczos.cuh
index c9f3e0010e..04e9980583 100644
--- a/cpp/include/raft/linalg/lanczos.cuh
+++ b/cpp/include/raft/linalg/lanczos.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,9 @@
 
 #pragma once
 
-#pragma message(__FILE__ \
-                " is deprecated and will be removed in a future release." \
-                " Please use the sparse solvers version instead.")
+#pragma message(__FILE__                                                  \
+                " is deprecated and will be removed in a future release." \
+                " Please use the sparse solvers version instead.")
 
 #include <raft/sparse/solver/lanczos.cuh>
diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh
index b36a9eba96..c753215737 100644
--- a/cpp/include/raft/linalg/lstsq.cuh
+++ b/cpp/include/raft/linalg/lstsq.cuh
@@ -244,7 +244,7 @@ void lstsq_qr(raft::device_resources const& handle,
 
 /** @} */  // end of lstsq
 
-}; // namespace linalg
-}; // namespace raft
+};  // namespace linalg
+};  // namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 59b2ca5ee5..6c65626ac5 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -238,7 +238,7 @@ void matrix_vector_op(raft::device_resources const& handle,
 
 /** @} */  // end of group matrix_vector_op
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index 62f4896d01..317c085673 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -74,7 +74,7 @@ void mean_squared_error(raft::device_resources const& handle,
 
 /** @} */  // end of group mean_squared_error
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 574b88c63d..bdca641616 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -97,7 +97,7 @@ void multiply_scalar(
 
 /** @} */  // end of group multiply
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 1fdfcb3780..057d6f6827 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -153,7 +153,7 @@ void power_scalar(
 
 /** @} */  // end of group add
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
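Aside from the copyright bump, the lanczos.cuh hunk only re-aligns the banner's continuation backslashes. The mechanism itself is plain preprocessor: #pragma message prints its string in every translation unit that includes the header, which is how RAFT steers users toward the sparse-solver replacement. A minimal reproduction, with a hypothetical header name:

    // old_solver.hpp -- hypothetical deprecated forwarding header
    #pragma once

    #pragma message(__FILE__ \
                    " is deprecated and will be removed in a future release.")

    // The real header then forwards to its replacement, e.g.:
    // #include <raft/sparse/solver/lanczos.cuh>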
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index 8e58af63c1..948996d0ac 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -19,6 +19,8 @@
 #pragma once
 
 #include "detail/qr.cuh"
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
 
 namespace raft {
 namespace linalg {
@@ -33,7 +35,7 @@ namespace linalg {
  * @param stream cuda stream
 */
 template <typename math_t>
-void qrGetQ(raft::device_resources const& handle,
+void qrGetQ(raft::resources const& handle,
             const math_t* M,
             math_t* Q,
             int n_rows,
@@ -54,7 +56,7 @@ void qrGetQ(raft::device_resources const& handle,
  * @param stream cuda stream
 */
 template <typename math_t>
-void qrGetQR(raft::device_resources const& handle,
+void qrGetQR(raft::resources const& handle,
              math_t* M,
              math_t* Q,
              math_t* R,
@@ -77,13 +79,18 @@ void qrGetQR(raft::device_resources const& handle,
  * @param[out] Q Output raft::device_matrix_view
 */
 template <typename ElementType, typename IndexType>
-void qr_get_q(raft::device_resources const& handle,
+void qr_get_q(raft::resources const& handle,
               raft::device_matrix_view<const ElementType, IndexType, raft::col_major> M,
               raft::device_matrix_view<ElementType, IndexType, raft::col_major> Q)
 {
   RAFT_EXPECTS(Q.size() == M.size(), "Size mismatch between Output and Input");
 
-  qrGetQ(handle, M.data_handle(), Q.data_handle(), M.extent(0), M.extent(1), handle.get_stream());
+  qrGetQ(handle,
+         M.data_handle(),
+         Q.data_handle(),
+         M.extent(0),
+         M.extent(1),
+         resource::get_cuda_stream(handle));
 }
 
 /**
@@ -94,7 +101,7 @@ void qr_get_q(raft::device_resources const& handle,
  * @param[out] R Output raft::device_matrix_view
 */
 template <typename ElementType, typename IndexType>
-void qr_get_qr(raft::device_resources const& handle,
+void qr_get_qr(raft::resources const& handle,
                raft::device_matrix_view<const ElementType, IndexType, raft::col_major> M,
                raft::device_matrix_view<ElementType, IndexType, raft::col_major> Q,
                raft::device_matrix_view<ElementType, IndexType, raft::col_major> R)
@@ -107,7 +114,7 @@ void qr_get_qr(raft::device_resources const& handle,
            R.data_handle(),
            M.extent(0),
            M.extent(1),
-           handle.get_stream());
+           resource::get_cuda_stream(handle));
 }
 
 /** @} */
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index ae5457c44f..06f62f207e 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -161,7 +161,7 @@ void reduce(raft::device_resources const& handle,
 
 /** @} */  // end of group reduction
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 2b744d8134..71c8cf14a1 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -112,7 +112,7 @@ void reduce_cols_by_key(
 
 /** @} */  // end of group reduce_cols_by_key
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 484b60238b..0e83c9aa2b 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -191,7 +191,7 @@ void reduce_rows_by_key(
 
 /** @} */  // end of group reduce_rows_by_key
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
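For reference, calling the mdspan overload qr_get_q updated above looks like the following sketch. We assume the signature shown in this diff (column-major views) plus the raft core factory helpers; a concrete device_resources is constructed because it binds to raft::resources const&:

    #include <raft/core/device_mdarray.hpp>
    #include <raft/core/device_resources.hpp>
    #include <raft/linalg/qr.cuh>

    void orthonormalize_example()
    {
      raft::device_resources handle;  // concrete bundle; converts to raft::resources const&
      auto M = raft::make_device_matrix<float, int, raft::col_major>(handle, 8, 4);
      auto Q = raft::make_device_matrix<float, int, raft::col_major>(handle, 8, 4);
      // ... fill M on the handle's stream ...
      raft::linalg::qr_get_q(handle, raft::make_const_mdspan(M.view()), Q.view());
    }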
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index eb94547f13..8a32467873 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -765,7 +765,7 @@ void rsvd_perc_symmetric_jacobi(Args... args)
 
 /** @} */  // end of group rsvd
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 55e661897d..eecc719617 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -83,7 +83,7 @@ void sqrt(raft::device_resources const& handle, InType in, OutType out)
 
 /** @} */  // end of group add
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index f58dfe28b3..25be368865 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -170,7 +170,7 @@ void strided_reduction(raft::device_resources const& handle,
 
 /** @} */  // end of group strided_reduction
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index da995b7a2a..cbd6b9df59 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -222,7 +222,7 @@ void subtract_scalar(
 
 /** @} */  // end of group subtract
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 4b78f2ef61..801d271fe9 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -415,7 +415,7 @@ void svd_reconstruction(raft::device_resources const& handle,
 
 /** @} */  // end of group svd
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh
index 1e347d69be..ce95e98499 100644
--- a/cpp/include/raft/linalg/ternary_op.cuh
+++ b/cpp/include/raft/linalg/ternary_op.cuh
@@ -83,7 +83,7 @@ void ternary_op(
 
 /** @} */  // end of group ternary_op
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh
index a0f418b4f7..0fe752347d 100644
--- a/cpp/include/raft/linalg/transpose.cuh
+++ b/cpp/include/raft/linalg/transpose.cuh
@@ -20,6 +20,7 @@
 #include "detail/transpose.cuh"
 
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/resources.hpp>
 
 namespace raft {
 namespace linalg {
@@ -34,7 +35,7 @@ namespace linalg {
 * @param stream: cuda stream
 */
 template <typename math_t>
-void transpose(raft::device_resources const& handle,
+void transpose(raft::resources const& handle,
               math_t* in,
               math_t* out,
               int n_rows,
@@ -76,7 +77,7 @@ void transpose(math_t* inout, int n, cudaStream_t stream)
 * @param[out] out Output matrix, storage is pre-allocated by caller.
 */
 template <typename T, typename IndexType, typename LayoutPolicy, typename AccessorPolicy>
-auto transpose(raft::device_resources const& handle,
+auto transpose(raft::resources const& handle,
               raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> in,
               raft::mdspan<T, raft::matrix_extent<IndexType>, LayoutPolicy, AccessorPolicy> out)
   -> std::enable_if_t<std::is_floating_point_v<T>, void>
@@ -102,7 +103,7 @@ auto transpose(raft::device_resources const& handle,
 
 /** @} */  // end of group transpose
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
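Likewise, a call sketch for the mdspan transpose overload above, assuming the default row-major layout from make_device_matrix (illustration only):

    #include <raft/core/device_mdarray.hpp>
    #include <raft/core/device_resources.hpp>
    #include <raft/linalg/transpose.cuh>

    void transpose_example()
    {
      raft::device_resources handle;
      auto in  = raft::make_device_matrix<float, int>(handle, 4, 3);  // row-major by default
      auto out = raft::make_device_matrix<float, int>(handle, 3, 4);  // receives the transpose
      raft::linalg::transpose(handle, in.view(), out.view());
    }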
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index 23f932d2f2..58ff2f6bd6 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -124,7 +124,7 @@ void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op)
 
 /** @} */  // end of group unary_op
 
-}; // end namespace linalg
-}; // end namespace raft
+};  // end namespace linalg
+};  // end namespace raft
 
 #endif
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index a4daf097e5..6546a48279 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -133,6 +133,6 @@ void sort_cols_per_row(Args... args)
 
 /** @} */  // end of group col_wise_sort
 
-}; // end namespace raft::matrix
+};  // end namespace raft::matrix
 
 #endif
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
new file mode 100644
index 0000000000..2b233c156d
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>                            // uint32_t
+#include <cuda_fp16.h>                        // __half
+#include <raft/util/raft_explicit.hpp>        // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>           // rmm::cuda_stream_view
+#include <rmm/mr/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+namespace raft::matrix::detail {
+
+template <typename T, typename IdxT>
+void select_k(const T* in_val,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out_val,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+}  // namespace raft::matrix::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
+  extern template void raft::matrix::detail::select_k<T, IdxT>(                     \
+    const T* in_val,                                                                \
+    const IdxT* in_idx,                                                             \
+    size_t batch_size,                                                              \
+    size_t len,                                                                     \
+    int k,                                                                          \
+    T* out_val,                                                                     \
+    IdxT* out_idx,                                                                  \
+    bool select_min,                                                                \
+    rmm::cuda_stream_view stream,                                                   \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_matrix_detail_select_k(float, int64_t);
+instantiate_raft_matrix_detail_select_k(float, uint32_t);
+// We did not have these two for double before, but there are tests for them. We
+// therefore include them here.
+instantiate_raft_matrix_detail_select_k(double, int64_t);
+instantiate_raft_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
new file mode 100644
index 0000000000..20c2fb119d
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "select_radix.cuh"
+#include "select_warpsort.cuh"
+
+#include <raft/core/nvtx.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device_memory_resource.hpp>
+
+namespace raft::matrix::detail {
+
+/**
+ * Select k smallest or largest key/values from each row in the input data.
+ *
+ * If you think of the input data `in_val` as a row-major matrix with `len` columns and
+ * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
+ * in the row-major matrix `out_val` of size (batch_size, k).
+ *
+ * @tparam T
+ *   the type of the keys (what is being compared).
+ * @tparam IdxT
+ *   the index type (what is being selected together with the keys).
+ *
+ * @param[in] in_val
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   these are compared and selected.
+ * @param[in] in_idx
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   typically, these are indices of the corresponding in_val.
+ * @param batch_size
+ *   number of input rows, i.e. the batch size.
+ * @param len
+ *   length of a single input array (row); also sometimes referred to as n_cols.
+ *   Invariant: len >= k.
+ * @param k
+ *   the number of outputs to select in each input row.
+ * @param[out] out_val
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the k smallest/largest values from each row of the `in_val`.
+ * @param[out] out_idx
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the payload selected together with `out_val`.
+ * @param select_min
+ *   whether to select k smallest (true) or largest (false) keys.
+ * @param stream
+ * @param mr an optional memory resource to use across the calls (you can provide a large enough
+ *   memory pool here to avoid memory allocations within the call).
+ */
+template <typename T, typename IdxT>
+void select_k(const T* in_val,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out_val,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr)
+{
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
+  // TODO (achirkin): investigate the trade-off for a wider variety of inputs.
+  const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128;
+  if (k <= select::warpsort::kMaxCapacity && !radix_faster) {
+    select::warpsort::select_k<T, IdxT>(
+      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
+  } else {
+    select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
+      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
+  }
+}
+
+}  // namespace raft::matrix::detail
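To make the documented contract concrete, a call sketch against the detail API above: select the 5 smallest values per row of a row-major batch. Treating a null in_idx as "return positions within each row" is our reading of the warpsort/radix backends, so consider that part an assumption; topk_example is our name:

    #include <cstdint>
    #include <raft/matrix/detail/select_k.cuh>
    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_uvector.hpp>

    void topk_example(rmm::cuda_stream_view stream)
    {
      const size_t batch_size = 16, len = 1024;
      const int k = 5;
      rmm::device_uvector<float> in_val(batch_size * len, stream);  // filled elsewhere
      rmm::device_uvector<float> out_val(batch_size * k, stream);
      rmm::device_uvector<int64_t> out_idx(batch_size * k, stream);
      raft::matrix::detail::select_k<float, int64_t>(in_val.data(),
                                                     nullptr,  // assumption: row-local indices
                                                     batch_size,
                                                     len,
                                                     k,
                                                     out_val.data(),
                                                     out_idx.data(),
                                                     /*select_min=*/true,
                                                     stream);
    }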
diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k.cuh
index 20c2fb119d..711169984b 100644
--- a/cpp/include/raft/matrix/detail/select_k.cuh
+++ b/cpp/include/raft/matrix/detail/select_k.cuh
@@ -13,79 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
-#include "select_radix.cuh"
-#include "select_warpsort.cuh"
-
-#include <raft/core/nvtx.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device_memory_resource.hpp>
-
-namespace raft::matrix::detail {
-
-/**
- * Select k smallest or largest key/values from each row in the input data.
- *
- * If you think of the input data `in_val` as a row-major matrix with `len` columns and
- * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
- * in the row-major matrix `out_val` of size (batch_size, k).
- *
- * @tparam T
- *   the type of the keys (what is being compared).
- * @tparam IdxT
- *   the index type (what is being selected together with the keys).
- *
- * @param[in] in_val
- *   contiguous device array of inputs of size (len * batch_size);
- *   these are compared and selected.
- * @param[in] in_idx
- *   contiguous device array of inputs of size (len * batch_size);
- *   typically, these are indices of the corresponding in_val.
- * @param batch_size
- *   number of input rows, i.e. the batch size.
- * @param len
- *   length of a single input array (row); also sometimes referred as n_cols.
- *   Invariant: len >= k.
- * @param k
- *   the number of outputs to select in each input row.
- * @param[out] out_val
- *   contiguous device array of outputs of size (k * batch_size);
- *   the k smallest/largest values from each row of the `in_val`.
- * @param[out] out_idx
- *   contiguous device array of outputs of size (k * batch_size);
- *   the payload selected together with `out_val`.
- * @param select_min
- *   whether to select k smallest (true) or largest (false) keys.
- * @param stream
- * @param mr an optional memory resource to use across the calls (you can provide a large enough
- *   memory pool here to avoid memory allocations within the call).
- */
-template <typename T, typename IdxT>
-void select_k(const T* in_val,
-              const IdxT* in_idx,
-              size_t batch_size,
-              size_t len,
-              int k,
-              T* out_val,
-              IdxT* out_idx,
-              bool select_min,
-              rmm::cuda_stream_view stream,
-              rmm::mr::device_memory_resource* mr = nullptr)
-{
-  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k);
-  // TODO (achirkin): investigate the trade-off for a wider variety of inputs.
-  const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128;
-  if (k <= select::warpsort::kMaxCapacity && !radix_faster) {
-    select::warpsort::select_k<T, IdxT>(
-      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr);
-  } else {
-    select::radix::select_k<T, IdxT, (sizeof(T) >= 4 ? 11 : 8), 512>(
-      in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, true, stream, mr);
-  }
-}
+#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
+#include "select_k-inl.cuh"
+#endif
 
-}  // namespace raft::matrix::detail
+#ifdef RAFT_COMPILED
+#include "select_k-ext.cuh"
+#endif
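The include choreography that replaces the inline implementation is the general pattern this PR rolls out: each primitive splits into a *-inl.cuh holding the definitions and a *-ext.cuh holding extern template declarations, with a thin dispatch header choosing between them. Sketched for a hypothetical foo.cuh:

    // foo.cuh -- dispatch header (hypothetical)
    #pragma once

    #ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
    #include "foo-inl.cuh"  // header-only builds: pull in the full definitions
    #endif

    #ifdef RAFT_COMPILED
    #include "foo-ext.cuh"  // linking libraft: extern template declarations
    #endif

Header-only users compile everything inline; users linking the precompiled library see the extern declarations and skip redundant instantiation; builds of libraft itself define RAFT_EXPLICIT_INSTANTIATE_ONLY to enforce the split.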
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 7ac40ac0eb..b7d02d6b52 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -778,7 +778,7 @@ void radix_topk(const T* in,
   auto pool_guard = raft::get_pool_memory_resource(mr, mem_req);
   if (pool_guard) {
     RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes",
-                   pool_guard->pool_size());
+                   mem_req);
   }
 
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
@@ -1031,10 +1031,7 @@ void radix_topk_one_block(const T* in,
     max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)) + 256 * 4  // might need extra memory for alignment
   );
-  if (pool_guard) {
-    RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes",
-                   pool_guard->pool_size());
-  }
+  if (pool_guard) { RAFT_LOG_DEBUG("radix::select_k: using pool memory resource"); }
 
   rmm::device_uvector<T> buf1(len * max_chunk_size, stream, mr);
   rmm::device_uvector<IdxT> idx_buf1(len * max_chunk_size, stream, mr);
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index d362b73792..dc86a04733 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -27,7 +27,7 @@
 #include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
-#include <raft/core/device_resources.hpp>
+#include <raft/core/resources.hpp>
 #include <rmm/device_uvector.hpp>
 
 /*
@@ -870,8 +870,7 @@ struct launch_setup {
 };
 
 template