diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bed83fca98..fd42a2842c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,19 +69,17 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} - package-name: pylibraft - 
package-dir: python/pylibraft - skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + script: ci/build_wheel_pylibraft.sh wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -91,19 +89,17 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} - package-name: raft_dask - package-dir: python/raft-dask - skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + script: ci/build_wheel_raft_dask.sh wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 28efc135b2..e7f3a1caff 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,41 +23,41 @@ jobs: - wheel-build-raft-dask - wheel-tests-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 checks: secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -67,40 +67,28 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 with: build_type: pull-request - package-name: pylibraft - package-dir: python/pylibraft - 
skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 with: build_type: pull-request - package-name: pylibraft - test-unittest: "python -m pytest ./python/pylibraft/pylibraft/test" - test-smoketest: "python ./ci/wheel_smoke_test_pylibraft.py" + script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.08 with: build_type: pull-request - package-name: raft_dask - package-dir: python/raft-dask - before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft && python -m pip install --no-deps ./local-pylibraft/pylibraft*.whl" - skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 with: build_type: pull-request - package-name: raft_dask - # Always want to test against latest dask/distributed. 
- test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest ./python/raft-dask/raft_dask/test" - test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py" + script: ci/test_wheel_raft_dask.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ffd7fa3bcb..b752576b75 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,23 +32,19 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 
with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: pylibraft - test-unittest: "python -m pytest ./python/pylibraft/pylibraft/test" + script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.08 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: raft_dask - test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest ./python/raft-dask/raft_dask/test" + script: ci/test_wheel_raft_dask.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 16c3ba4985..8642f2bdf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,97 @@ +# raft 23.08.00 (9 Aug 2023) + +## 🚨 Breaking Changes + +- Separate CAGRA index type from internal idx type ([#1664](https://github.com/rapidsai/raft/pull/1664)) [@tfeher](https://github.com/tfeher) +- Stop using setup.py in build.sh ([#1645](https://github.com/rapidsai/raft/pull/1645)) [@vyasr](https://github.com/vyasr) +- CAGRA max_queries auto configuration ([#1613](https://github.com/rapidsai/raft/pull/1613)) [@enp1s0](https://github.com/enp1s0) +- Rename the CAGRA prune function to optimize ([#1588](https://github.com/rapidsai/raft/pull/1588)) [@enp1s0](https://github.com/enp1s0) +- CAGRA pad dataset for 128bit vectorized load ([#1505](https://github.com/rapidsai/raft/pull/1505)) [@tfeher](https://github.com/tfeher) +- Sparse Pairwise Distances API Updates 
([#1502](https://github.com/rapidsai/raft/pull/1502)) [@divyegala](https://github.com/divyegala) +- Cagra index construction without copying device mdarrays ([#1494](https://github.com/rapidsai/raft/pull/1494)) [@tfeher](https://github.com/tfeher) +- [FEA] Masked NN for connect_components ([#1445](https://github.com/rapidsai/raft/pull/1445)) [@tarang-jain](https://github.com/tarang-jain) +- Limiting workspace memory resource ([#1356](https://github.com/rapidsai/raft/pull/1356)) [@achirkin](https://github.com/achirkin) + +## 🐛 Bug Fixes + +- Remove push condition on docs-build ([#1693](https://github.com/rapidsai/raft/pull/1693)) [@raydouglass](https://github.com/raydouglass) +- IVF-PQ: Fix illegal memory access with large max_samples ([#1685](https://github.com/rapidsai/raft/pull/1685)) [@achirkin](https://github.com/achirkin) +- Fix missing parameter for select_k ([#1682](https://github.com/rapidsai/raft/pull/1682)) [@ucassjy](https://github.com/ucassjy) +- Separate CAGRA index type from internal idx type ([#1664](https://github.com/rapidsai/raft/pull/1664)) [@tfeher](https://github.com/tfeher) +- Add rmm to pylibraft run dependencies, since it is used by Cython. 
([#1656](https://github.com/rapidsai/raft/pull/1656)) [@bdice](https://github.com/bdice) +- Hotfix: wrong constant in IVF-PQ fp_8bit2half ([#1654](https://github.com/rapidsai/raft/pull/1654)) [@achirkin](https://github.com/achirkin) +- Fix sparse KNN for large batches ([#1640](https://github.com/rapidsai/raft/pull/1640)) [@viclafargue](https://github.com/viclafargue) +- Fix uploading of RAFT nightly packages ([#1638](https://github.com/rapidsai/raft/pull/1638)) [@dantegd](https://github.com/dantegd) +- Fix cagra multi CTA bug ([#1628](https://github.com/rapidsai/raft/pull/1628)) [@enp1s0](https://github.com/enp1s0) +- pass correct stream to cutlass kernel launch of L2/cosine pairwise distance kernels ([#1597](https://github.com/rapidsai/raft/pull/1597)) [@mdoijade](https://github.com/mdoijade) +- Fix launchconfig y-gridsize too large in epilogue kernel ([#1586](https://github.com/rapidsai/raft/pull/1586)) [@mfoerste4](https://github.com/mfoerste4) +- Fix update version and pinnings for 23.08. 
([#1556](https://github.com/rapidsai/raft/pull/1556)) [@bdice](https://github.com/bdice) +- Fix for function exposing KNN merge ([#1418](https://github.com/rapidsai/raft/pull/1418)) [@viclafargue](https://github.com/viclafargue) + +## 📖 Documentation + +- Critical doc fixes and updates for 23.08 ([#1705](https://github.com/rapidsai/raft/pull/1705)) [@cjnolet](https://github.com/cjnolet) +- Fix the documentation about changing the logging level ([#1596](https://github.com/rapidsai/raft/pull/1596)) [@enp1s0](https://github.com/enp1s0) +- Fix raft::bitonic_sort small usage example ([#1580](https://github.com/rapidsai/raft/pull/1580)) [@enp1s0](https://github.com/enp1s0) + +## 🚀 New Features + +- Use rapids-cmake new parallel testing feature ([#1623](https://github.com/rapidsai/raft/pull/1623)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for row-major slice ([#1591](https://github.com/rapidsai/raft/pull/1591)) [@lowener](https://github.com/lowener) +- IVF-PQ tutorial notebook ([#1544](https://github.com/rapidsai/raft/pull/1544)) [@achirkin](https://github.com/achirkin) +- [FEA] Masked NN for connect_components ([#1445](https://github.com/rapidsai/raft/pull/1445)) [@tarang-jain](https://github.com/tarang-jain) +- raft: Build CUDA 12 packages ([#1388](https://github.com/rapidsai/raft/pull/1388)) [@vyasr](https://github.com/vyasr) +- Limiting workspace memory resource ([#1356](https://github.com/rapidsai/raft/pull/1356)) [@achirkin](https://github.com/achirkin) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release ([#1711](https://github.com/rapidsai/raft/pull/1711)) [@galipremsagar](https://github.com/galipremsagar) +- Add algo parameter for CAGRA ANN bench ([#1687](https://github.com/rapidsai/raft/pull/1687)) [@tfeher](https://github.com/tfeher) +- ANN benchmarks python wrapper for splitting billion-scale dataset groundtruth ([#1679](https://github.com/rapidsai/raft/pull/1679)) [@divyegala](https://github.com/divyegala) +- 
Rename CAGRA parameter num_parents to search_width ([#1676](https://github.com/rapidsai/raft/pull/1676)) [@tfeher](https://github.com/tfeher) +- Renaming namespaces to promote CAGRA from experimental ([#1666](https://github.com/rapidsai/raft/pull/1666)) [@cjnolet](https://github.com/cjnolet) +- CAGRA Python wrappers ([#1665](https://github.com/rapidsai/raft/pull/1665)) [@dantegd](https://github.com/dantegd) +- Add notebook for Vector Search - Question Retrieval ([#1662](https://github.com/rapidsai/raft/pull/1662)) [@lowener](https://github.com/lowener) +- Fix CMake CUDA support for pylibraft when raft is found. ([#1659](https://github.com/rapidsai/raft/pull/1659)) [@bdice](https://github.com/bdice) +- Cagra ANN benchmark improvements ([#1658](https://github.com/rapidsai/raft/pull/1658)) [@tfeher](https://github.com/tfeher) +- ANN-benchmarks: avoid using the dataset during search when possible ([#1657](https://github.com/rapidsai/raft/pull/1657)) [@achirkin](https://github.com/achirkin) +- Revert CUDA 12.0 CI workflows to branch-23.08. 
([#1652](https://github.com/rapidsai/raft/pull/1652)) [@bdice](https://github.com/bdice) +- ANN: Optimize host-side refine ([#1651](https://github.com/rapidsai/raft/pull/1651)) [@achirkin](https://github.com/achirkin) +- Cagra template instantiations ([#1650](https://github.com/rapidsai/raft/pull/1650)) [@tfeher](https://github.com/tfeher) +- Modify comm_split to avoid ucp ([#1649](https://github.com/rapidsai/raft/pull/1649)) [@ChuckHastings](https://github.com/ChuckHastings) +- Stop using setup.py in build.sh ([#1645](https://github.com/rapidsai/raft/pull/1645)) [@vyasr](https://github.com/vyasr) +- IVF-PQ: Add a (faster) direct conversion fp8->half ([#1644](https://github.com/rapidsai/raft/pull/1644)) [@achirkin](https://github.com/achirkin) +- Simplify `bench/ann` scripts to Python based module ([#1642](https://github.com/rapidsai/raft/pull/1642)) [@divyegala](https://github.com/divyegala) +- Further removal of uses-setup-env-vars ([#1639](https://github.com/rapidsai/raft/pull/1639)) [@dantegd](https://github.com/dantegd) +- Drop blank line in `raft-dask/meta.yaml` ([#1637](https://github.com/rapidsai/raft/pull/1637)) [@jakirkham](https://github.com/jakirkham) +- Enable conservative memory allocations for RAFT IVF-Flat benchmarks. 
([#1634](https://github.com/rapidsai/raft/pull/1634)) [@tfeher](https://github.com/tfeher) +- [FEA] Codepacking for IVF-flat ([#1632](https://github.com/rapidsai/raft/pull/1632)) [@tarang-jain](https://github.com/tarang-jain) +- Fixing ann bench cmake (and docs) ([#1630](https://github.com/rapidsai/raft/pull/1630)) [@cjnolet](https://github.com/cjnolet) +- [WIP] Test CI issues ([#1626](https://github.com/rapidsai/raft/pull/1626)) [@VibhuJawa](https://github.com/VibhuJawa) +- Set pool memory resource for raft IVF ANN benchmarks ([#1625](https://github.com/rapidsai/raft/pull/1625)) [@tfeher](https://github.com/tfeher) +- Adding sort option to matrix::select_k api ([#1615](https://github.com/rapidsai/raft/pull/1615)) [@cjnolet](https://github.com/cjnolet) +- CAGRA max_queries auto configuration ([#1613](https://github.com/rapidsai/raft/pull/1613)) [@enp1s0](https://github.com/enp1s0) +- Use exceptions instead of `exit(-1)` ([#1594](https://github.com/rapidsai/raft/pull/1594)) [@benfred](https://github.com/benfred) +- [REVIEW] Add scheduler_file argument to support MNMG setup ([#1593](https://github.com/rapidsai/raft/pull/1593)) [@VibhuJawa](https://github.com/VibhuJawa) +- Rename the CAGRA prune function to optimize ([#1588](https://github.com/rapidsai/raft/pull/1588)) [@enp1s0](https://github.com/enp1s0) +- This PR adds support to __half and nb_bfloat16 to myAtomicReduce ([#1585](https://github.com/rapidsai/raft/pull/1585)) [@Kh4ster](https://github.com/Kh4ster) +- [IMP] move core CUDA RT macros to cuda_rt_essentials.hpp ([#1584](https://github.com/rapidsai/raft/pull/1584)) [@MatthiasKohl](https://github.com/MatthiasKohl) +- preprocessor syntax fix ([#1582](https://github.com/rapidsai/raft/pull/1582)) [@AyodeAwe](https://github.com/AyodeAwe) +- use rapids-upload-docs script ([#1578](https://github.com/rapidsai/raft/pull/1578)) [@AyodeAwe](https://github.com/AyodeAwe) +- Unpin `dask` and `distributed` for development and fix `merge_labels` test 
([#1574](https://github.com/rapidsai/raft/pull/1574)) [@galipremsagar](https://github.com/galipremsagar) +- Remove documentation build scripts for Jenkins ([#1570](https://github.com/rapidsai/raft/pull/1570)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add support to __half and nv_bfloat16 to most math functions ([#1554](https://github.com/rapidsai/raft/pull/1554)) [@Kh4ster](https://github.com/Kh4ster) +- Add RAFT ANN benchmark for CAGRA ([#1552](https://github.com/rapidsai/raft/pull/1552)) [@enp1s0](https://github.com/enp1s0) +- Update CAGRA knn_graph_sort to use Raft::bitonic_sort ([#1550](https://github.com/rapidsai/raft/pull/1550)) [@enp1s0](https://github.com/enp1s0) +- Add identity matrix function ([#1548](https://github.com/rapidsai/raft/pull/1548)) [@lowener](https://github.com/lowener) +- Unpin scikit-build upper bound ([#1547](https://github.com/rapidsai/raft/pull/1547)) [@vyasr](https://github.com/vyasr) +- Migrate wheel workflow scripts locally ([#1546](https://github.com/rapidsai/raft/pull/1546)) [@divyegala](https://github.com/divyegala) +- Add sample filtering for ivf_flat. Filtering code refactoring and cleanup ([#1541](https://github.com/rapidsai/raft/pull/1541)) [@alexanderguzhva](https://github.com/alexanderguzhva) +- CAGRA pad dataset for 128bit vectorized load ([#1505](https://github.com/rapidsai/raft/pull/1505)) [@tfeher](https://github.com/tfeher) +- Sparse Pairwise Distances API Updates ([#1502](https://github.com/rapidsai/raft/pull/1502)) [@divyegala](https://github.com/divyegala) +- Add CAGRA gbench ([#1496](https://github.com/rapidsai/raft/pull/1496)) [@tfeher](https://github.com/tfeher) +- Cagra index construction without copying device mdarrays ([#1494](https://github.com/rapidsai/raft/pull/1494)) [@tfeher](https://github.com/tfeher) + # raft 23.06.00 (7 Jun 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index 10cd7b16fc..2c7f83ad02 100755 --- a/README.md +++ b/README.md @@ -1,19 +1,20 @@ -#
 RAFT: Reusable Accelerated Functions and Tools
+#
 RAFT: Reusable Accelerated Functions and Tools for Vector Search and More
-![Navigating the canyons of accelerated possibilities](img/raft.png) +![RAFT tech stack](img/raft-tech-stack-vss.png) ## Resources - [RAFT Reference Documentation](https://docs.rapids.ai/api/raft/stable/): API Documentation. - [RAFT Getting Started](./docs/source/quick_start.md): Getting started with RAFT. - [Build and Install RAFT](./docs/source/build.md): Instructions for installing and building RAFT. +- [Example Notebooks](./notebooks): Example Jupyter notebooks - [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. - [GitHub repository](https://github.com/rapidsai/raft): Download the RAFT source code. - [Issue tracker](https://github.com/rapidsai/raft/issues): Report issues or request features. ## Overview -RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. The algorithms are CUDA-accelerated and form building blocks for rapidly composing analytics. +RAFT contains fundamental widely-used algorithms and primitives for machine learning and information retrieval. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. 
By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time @@ -22,20 +23,20 @@ By taking a primitives-based approach to algorithm development, RAFT While not exhaustive, the following general categories help summarize the accelerated functions in RAFT: ##### -| Category | Examples | -| --- | --- | -| **Data Formats** | sparse & dense, conversions, data generation | +| Category | Examples | +| --- |-----------------------------------------------------------------------------------------------------------------------------------| +| **Data Formats** | sparse & dense, conversions, data generation | | **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems | -| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling | -| **Spatial** | pairwise distances, nearest neighbors, neighborhood graph construction | -| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means | -| **Solvers** | combinatorial optimization, iterative solvers | -| **Statistics** | sampling, moments and summary statistics, metrics | -| **Tools & Utilities** | common utilities for developing CUDA applications, multi-node multi-gpu infrastructure | +| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling | +| **Spatial** | pairwise distances, nearest neighbors and vector search, neighborhood graph construction | +| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means | +| **Solvers** | combinatorial optimization, iterative solvers | +| **Statistics** | sampling, moments and summary statistics, metrics | +| **Tools & Utilities** | common utilities for developing CUDA applications, multi-node multi-gpu infrastructure | -RAFT is a C++ 
header-only template library with an optional shared library that -1) can speed up compile times for common template types, and +RAFT is a C++ header-only template library with an optional shared library that +1) can speed up compile times for common template types, and 2) provides host-accessible "runtime" APIs, which don't require a CUDA compiler to use In addition being a C++ library, RAFT also provides 2 Python libraries: @@ -44,6 +45,29 @@ In addition being a C++ library, RAFT also provides 2 Python libraries: ![RAFT is a C++ header-only template library with optional shared library and lightweight Python wrappers](img/arch.png) +## Use cases + +### Vector Similarity Search + +RAFT contains state-of-the-art implementations of approximate nearest neighbors algorithms on the GPU that enable vector similarity search. Vector similarity search applications often require fast online queries done one-at-a-time and RAFT's graph-based [CAGRA](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#cagra) algorithm outperforms the state-of-the-art on the CPU (hierarchical navigable small-world graph or HNSW). + +In addition to CAGRA, RAFT contains other state-of-the-art GPU-accelerated implementations of popular algorithms for vector similarity search, such as [IVF-Flat](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-flat) and [IVF-PQ](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-pq) algorithms originally popularized by the [FAISS](https://github.com/facebookresearch/faiss) library. + +### Information Retrieval + +RAFT also contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as + +1. Computing distances between vectors and computing kernel Gram matrices +2. Performing ball radius queries for constructing epsilon neighborhoods +3. Clustering points to partition a space for smaller and faster searches +4. 
Constructing neighborhood "connectivities" graphs from dense vectors + +As an example, computations such as the above list are critical for information retrieval, data mining, and machine learning applications such as clustering, manifold learning, and dimensionality reduction. + +## Is RAFT right for me? + +RAFT contains low level primitives for accelerating applications and workflows. Data source providers and application developers may find specific tools -- like ANN algorithms -- very useful. RAFT is not intended to be used directly by data scientists for discovery and experimentation. For data science tools, please see the [RAPIDS website](https://rapids.ai/). + ## Getting started ### RAPIDS Memory Manager (RMM) @@ -291,6 +315,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders: - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT. - `test`: Googletests source code - `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) +- `notebooks`: IPython notebooks with usage examples and tutorials - `python`: Source code for Python libraries. - `pylibraft`: Python build and source code for pylibraft library - `raft-dask`: Python build and source code for raft-dask library @@ -322,3 +347,14 @@ If citing the sparse pairwise distances API, please consider using the following year={2021} } ``` + +If citing the single-linkage agglomerative clustering APIs, please consider the following bibtex: +```bibtex +@misc{nolet2023cuslink, + title={cuSLINK: Single-linkage Agglomerative Clustering on the GPU}, + author={Corey J. 
Nolet and Divye Gala and Alex Fender and Mahesh Doijade and Joe Eaton and Edward Raff and John Zedlewski and Brad Rees and Tim Oates}, + year={2023}, + eprint={2306.16354}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} \ No newline at end of file diff --git a/build.sh b/build.sh index ab904abdad..1213500159 100755 --- a/build.sh +++ b/build.sh @@ -88,9 +88,7 @@ DISABLE_DEPRECATION_WARNINGS=ON CMAKE_TARGET="" # Set defaults for vars that may not have been defined externally -# FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check -# CONDA_PREFIX, but there is no fallback from there! -INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX}}} +INSTALL_PREFIX=${INSTALL_PREFIX:=${PREFIX:=${CONDA_PREFIX:=$LIBRAFT_BUILD_DIR/install}}} PARALLEL_LEVEL=${PARALLEL_LEVEL:=`nproc`} BUILD_ABI=${BUILD_ABI:=ON} @@ -367,8 +365,9 @@ if [[ ${CMAKE_TARGET} == "" ]]; then fi # Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. +SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" + SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" fi # If clean given, run it prior to any other steps @@ -383,14 +382,6 @@ if (( ${CLEAN} == 1 )); then rmdir ${bd} || true fi done - - cd ${REPODIR}/python/raft-dask - python setup.py clean --all - cd ${REPODIR} - - cd ${REPODIR}/python/pylibraft - python setup.py clean --all - cd ${REPODIR} fi ################################################################################ @@ -484,29 +475,16 @@ fi # Build and (optionally) install the pylibraft Python package if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then - # Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. 
- if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" - fi - cd ${REPODIR}/python/pylibraft - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS} - fi + SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft fi # Build and (optionally) install the raft-dask Python package if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then - # Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. - if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON" - fi - - cd ${REPODIR}/python/raft-dask - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH="${RAFT_DASK_BUILD_DIR};${INSTALL_PREFIX}" -DCMAKE_LIBRARY_PATH=${LIBRAFT_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} ${EXTRA_CMAKE_ARGS} - fi + SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask fi diff --git a/ci/build_docs.sh b/ci/build_docs.sh index b1cb993798..4f99348c95 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,7 +19,6 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) 
-VERSION_NUMBER="23.06" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ @@ -29,21 +28,21 @@ rapids-mamba-retry install \ pylibraft \ raft-dask +export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_DOCS_DIR="$(mktemp -d)" -rapids-logger "Build Doxygen docs" +rapids-logger "Build CPP docs" pushd cpp/doxygen doxygen Doxyfile popd -rapids-logger "Build Sphinx docs" +rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml source _html sphinx-build -b text source _text +mkdir -p "${RAPIDS_DOCS_DIR}/raft/"{html,txt} +mv _html/* "${RAPIDS_DOCS_DIR}/raft/html" +mv _text/* "${RAPIDS_DOCS_DIR}/raft/txt" popd - -if [[ ${RAPIDS_BUILD_TYPE} != "pull-request" ]]; then - rapids-logger "Upload Docs to S3" - aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/raft/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/raft/${VERSION_NUMBER}/txt" -fi +rapids-upload-docs diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 0000000000..a9f7f64294 --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 + +source rapids-configure-sccache +source rapids-date-string + +# Use gha-tools rapids-pip-wheel-version to generate wheel version then +# update the necessary files +version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" +echo "The package name and/or version was modified in the package source. The git diff is:" +git diff + +cd "${package_dir}" + +# Hardcode the output dir +python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check + +mkdir -p final_dist +python -m auditwheel repair -w final_dist dist/* + +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh new file mode 100755 index 0000000000..f17f038675 --- /dev/null +++ b/ci/build_wheel_pylibraft.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +# Set up skbuild options. Enable sccache in skbuild config options +export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + +ci/build_wheel.sh pylibraft python/pylibraft diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh new file mode 100755 index 0000000000..f0204d45c0 --- /dev/null +++ b/ci/build_wheel_raft_dask.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +# Set up skbuild options. Enable sccache in skbuild config options +export SKBUILD_CONFIGURE_OPTIONS="-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +RAPIDS_PY_WHEEL_NAME=pylibraft_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibraft +python -m pip install --no-deps ./local-pylibraft/pylibraft*.whl + +ci/build_wheel.sh raft_dask python/raft-dask diff --git a/ci/docs/build.sh b/ci/docs/build.sh deleted file mode 100644 index e3062107c0..0000000000 --- a/ci/docs/build.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
-################################# -# RAFT docs build script for CI # -################################# - -if [ -z "$PROJECT_WORKSPACE" ]; then - echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" - echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" - exit 1 -fi - -export DOCS_WORKSPACE="$WORKSPACE/docs" -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME="$WORKSPACE" -export PROJECT_WORKSPACE=/rapids/raft -export PROJECTS=(raft) - -gpuci_logger "Check environment" -env - -gpuci_logger "Check GPU usage" -nvidia-smi - - -gpuci_logger "Activate conda env" -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -gpuci_logger "Check versions" -python --version -$CC --version -$CXX --version - -gpuci_logger "Show conda info" -conda info -conda config --show-sources -conda list --show-channel-urls - -# Build Doxygen docs -gpuci_logger "Build Doxygen and Sphinx docs" -"$PROJECT_WORKSPACE/build.sh" docs -v - -#Commit to Website -cd "$DOCS_WORKSPACE" - -for PROJECT in ${PROJECTS[@]}; do - if [ ! 
-d "api/$PROJECT/$BRANCH_VERSION" ]; then - mkdir -p "api/$PROJECT/$BRANCH_VERSION" - fi - rm -rf "$DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/"* -done - -mv "$PROJECT_WORKSPACE/docs/_html/"* "$DOCS_WORKSPACE/api/raft/$BRANCH_VERSION" \ No newline at end of file diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f6c6b08644..ef935ba518 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,6 +25,10 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*" +# Need to distutils-normalize the original version +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))") + echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" # Inplace sed replace; workaround for Linux and Mac @@ -33,6 +37,7 @@ function sed_runner() { } sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/CMakeLists.txt +sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake sed_runner "s/set(RAFT_VERSION .*)/set(RAFT_VERSION \"${NEXT_FULL_TAG}\")/g" cpp/CMakeLists.txt sed_runner 's/'"pylibraft_version .*)"'/'"pylibraft_version ${NEXT_FULL_TAG})"'/g' python/pylibraft/CMakeLists.txt sed_runner 's/'"raft_dask_version .*)"'/'"raft_dask_version ${NEXT_FULL_TAG})"'/g' python/raft-dask/CMakeLists.txt @@ -50,13 +55,23 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/p sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py -for FILE in conda/environments/*.yaml 
dependencies.yaml; do - sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rapids-build-env=${CURRENT_SHORT_TAG}/rapids-build-env=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rapids-doc-env=${CURRENT_SHORT_TAG}/rapids-doc-env=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rapids-notebook-env=${CURRENT_SHORT_TAG}/rapids-notebook-env=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCX_PY_VERSION}/g" ${FILE}; +DEPENDENCIES=( + dask-cuda + pylibraft + rmm + # ucx-py is handled separately below +) +for FILE in dependencies.yaml conda/environments/*.yaml; do + for DEP in "${DEPENDENCIES[@]}"; do + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*/g" ${FILE}; + done + sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; +done +for FILE in python/*/pyproject.toml; do + for DEP in "${DEPENDENCIES[@]}"; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + done + sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE} done sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml @@ -66,21 +81,10 @@ for FILE in .github/workflows/*.yaml; do sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done -# Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))") - -# Dependency versions in pyproject.toml -sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pylibraft/pyproject.toml - -sed_runner 
"s/pylibraft==.*\",/pylibraft==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/pyproject.toml -sed_runner "s/dask-cuda==.*\",/dask-cuda==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/pyproject.toml -sed_runner "s/ucx-py.*\",/ucx-py==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\",/g" python/raft-dask/pyproject.toml - for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done -sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index e32697a68a..9c487be156 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -36,12 +36,7 @@ trap "EXITCODE=1" ERR set +e # Run libraft gtests from libraft-tests package -rapids-logger "Run gtests" -for gt in "$CONDA_PREFIX"/bin/gtests/libraft/* ; do - test_name=$(basename ${gt}) - echo "Running gtest $test_name" - ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} -done +ctest -j8 --output-on-failure rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh new file mode 100755 index 0000000000..d990a0e6c2 --- /dev/null +++ b/ci/test_wheel_pylibraft.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +set -euo pipefail + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/pylibraft*.whl)[test] + +# Run smoke tests for aarch64 pull requests +if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then + python ./ci/wheel_smoke_test_pylibraft.py +else + python -m pytest ./python/pylibraft/pylibraft/test +fi diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh new file mode 100755 index 0000000000..6aa459ca7c --- /dev/null +++ b/ci/test_wheel_raft_dask.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# Download the pylibraft built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep +python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl + +# Always install latest dask for testing +python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.08 + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/raft_dask*.whl)[test] + +# Run smoke tests for aarch64 pull requests +if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then + python ./ci/wheel_smoke_test_raft_dask.py +else + python -m pytest ./python/raft-dask/raft_dask/test +fi diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 
9cb299889d..55e03f0be4 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -13,15 +13,16 @@ dependencies: - clang=16.0.1 - cmake>=3.23.1,!=3.25.0 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0 -- cudatoolkit=11.8 +- cuda-python>=11.7.1,<12.0a0 +- cuda-version=11.8 +- cudatoolkit - cupy>=12.0.0 - cxx-compiler - cython>=0.29,<0.30 -- dask-core==2023.3.2 -- dask-cuda==23.6.* -- dask==2023.3.2 -- distributed==2023.3.2.1 +- dask-core==2023.7.1 +- dask-cuda==23.8.* +- dask==2023.7.1 +- distributed==2023.7.1 - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -46,14 +47,14 @@ dependencies: - pytest - pytest-cov - recommonmark -- rmm==23.6.* -- scikit-build>=0.13.1,<0.17.2 +- rmm==23.8.* +- scikit-build>=0.13.1 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py=0.32.* +- ucx-py==0.33.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 0000000000..28d7dd0591 --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,56 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.1 +- clang=16.0.1 +- cmake>=3.23.1,!=3.25.0 +- cuda-cudart-dev +- cuda-profiler-api +- cuda-python>=12.0,<13.0a0 +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-core==2023.7.1 +- dask-cuda==23.8.* +- dask==2023.7.1 +- distributed==2023.7.1 +- doxygen>=1.8.20 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- joblib>=0.11 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- nccl>=2.9.9 +- ninja +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==23.8.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-64==2.17 +- ucx-proc=*=gpu +- ucx-py==0.33.* +- ucx>=1.13.0 +name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 3ea560025e..a982febeed 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -12,7 +12,8 @@ dependencies: - clang=16.0.1 - cmake>=3.23.1,!=3.25.0 - cuda-profiler-api=11.8.86 -- cudatoolkit=11.8 +- cuda-version=11.8 +- cudatoolkit - cxx-compiler - cython>=0.29,<0.30 - faiss-proc=*=cuda @@ -29,9 +30,10 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libfaiss>=1.7.1 +- matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 -- scikit-build>=0.13.1,<0.17.2 +- scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/recipes/libraft/build_libraft_template.sh b/conda/recipes/libraft/build_libraft_template.sh index 9759402884..bd7719af76 100644 --- a/conda/recipes/libraft/build_libraft_template.sh +++ b/conda/recipes/libraft/build_libraft_template.sh @@ -2,4 +2,4 
@@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. # Just building template so we verify it uses libraft.so and fail if it doesn't build -./build.sh template \ No newline at end of file +./build.sh template diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index bec773d26d..c8dcce90eb 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -5,6 +5,9 @@ cxx_compiler_version: - 11 cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc sysroot_version: @@ -31,40 +34,40 @@ h5py_version: nlohmann_json_version: - ">=3.11.2" -# The CTK libraries below are missing from the conda-forge::cudatoolkit -# package. The "*_host_*" version specifiers correspond to `11.8` packages and the -# "*_run_*" version specifiers correspond to `11.x` packages. +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. -libcublas_host_version: +cuda11_libcublas_host_version: - "=11.11.3.6" -libcublas_run_version: +cuda11_libcublas_run_version: - ">=11.5.2.43,<12.0.0" -libcurand_host_version: +cuda11_libcurand_host_version: - "=10.3.0.86" -libcurand_run_version: +cuda11_libcurand_run_version: - ">=10.2.5.43,<10.3.1" -libcusolver_host_version: +cuda11_libcusolver_host_version: - "=11.4.1.48" -libcusolver_run_version: +cuda11_libcusolver_run_version: - ">=11.2.0.43,<11.4.2" -libcusparse_host_version: +cuda11_libcusparse_host_version: - "=11.7.5.86" -libcusparse_run_version: +cuda11_libcusparse_run_version: - ">=11.6.0.43,<12.0.0" # `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all # architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the # "*_run_*" version specifiers correspond to `11.x` packages. 
-cuda_profiler_api_host_version: +cuda11_cuda_profiler_api_host_version: - "=11.8.86" -cuda_profiler_api_run_version: +cuda11_cuda_profiler_api_run_version: - ">=11.4.240,<12" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index b89fcfb788..09ef7ae4ab 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -40,21 +40,34 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} - librmm requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: + {% if cuda_major != "11" %} + - cuda-cudart-dev + {% endif %} + - cuda-version ={{ cuda_version }} - librmm ={{ minor_version }} - - cudatoolkit {{ cuda_version }} run: - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} - librmm ={{ minor_version }} about: home: https://rapids.ai/ @@ -66,21 +79,36 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} - librmm requirements: + host: + - cuda-version ={{ cuda_version }} run: - {{ pin_subpackage('libraft-headers-only', exact=True) }} - - cuda-profiler-api {{ cuda_profiler_api_run_version }} - librmm ={{ minor_version }} - - libcublas {{ 
libcublas_run_version }} - - libcublas-dev {{ libcublas_run_version }} - - libcurand {{ libcurand_run_version }} - - libcurand-dev {{ libcurand_run_version }} - - libcusolver {{ libcusolver_run_version }} - - libcusolver-dev {{ libcusolver_run_version }} - - libcusparse {{ libcusparse_run_version }} - - libcusparse-dev {{ libcusparse_run_version }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_run_version }} + - libcublas-dev {{ cuda11_libcublas_run_version }} + - libcurand {{ cuda11_libcurand_run_version }} + - libcurand-dev {{ cuda11_libcurand_run_version }} + - libcusolver {{ cuda11_libcusolver_run_version }} + - libcusolver-dev {{ cuda11_libcusolver_run_version }} + - libcusparse {{ cuda11_libcusparse_run_version }} + - libcusparse-dev {{ cuda11_libcusparse_run_version }} + {% else %} + - cuda-cudart-dev + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} about: home: https://rapids.ai/ license: Apache-2.0 @@ -93,29 +121,45 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - - {{ compiler('cuda') }} {{ cuda_version }} - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft-headers', exact=True) }} - - cudatoolkit {{ cuda_version }} - - cuda-profiler-api {{ cuda_profiler_api_host_version }} - - libcublas {{ 
libcublas_host_version }} - - libcublas-dev {{ libcublas_host_version }} - - libcurand {{ libcurand_host_version }} - - libcurand-dev {{ libcurand_host_version }} - - libcusolver {{ libcusolver_host_version }} - - libcusolver-dev {{ libcusolver_host_version }} - - libcusparse {{ libcusparse_host_version }} - - libcusparse-dev {{ libcusparse_host_version }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + - libcurand {{ cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} + - libcusolver {{ cuda11_libcusolver_host_version }} + - libcusolver-dev {{ cuda11_libcusolver_host_version }} + - libcusparse {{ cuda11_libcusparse_host_version }} + - libcusparse-dev {{ cuda11_libcusparse_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} run: - {{ pin_subpackage('libraft-headers', exact=True) }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} about: home: https://rapids.ai/ license: Apache-2.0 @@ -128,30 +172,50 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - - {{ compiler('cuda') }} {{ cuda_version }} - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft', exact=True) }} - - cudatoolkit {{ cuda_version }} - - cuda-profiler-api {{ 
cuda_profiler_api_host_version }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + - libcurand {{ cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} + - libcusolver {{ cuda11_libcusolver_host_version }} + - libcusolver-dev {{ cuda11_libcusolver_host_version }} + - libcusparse {{ cuda11_libcusparse_host_version }} + - libcusparse-dev {{ cuda11_libcusparse_host_version }} + {% else %} + - cuda-cudart-dev + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev + {% endif %} - gmock {{ gtest_version }} - gtest {{ gtest_version }} - - libcublas {{ libcublas_host_version }} - - libcublas-dev {{ libcublas_host_version }} - - libcurand {{ libcurand_host_version }} - - libcurand-dev {{ libcurand_host_version }} - - libcusolver {{ libcusolver_host_version }} - - libcusolver-dev {{ libcusolver_host_version }} - - libcusparse {{ libcusparse_host_version }} - - libcusparse-dev {{ libcusparse_host_version }} run: + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} - {{ pin_subpackage('libraft', exact=True) }} - gmock {{ gtest_version }} - gtest {{ gtest_version }} @@ -167,20 +231,39 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - - {{ compiler('cuda') }} {{ cuda_version }} - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ 
cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft', exact=True) }} - - libcublas {{ libcublas_host_version }} - - libcublas-dev {{ libcublas_host_version }} + - {{ pin_subpackage('libraft-headers', exact=True) }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + {% endif %} run: + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} - {{ pin_subpackage('libraft', exact=True) }} about: home: https://rapids.ai/ @@ -194,29 +277,52 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - - {{ compiler('cuda') }} {{ cuda_version }} - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libraft', exact=True) }} - - cudatoolkit {{ cuda_version }} - - libcublas {{ libcublas_host_version }} - - libcublas-dev {{ libcublas_host_version }} + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + {% else %} + - cuda-profiler-api + - libcublas-dev + {% endif %} - glog {{ glog_version }} - nlohmann_json {{ nlohmann_json_version }} 
- - libfaiss>=1.7.1 + # Temporarily ignore faiss benchmarks on CUDA 12 because packages do not exist yet + {% if cuda_major == "11" %} - faiss-proc=*=cuda + - libfaiss {{ faiss_version }} + {% endif %} run: - {{ pin_subpackage('libraft', exact=True) }} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} - glog {{ glog_version }} + # Temporarily ignore faiss benchmarks on CUDA 12 because packages do not exist yet + {% if cuda_major == "11" %} - faiss-proc=*=cuda - libfaiss {{ faiss_version }} + {% endif %} - h5py {{ h5py_version }} about: home: https://rapids.ai/ diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index add119d796..41bf15c12c 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -5,6 +5,9 @@ cxx_compiler_version: - 11 cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc sysroot_version: diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index 7730801801..7468039539 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -20,19 +20,31 @@ build: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - cuda-python >=11.7.1,<12.0 - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - 
cuda-python >=11.7.1,<12.0a0 + - cudatoolkit + {% else %} + - cuda-python >=12.0,<13.0a0 + {% endif %} + - cuda-version ={{ cuda_version }} - cython >=0.29,<0.30 - libraft {{ version }} - libraft-headers {{ version }} @@ -42,15 +54,18 @@ requirements: - scikit-build >=0.13.1 - setuptools run: - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - cuda-python >=11.7.1,<12.0 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} - libraft {{ version }} - libraft-headers {{ version }} - python x.x + - rmm ={{ minor_version }} tests: requirements: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - pylibraft diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 4f88728f4b..fb09c6d1f5 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -5,6 +5,9 @@ cxx_compiler_version: - 11 cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc sysroot_version: @@ -14,7 +17,7 @@ ucx_version: - ">=1.13.0,<1.15.0" ucx_py_version: - - "0.32.*" + - "0.33.*" cmake_version: - ">=3.23.1,!=3.25.0" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index cd08deabfa..5f3ea8257f 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -20,19 +20,31 @@ build: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version 
={{ cuda_version }} - cmake {{ cmake_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - cuda-python >=11.7.1,<12.0 - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-python >=11.7.1,<12.0a0 + - cudatoolkit + {% else %} + - cuda-python >=12.0,<13.0a0 + {% endif %} + - cuda-version ={{ cuda_version }} - cython >=0.29,<0.30 - nccl >=2.9.9 - pylibraft {{ version }} @@ -44,12 +56,14 @@ requirements: - ucx-proc=*=gpu - ucx-py {{ ucx_py_version }} run: - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - - cuda-python >=11.7.1,<12.0 - - dask ==2023.3.2 - - dask-core ==2023.3.2 + {% if cuda_major == "11" %} + - cudatoolkit + {% endif %} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + - dask ==2023.7.1 + - dask-core ==2023.7.1 - dask-cuda ={{ minor_version }} - - distributed ==2023.3.2.1 + - distributed ==2023.7.1 - joblib >=0.11 - nccl >=2.9.9 - pylibraft {{ version }} @@ -61,7 +75,7 @@ requirements: tests: requirements: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - raft_dask diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9f3031c6d2..7ee8293c5d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. 
-set(RAPIDS_VERSION "23.06") -set(RAFT_VERSION "23.06.02") +set(RAPIDS_VERSION "23.08") +set(RAFT_VERSION "23.08.00") cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) @@ -307,6 +307,30 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu src/neighbors/brute_force_knn_int_float_int.cu src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu + 
src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu @@ -318,6 +342,9 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu + src/neighbors/detail/refine_host_float_float.cpp + src/neighbors/detail/refine_host_int8_t_float.cpp + src/neighbors/detail/refine_host_uint8_t_float.cpp src/neighbors/detail/selection_faiss_int32_t_float.cu src/neighbors/detail/selection_faiss_int_double.cu src/neighbors/detail/selection_faiss_long_float.cu @@ -363,6 +390,9 @@ if(RAFT_COMPILE_LIBRARY) src/raft_runtime/distance/pairwise_distance.cu src/raft_runtime/matrix/select_k_float_int64_t.cu src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu + src/raft_runtime/neighbors/cagra_build.cu + src/raft_runtime/neighbors/cagra_search.cu + src/raft_runtime/neighbors/cagra_serialize.cu src/raft_runtime/neighbors/ivf_flat_build.cu src/raft_runtime/neighbors/ivf_flat_search.cu src/raft_runtime/neighbors/ivf_flat_serialize.cu @@ -602,7 +632,9 @@ target_link_libraries(raft::raft INTERFACE # Use `rapids_export` for 22.04 as it will have COMPONENT support rapids_export( INSTALL raft - EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets} + EXPORT_SET raft-exports + COMPONENTS ${raft_components} + COMPONENTS_EXPORT_SET ${raft_export_sets} GLOBAL_TARGETS raft compiled distributed NAMESPACE raft:: DOCUMENTATION doc_string @@ -613,7 +645,9 @@ rapids_export( # * build export 
------------------------------------------------------------- rapids_export( BUILD raft - EXPORT_SET raft-exports COMPONENTS ${raft_components} COMPONENTS_EXPORT_SET ${raft_export_sets} + EXPORT_SET raft-exports + COMPONENTS ${raft_components} + COMPONENTS_EXPORT_SET ${raft_export_sets} GLOBAL_TARGETS raft compiled distributed DOCUMENTATION doc_string NAMESPACE raft:: diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index a14018a15d..6977d77684 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -18,14 +18,22 @@ option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_RAFT_BFKNN "Include raft's brute-force knn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) find_package(Threads REQUIRED) +# Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled. 
+# https://github.com/rapidsai/raft/issues/1627 +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) + set(RAFT_ANN_BENCH_USE_FAISS_BFKNN OFF) + set(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ OFF) +endif() + set(RAFT_ANN_BENCH_USE_FAISS OFF) if(RAFT_ANN_BENCH_USE_FAISS_BFKNN OR RAFT_ANN_BENCH_USE_FAISS_IVFPQ @@ -35,9 +43,9 @@ if(RAFT_ANN_BENCH_USE_FAISS_BFKNN endif() set(RAFT_ANN_BENCH_USE_RAFT OFF) -if(RAFT_ANN_BENCH_USE_RAFT_BFKNN - OR RAFT_ANN_BENCH_USE_RAFT_IVFPQ - OR RAFT_ANN_BENCH_USE_RAFT_IVFFLAT +if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT + OR RAFT_ANN_BENCH_USE_RAFT_CAGRA ) set(RAFT_ANN_BENCH_USE_RAFT ON) endif() @@ -133,25 +141,58 @@ if(RAFT_ANN_BENCH_USE_HNSWLIB) ) endif() -if(RAFT_ANN_BENCH_USE_RAFT) +if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) ConfigureAnnBench( NAME RAFT_IVF_PQ PATH bench/ann/src/raft/raft_benchmark.cu $<$:bench/ann/src/raft/raft_ivf_pq.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) + ConfigureAnnBench( + NAME + RAFT_IVF_FLAT + PATH + bench/ann/src/raft/raft_benchmark.cu $<$:bench/ann/src/raft/raft_ivf_flat.cu> LINKS raft::compiled ) endif() -if(RAFT_ANN_BENCH_USE_FAISS) +if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) + ConfigureAnnBench( + NAME + RAFT_CAGRA + PATH + bench/ann/src/raft/raft_benchmark.cu + $<$:bench/ann/src/raft/raft_cagra.cu> + LINKS + raft::compiled + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) ConfigureAnnBench( NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss ) endif() +if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) + ConfigureAnnBench(NAME FAISS_BFKNN PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) +endif() + if(RAFT_ANN_BENCH_USE_GGNN) include(cmake/thirdparty/get_glog.cmake) ConfigureAnnBench( diff --git 
a/cpp/bench/ann/conf/bigann-100M.json b/cpp/bench/ann/conf/bigann-100M.json index 5f16f3378d..0ff7df4776 100644 --- a/cpp/bench/ann/conf/bigann-100M.json +++ b/cpp/bench/ann/conf/bigann-100M.json @@ -168,7 +168,35 @@ "search_result_file" : "result/bigann-100M/ivf_flat/nlist100K" }, + { + "name" : "cagra.dim32", + "algo" : "cagra", + "build_param": { + "index_dim" : 32 + }, + "file" : "index/bigann-100M/cagra/dim32", + "search_params" : [ + "itopk": 32, + "itopk": 64, + "itopk": 128 + ], + "search_result_file" : "result/bigann-100M/cagra/dim32" + }, + { + "name" : "cagra.dim64", + "algo" : "cagra", + "build_param": { + "index_dim" : 64 + }, + "file" : "index/bigann-100M/cagra/dim64", + "search_params" : [ + "itopk": 32, + "itopk": 64, + "itopk": 128 + ], + "search_result_file" : "result/bigann-100M/cagra/dim64" + } ] } diff --git a/cpp/bench/ann/conf/deep-100M.json b/cpp/bench/ann/conf/deep-100M.json index b3a945d50e..97d670b614 100644 --- a/cpp/bench/ann/conf/deep-100M.json +++ b/cpp/bench/ann/conf/deep-100M.json @@ -218,6 +218,328 @@ "search_result_file" : "result/deep-100M/ivf_flat/nlist100K" }, - + { + "name" : "cagra.dim32", + "algo" : "raft_cagra", + "build_param": { + "index_dim": 32, + "intermediate_graph_degree": 48 + }, + "file": "index/deep-100M/cagra/dim32", + "search_params": [ + { + "itopk": 32, + "search_width": 1, + "max_iterations": 0, + "algo": "single_cta" + }, + { + "itopk": 32, + "search_width": 1, + "max_iterations": 32, + "algo": "single_cta" + }, + { + "itopk": 64, + "search_width": 4, + "max_iterations": 16, + "algo": "single_cta" + }, + { + "itopk": 64, + "search_width": 1, + "max_iterations": 64, + "algo": "single_cta" + }, + { + "itopk": 96, + "search_width": 2, + "max_iterations": 48, + "algo": "single_cta" + }, + { + "itopk": 128, + "search_width": 8, + "max_iterations": 16, + "algo": "single_cta" + }, + { + "itopk": 128, + "search_width": 2, + "max_iterations": 64, + "algo": "single_cta" + }, + { + "itopk": 192, + "search_width": 8, + 
"max_iterations": 24, + "algo": "single_cta" + }, + { + "itopk": 192, + "search_width": 2, + "max_iterations": 96, + "algo": "single_cta" + }, + { + "itopk": 256, + "search_width": 8, + "max_iterations": 32, + "algo": "single_cta" + }, + { + "itopk": 384, + "search_width": 8, + "max_iterations": 48, + "algo": "single_cta" + }, + { + "itopk": 512, + "search_width": 8, + "max_iterations": 64, + "algo": "single_cta" + } + ], + "search_result_file": "result/deep-100M/cagra/dim32" + }, + { + "name": "cagra.dim32.multi_cta", + "algo": "raft_cagra", + "build_param": { + "index_dim": 32, + "intermediate_graph_degree": 48 + }, + "file": "index/deep-100M/cagra/dim32", + "search_params": [ + { + "itopk": 32, + "search_width": 1, + "max_iterations": 0, + "algo": "multi_cta" + }, + { + "itopk": 32, + "search_width": 1, + "max_iterations": 32, + "algo": "multi_cta" + }, + { + "itopk": 64, + "search_width": 4, + "max_iterations": 16, + "algo": "multi_cta" + }, + { + "itopk": 64, + "search_width": 1, + "max_iterations": 64, + "algo": "multi_cta" + }, + { + "itopk": 96, + "search_width": 2, + "max_iterations": 48, + "algo": "multi_cta" + }, + { + "itopk": 128, + "search_width": 8, + "max_iterations": 16, + "algo": "multi_cta" + }, + { + "itopk": 128, + "search_width": 2, + "max_iterations": 64, + "algo": "multi_cta" + }, + { + "itopk": 192, + "search_width": 8, + "max_iterations": 24, + "algo": "multi_cta" + }, + { + "itopk": 192, + "search_width": 2, + "max_iterations": 96, + "algo": "multi_cta" + }, + { + "itopk": 256, + "search_width": 8, + "max_iterations": 32, + "algo": "multi_cta" + }, + { + "itopk": 384, + "search_width": 8, + "max_iterations": 48, + "algo": "multi_cta" + }, + { + "itopk": 512, + "search_width": 8, + "max_iterations": 64, + "algo": "multi_cta" + } + ], + "search_result_file": "result/deep-100M/cagra/dim32_multi_cta" + }, + { + "name": "cagra.dim32.multi_kernel", + "algo": "raft_cagra", + "build_param": { + "index_dim": 32, + "intermediate_graph_degree": 48 + 
}, + "file": "index/deep-100M/cagra/dim32", + "search_params": [ + { + "itopk": 32, + "search_width": 1, + "max_iterations": 0, + "algo": "multi_kernel" + }, + { + "itopk": 32, + "search_width": 1, + "max_iterations": 32, + "algo": "multi_kernel" + }, + { + "itopk": 64, + "search_width": 4, + "max_iterations": 16, + "algo": "multi_kernel" + }, + { + "itopk": 64, + "search_width": 1, + "max_iterations": 64, + "algo": "multi_kernel" + }, + { + "itopk": 96, + "search_width": 2, + "max_iterations": 48, + "algo": "multi_kernel" + }, + { + "itopk": 128, + "search_width": 8, + "max_iterations": 16, + "algo": "multi_kernel" + }, + { + "itopk": 128, + "search_width": 2, + "max_iterations": 64, + "algo": "multi_kernel" + }, + { + "itopk": 192, + "search_width": 8, + "max_iterations": 24, + "algo": "multi_kernel" + }, + { + "itopk": 192, + "search_width": 2, + "max_iterations": 96, + "algo": "multi_kernel" + }, + { + "itopk": 256, + "search_width": 8, + "max_iterations": 32, + "algo": "multi_kernel" + }, + { + "itopk": 384, + "search_width": 8, + "max_iterations": 48, + "algo": "multi_kernel" + }, + { + "itopk": 512, + "search_width": 8, + "max_iterations": 64, + "algo": "multi_kernel" + } + ], + "search_result_file": "result/deep-100M/cagra/dim32_multi_kernel" + }, + { + "name": "cagra.dim64", + "algo": "raft_cagra", + "build_param": { + "index_dim": 64 + }, + "file": "index/deep-100M/cagra/dim64", + "search_params" : [ + { + "itopk": 32, + "search_width": 1, + "max_iterations": 0 + }, + { + "itopk": 32, + "search_width": 1, + "max_iterations": 32 + }, + { + "itopk": 64, + "search_width": 4, + "max_iterations": 16 + }, + { + "itopk": 64, + "search_width": 1, + "max_iterations": 64 + }, + { + "itopk": 96, + "search_width": 2, + "max_iterations": 48 + }, + { + "itopk": 128, + "search_width": 8, + "max_iterations": 16 + }, + { + "itopk": 128, + "search_width": 2, + "max_iterations": 64 + }, + { + "itopk": 192, + "search_width": 8, + "max_iterations": 24 + }, + { + "itopk": 192, 
+ "search_width": 2, + "max_iterations": 96 + }, + { + "itopk": 256, + "search_width": 8, + "max_iterations": 32 + }, + { + "itopk": 384, + "search_width": 8, + "max_iterations": 48 + }, + { + "itopk": 512, + "search_width": 8, + "max_iterations": 64 + } + ], + "search_result_file" : "result/deep-100M/cagra/dim32" + } ] } diff --git a/cpp/bench/ann/conf/glove-100-inner.json b/cpp/bench/ann/conf/glove-100-inner.json index d210aca654..5d0bbf970c 100644 --- a/cpp/bench/ann/conf/glove-100-inner.json +++ b/cpp/bench/ann/conf/glove-100-inner.json @@ -789,9 +789,5 @@ ], "search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10" - }, - - - ] - + }] } diff --git a/cpp/bench/ann/conf/sift-128-euclidean.json b/cpp/bench/ann/conf/sift-128-euclidean.json index 476c363ecd..98983fd62e 100644 --- a/cpp/bench/ann/conf/sift-128-euclidean.json +++ b/cpp/bench/ann/conf/sift-128-euclidean.json @@ -90,8 +90,8 @@ - - { + + { "name": "raft_bfknn", "algo": "raft_bfknn", "build_param": {}, @@ -1316,6 +1316,36 @@ } ], "search_result_file": "result/sift-128-euclidean/raft_ivf_flat/nlist16384" + }, + + { + "name" : "cagra.dim32", + "algo" : "raft_cagra", + "build_param": { + "index_dim" : 32 + }, + "file" : "index/sift-128-euclidean/cagra/dim32", + "search_params" : [ + {"itopk": 32}, + {"itopk": 64}, + {"itopk": 128} + ], + "search_result_file" : "result/sift-128-euclidean/cagra/dim32" + }, + + { + "name" : "cagra.dim64", + "algo" : "raft_cagra", + "build_param": { + "index_dim" : 64 + }, + "file" : "index/sift-128-euclidean/cagra/dim64", + "search_params" : [ + {"itopk": 32}, + {"itopk": 64}, + {"itopk": 128} + ], + "search_result_file" : "result/sift-128-euclidean/cagra/dim64" } ] } diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index c34b95010f..28df4640ee 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -30,6 +30,8 @@ #include #include +#include + #include 
"benchmark_util.hpp" #include "conf.h" #include "dataset.h" @@ -108,8 +110,8 @@ inline bool mkdir(const std::vector& dirs) } inline bool check(const std::vector& indices, - bool build_mode, - bool force_overwrite) + const bool build_mode, + const bool force_overwrite) { std::vector files_should_exist; std::vector dirs_should_exist; @@ -119,7 +121,7 @@ inline bool check(const std::vector& indices, output_files.push_back(index.file); output_files.push_back(index.file + ".txt"); - auto pos = index.file.rfind('/'); + const auto pos = index.file.rfind('/'); if (pos != std::string::npos) { dirs_should_exist.push_back(index.file.substr(0, pos)); } } else { files_should_exist.push_back(index.file); @@ -128,7 +130,7 @@ inline bool check(const std::vector& indices, output_files.push_back(index.search_result_file + ".0.ibin"); output_files.push_back(index.search_result_file + ".0.txt"); - auto pos = index.search_result_file.rfind('/'); + const auto pos = index.search_result_file.rfind('/'); if (pos != std::string::npos) { dirs_should_exist.push_back(index.search_result_file.substr(0, pos)); } @@ -149,7 +151,7 @@ inline void write_build_info(const std::string& file_prefix, const std::string& name, const std::string& algo, const std::string& build_param, - float build_time) + const float build_time) { std::ofstream ofs(file_prefix + ".txt"); if (!ofs) { throw std::runtime_error("can't open build info file: " + file_prefix + ".txt"); } @@ -175,13 +177,13 @@ void build(const Dataset* dataset, const std::vector& i for (const auto& index : indices) { log_info("creating algo '%s', param=%s", index.algo.c_str(), index.build_param.dump().c_str()); - auto algo = create_algo(index.algo, - dataset->distance(), - dataset->dim(), - index.refine_ratio, - index.build_param, - index.dev_list); - auto algo_property = algo->get_property(); + const auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + const 
auto algo_property = algo->get_property(); const T* base_set_ptr = nullptr; if (algo_property.dataset_memory_type == MemoryType::Host) { @@ -203,7 +205,7 @@ void build(const Dataset* dataset, const std::vector& i Timer timer; algo->build(base_set_ptr, dataset->base_set_size(), stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - float elapsed_ms = timer.elapsed_ms(); + const float elapsed_ms = timer.elapsed_ms(); #ifdef NVTX nvtxRangePop(); #endif @@ -232,15 +234,17 @@ inline void write_search_result(const std::string& file_prefix, const std::string& algo, const std::string& build_param, const std::string& search_param, - int batch_size, - int run_count, - int k, + std::size_t batch_size, + unsigned run_count, + unsigned k, float search_time_average, float search_time_p99, float search_time_p999, + float query_per_second, const int* neighbors, size_t query_set_size) { + log_info("throughput : %e [QPS]", query_per_second); std::ofstream ofs(file_prefix + ".txt"); if (!ofs) { throw std::runtime_error("can't open search result file: " + file_prefix + ".txt"); } ofs << "dataset: " << dataset << "\n" @@ -254,13 +258,16 @@ inline void write_search_result(const std::string& file_prefix, << "batch_size: " << batch_size << "\n" << "run_count: " << run_count << "\n" << "k: " << k << "\n" + << "query_per_second: " << query_per_second << "\n" << "average_search_time: " << search_time_average << endl; + if (search_time_p99 != std::numeric_limits::max()) { ofs << "p99_search_time: " << search_time_p99 << endl; } if (search_time_p999 != std::numeric_limits::max()) { ofs << "p999_search_time: " << search_time_p999 << endl; } + ofs.close(); if (!ofs) { throw std::runtime_error("can't write to search result file: " + file_prefix + ".txt"); @@ -280,15 +287,15 @@ inline void search(const Dataset* dataset, const std::vectorname().c_str(), dataset->query_set_size()); - const T* query_set = dataset->query_set(); + const T* const query_set = dataset->query_set(); // query set is 
usually much smaller than base set, so load it eagerly - const T* d_query_set = dataset->query_set_on_gpu(); - size_t query_set_size = dataset->query_set_size(); + const T* const d_query_set = dataset->query_set_on_gpu(); + const size_t query_set_size = dataset->query_set_size(); // currently all indices has same batch_size, k and run_count - const int batch_size = indices[0].batch_size; - const int k = indices[0].k; - const int run_count = indices[0].run_count; + const std::size_t batch_size = indices[0].batch_size; + const unsigned k = indices[0].k; + const unsigned run_count = indices[0].run_count; log_info( "basic search parameters: batch_size = %d, k = %d, run_count = %d", batch_size, k, run_count); if (query_set_size % batch_size != 0) { @@ -297,10 +304,10 @@ inline void search(const Dataset* dataset, const std::vector search_times; search_times.reserve(num_batches); std::size_t* d_neighbors; @@ -310,13 +317,13 @@ inline void search(const Dataset* dataset, const std::vector(index.algo, - dataset->distance(), - dataset->dim(), - index.refine_ratio, - index.build_param, - index.dev_list); - auto algo_property = algo->get_property(); + const auto algo = create_algo(index.algo, + dataset->distance(), + dataset->dim(), + index.refine_ratio, + index.build_param, + index.dev_list); + const auto algo_property = algo->get_property(); log_info("loading index '%s' from file '%s'", index.name.c_str(), index.file.c_str()); algo->load(index.file); @@ -349,7 +356,7 @@ inline void search(const Dataset* dataset, const std::vector(index.algo, index.search_params[i]); + const auto p_param = create_search_param(index.algo, index.search_params[i]); algo->set_search_param(*p_param); log_info("search with param: %s", index.search_params[i].dump().c_str()); @@ -364,11 +371,13 @@ inline void search(const Dataset* dataset, const std::vector::max(); float best_search_time_p99 = std::numeric_limits::max(); float best_search_time_p999 = std::numeric_limits::max(); - for (int run = 0; run 
< run_count; ++run) { + float total_search_time = 0; + for (unsigned run = 0; run < run_count; ++run) { log_info("run %d / %d", run + 1, run_count); for (std::size_t batch_id = 0; batch_id < num_batches; ++batch_id) { - std::size_t row = batch_id * batch_size; - int actual_batch_size = (batch_id == num_batches - 1) ? query_set_size - row : batch_size; + const std::size_t row = batch_id * batch_size; + const std::size_t actual_batch_size = + (batch_id == num_batches - 1) ? query_set_size - row : batch_size; RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); #ifdef NVTX string nvtx_label = "batch" + to_string(batch_id); @@ -389,7 +398,7 @@ inline void search(const Dataset* dataset, const std::vector* dataset, const std::vector= 100) { std::sort(search_times.begin(), search_times.end()); - auto calc_percentile_pos = [](float percentile, size_t N) { + const auto calc_percentile_pos = [](float percentile, size_t N) { return static_cast(std::ceil(percentile / 100.0 * N)) - 1; }; - float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; - best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); + const float search_time_p99 = search_times[calc_percentile_pos(99, search_times.size())]; + best_search_time_p99 = std::min(best_search_time_p99, search_time_p99); if (search_times.size() >= 1000) { - float search_time_p999 = search_times[calc_percentile_pos(99.9, search_times.size())]; - best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); + const float search_time_p999 = + search_times[calc_percentile_pos(99.9, search_times.size())]; + best_search_time_p999 = std::min(best_search_time_p999, search_time_p999); } } search_times.clear(); } RAFT_CUDA_TRY(cudaDeviceSynchronize()); RAFT_CUDA_TRY(cudaPeekAtLastError()); + const auto query_per_second = + (run_count * raft::round_down_safe(query_set_size, batch_size)) / total_search_time; if (algo_property.query_memory_type == MemoryType::Device) { 
RAFT_CUDA_TRY(cudaMemcpy(neighbors, @@ -436,7 +450,7 @@ inline void search(const Dataset* dataset, const std::vector* dataset, const std::vector -inline int dispatch_benchmark(Configuration& conf, - std::string& index_patterns, +inline int dispatch_benchmark(const Configuration& conf, + const std::string& index_patterns, bool force_overwrite, bool only_check, bool build_mode, bool search_mode) { try { - auto dataset_conf = conf.get_dataset_conf(); + const auto dataset_conf = conf.get_dataset_conf(); BinDataset dataset(dataset_conf.name, dataset_conf.base_file, diff --git a/cpp/bench/ann/src/common/conf.cpp b/cpp/bench/ann/src/common/conf.cpp index f690f68783..d180f37973 100644 --- a/cpp/bench/ann/src/common/conf.cpp +++ b/cpp/bench/ann/src/common/conf.cpp @@ -78,7 +78,7 @@ void Configuration::parse_dataset_(const nlohmann::json& conf) } else if (!filename.compare(filename.size() - 5, 5, "i8bin")) { dataset_conf_.dtype = "int8"; } else { - log_error("Could not determine data type of the dataset"); + log_error("Could not determine data type of the dataset %s", filename.c_str()); } } } diff --git a/cpp/bench/ann/src/common/dataset.h b/cpp/bench/ann/src/common/dataset.h index 46dd66d649..ae05cd02a1 100644 --- a/cpp/bench/ann/src/common/dataset.h +++ b/cpp/bench/ann/src/common/dataset.h @@ -14,21 +14,27 @@ * limitations under the License. 
*/ #pragma once + +#include + +#ifndef CPU_ONLY #include +#include +#else +typedef uint16_t half; +#endif + #include #include #include #include -#include #include #include #include #include #include -#include - namespace raft::bench::ann { // http://big-ann-benchmarks.com/index.html: @@ -46,13 +52,17 @@ class BinFile { const std::string& mode, uint32_t subset_first_row = 0, uint32_t subset_size = 0); - ~BinFile() { fclose(fp_); } + ~BinFile() + { + if (fp_) { fclose(fp_); } + } BinFile(const BinFile&) = delete; BinFile& operator=(const BinFile&) = delete; - void get_shape(size_t* nrows, int* ndims) + void get_shape(size_t* nrows, int* ndims) const { assert(read_mode_); + if (!fp_) { open_file_(); } *nrows = nrows_; *ndims = ndims_; } @@ -60,6 +70,7 @@ class BinFile { void read(T* data) const { assert(read_mode_); + if (!fp_) { open_file_(); } size_t total = static_cast(nrows_) * ndims_; if (fread(data, sizeof(T), total, fp_) != total) { throw std::runtime_error("fread() BinFile " + file_ + " failed"); @@ -69,6 +80,7 @@ class BinFile { void write(const T* data, uint32_t nrows, uint32_t ndims) { assert(!read_mode_); + if (!fp_) { open_file_(); } if (fwrite(&nrows, sizeof(uint32_t), 1, fp_) != 1) { throw std::runtime_error("fwrite() BinFile " + file_ + " failed"); } @@ -82,34 +94,41 @@ class BinFile { } } - void* map() const + T* map() const { assert(read_mode_); - int fid = fileno(fp_); - auto mmap_ptr = mmap(NULL, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); - if (mmap_ptr == MAP_FAILED) { + if (!fp_) { open_file_(); } + int fid = fileno(fp_); + mapped_ptr_ = mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fid, 0); + if (mapped_ptr_ == MAP_FAILED) { throw std::runtime_error("mmap error: Value of errno " + std::to_string(errno) + ", " + std::string(strerror(errno))); } - return mmap_ptr; + return reinterpret_cast(reinterpret_cast(mapped_ptr_) + 2 * sizeof(uint32_t) + + subset_first_row_ * ndims_ * sizeof(T)); } - void unmap(void* data) const + void unmap() const { 
- if (munmap(data, file_size_) == -1) { + if (munmap(mapped_ptr_, file_size_) == -1) { throw std::runtime_error("munmap error: " + std::string(strerror(errno))); } } private: void check_suffix_(); + void open_file_() const; std::string file_; - FILE* fp_; bool read_mode_; - uint32_t nrows_; - uint32_t ndims_; - size_t file_size_; + uint32_t subset_first_row_; + uint32_t subset_size_; + + mutable FILE* fp_; + mutable uint32_t nrows_; + mutable uint32_t ndims_; + mutable size_t file_size_; + mutable void* mapped_ptr_; }; template @@ -117,23 +136,32 @@ BinFile::BinFile(const std::string& file, const std::string& mode, uint32_t subset_first_row, uint32_t subset_size) - : file_(file) + : file_(file), + read_mode_(mode == "r"), + subset_first_row_(subset_first_row), + subset_size_(subset_size), + fp_(nullptr) { check_suffix_(); - if (mode == "r") { - read_mode_ = true; - } else if (mode == "w") { - read_mode_ = false; - if (subset_first_row != 0) { - throw std::runtime_error("subset_first_row should be zero for write mode"); + if (!read_mode_) { + if (mode == "w") { + if (subset_first_row != 0) { + throw std::runtime_error("subset_first_row should be zero for write mode"); + } + if (subset_size != 0) { + throw std::runtime_error("subset_size should be zero for write mode"); + } + } else { + throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); } - if (subset_size != 0) { throw std::runtime_error("subset_size should be zero for write mode"); } - } else { - throw std::runtime_error("BinFile's mode must be either 'r' or 'w': " + file_); } +} - fp_ = fopen(file_.c_str(), mode.c_str()); +template +void BinFile::open_file_() const +{ + fp_ = fopen(file_.c_str(), read_mode_ ? 
"r" : "w"); if (!fp_) { throw std::runtime_error("open BinFile failed: " + file_); } if (read_mode_) { @@ -156,24 +184,24 @@ BinFile::BinFile(const std::string& file, std::to_string(file_size_)); } - if (subset_first_row >= nrows_) { - throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + + if (subset_first_row_ >= nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + ") >= nrows (" + std::to_string(nrows_) + ")"); } - if (subset_first_row + subset_size > nrows_) { - throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row) + - ") + subset_size (" + std::to_string(subset_size) + ") > nrows (" + + if (subset_first_row_ + subset_size_ > nrows_) { + throw std::runtime_error(file_ + ": subset_first_row (" + std::to_string(subset_first_row_) + + ") + subset_size (" + std::to_string(subset_size_) + ") > nrows (" + std::to_string(nrows_) + ")"); } - if (subset_first_row) { + if (subset_first_row_) { static_assert(sizeof(long) == 8, "fseek() don't support 64-bit offset"); - if (fseek(fp_, sizeof(T) * subset_first_row * ndims_, SEEK_CUR) == -1) { + if (fseek(fp_, sizeof(T) * subset_first_row_ * ndims_, SEEK_CUR) == -1) { throw std::runtime_error(file_ + ": fseek failed"); } - nrows_ -= subset_first_row; + nrows_ -= subset_first_row_; } - if (subset_size) { nrows_ = subset_size; } + if (subset_size_) { nrows_ = subset_size_; } } } @@ -225,9 +253,9 @@ class Dataset { std::string name() const { return name_; } std::string distance() const { return distance_; } - int dim() const { return dim_; } - size_t base_set_size() const { return base_set_size_; } - size_t query_set_size() const { return query_set_size_; } + virtual int dim() const = 0; + virtual size_t base_set_size() const = 0; + virtual size_t query_set_size() const = 0; // load data lazily, so don't pay the overhead of reading unneeded set // e.g. 
don't load base set when searching @@ -254,9 +282,6 @@ class Dataset { std::string name_; std::string distance_; - int dim_; - size_t base_set_size_; - size_t query_set_size_; mutable T* base_set_ = nullptr; mutable T* query_set_ = nullptr; @@ -270,31 +295,37 @@ Dataset::~Dataset() { delete[] base_set_; delete[] query_set_; - if (d_base_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_base_set_)); } - if (d_query_set_) { RAFT_CUDA_TRY_NO_THROW(cudaFree(d_query_set_)); } +#ifndef CPU_ONLY + if (d_base_set_) { cudaFree(d_base_set_); } + if (d_query_set_) { cudaFree(d_query_set_); } +#endif } template const T* Dataset::base_set_on_gpu() const { +#ifndef CPU_ONLY if (!d_base_set_) { base_set(); - RAFT_CUDA_TRY(cudaMalloc((void**)&d_base_set_, base_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_base_set_, base_set_size() * dim() * sizeof(T))); RAFT_CUDA_TRY(cudaMemcpy( - d_base_set_, base_set_, base_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + d_base_set_, base_set_, base_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice)); } +#endif return d_base_set_; } template const T* Dataset::query_set_on_gpu() const { +#ifndef CPU_ONLY if (!d_query_set_) { query_set(); - RAFT_CUDA_TRY(cudaMalloc((void**)&d_query_set_, query_set_size_ * dim_ * sizeof(T))); + RAFT_CUDA_TRY(cudaMalloc((void**)&d_query_set_, query_set_size() * dim() * sizeof(T))); RAFT_CUDA_TRY(cudaMemcpy( - d_query_set_, query_set_, query_set_size_ * dim_ * sizeof(T), cudaMemcpyHostToDevice)); + d_query_set_, query_set_, query_set_size() * dim() * sizeof(T), cudaMemcpyHostToDevice)); } +#endif return d_query_set_; } @@ -316,24 +347,24 @@ class BinDataset : public Dataset { const std::string& distance); ~BinDataset() { - if (this->mapped_base_set_) { - base_file_.unmap(reinterpret_cast(this->mapped_base_set_) - subset_offset_); - } + if (this->mapped_base_set_) { base_file_.unmap(); } } + int dim() const override; + size_t base_set_size() const override; + size_t query_set_size() 
const override; + private: void load_base_set_() const override; void load_query_set_() const override; void map_base_set_() const override; - using Dataset::dim_; - using Dataset::base_set_size_; - using Dataset::query_set_size_; + mutable int dim_ = 0; + mutable size_t base_set_size_ = 0; + mutable size_t query_set_size_ = 0; BinFile base_file_; BinFile query_file_; - - size_t subset_offset_; }; template @@ -345,37 +376,71 @@ BinDataset::BinDataset(const std::string& name, const std::string& distance) : Dataset(name, distance), base_file_(base_file, "r", subset_first_row, subset_size), - query_file_(query_file, "r"), - subset_offset_(2 * sizeof(uint32_t) + subset_first_row * dim_ * sizeof(T)) + query_file_(query_file, "r") +{ +} + +template +int BinDataset::dim() const +{ + if (dim_ > 0) { return dim_; } + if (base_set_size() > 0) { return dim_; } + if (query_set_size() > 0) { return dim_; } + return dim_; +} + +template +size_t BinDataset::query_set_size() const { - base_file_.get_shape(&base_set_size_, &dim_); - int query_dim; - query_file_.get_shape(&query_set_size_, &query_dim); - if (query_dim != dim_) { + if (query_set_size_ > 0) { return query_set_size_; } + int dim; + query_file_.get_shape(&query_set_size_, &dim); + if (query_set_size_ == 0) { throw std::runtime_error("Zero query set size"); } + if (dim == 0) { throw std::runtime_error("Zero query set dim"); } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ != dim) { throw std::runtime_error("base set dim (" + std::to_string(dim_) + ") != query set dim (" + - std::to_string(query_dim)); + std::to_string(dim)); + } + return query_set_size_; +} + +template +size_t BinDataset::base_set_size() const +{ + if (base_set_size_ > 0) { return base_set_size_; } + int dim; + base_file_.get_shape(&base_set_size_, &dim); + if (base_set_size_ == 0) { throw std::runtime_error("Zero base set size"); } + if (dim == 0) { throw std::runtime_error("Zero base set dim"); } + if (dim_ == 0) { + dim_ = dim; + } else if (dim_ 
!= dim) { + throw std::runtime_error("base set dim (" + std::to_string(dim) + ") != query set dim (" + + std::to_string(dim_)); } + return base_set_size_; } template void BinDataset::load_base_set_() const { - this->base_set_ = new T[base_set_size_ * dim_]; + this->base_set_ = new T[base_set_size() * dim()]; base_file_.read(this->base_set_); } template void BinDataset::load_query_set_() const { - this->query_set_ = new T[query_set_size_ * dim_]; + this->query_set_ = new T[query_set_size() * dim()]; query_file_.read(this->query_set_); } template void BinDataset::map_base_set_() const { - char* original_map_ptr = static_cast(base_file_.map()); - this->mapped_base_set_ = reinterpret_cast(original_map_ptr + subset_offset_); + this->mapped_base_set_ = base_file_.map(); } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_benchmark.cu index 294da9a14f..0bad86905b 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_benchmark.cu @@ -104,10 +104,10 @@ std::unique_ptr> create_algo(const std::string& algo, // stop compiler warning; not all algorithms support multi-GPU so it may not be used (void)dev_list; - raft::bench::ann::Metric metric = parse_metric(distance); std::unique_ptr> ann; if constexpr (std::is_same_v) { + raft::bench::ann::Metric metric = parse_metric(distance); if (algo == "faiss_gpu_ivf_flat") { ann = make_algo(metric, dim, conf, dev_list); } else if (algo == "faiss_gpu_ivf_pq") { @@ -147,4 +147,4 @@ std::unique_ptr::AnnSearchParam> create_search #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index baff1b1c45..dcc4ae18be 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ 
b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -40,6 +40,12 @@ extern template class raft::bench::ann::RaftIvfPQ; extern template class raft::bench::ann::RaftIvfPQ; extern template class raft::bench::ann::RaftIvfPQ; #endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA +#include "raft_cagra_wrapper.h" +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +#endif #define JSON_DIAGNOSTICS 1 #include @@ -117,28 +123,43 @@ void parse_search_param(const nlohmann::json& conf, } #endif -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf) +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::BuildParam& param) { - typename Algo::BuildParam param; - parse_build_param(conf, param); - return std::make_unique>(metric, dim, param); + if (conf.contains("index_dim")) { + param.graph_degree = conf.at("index_dim"); + param.intermediate_graph_degree = param.graph_degree * 2; + } + if (conf.contains("intermediate_graph_degree")) { + param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } } -template class Algo> -std::unique_ptr> make_algo(raft::bench::ann::Metric metric, - int dim, - const nlohmann::json& conf, - const std::vector& dev_list) +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::SearchParam& param) { - typename Algo::BuildParam param; - parse_build_param(conf, param); - - (void)dev_list; - return std::make_unique>(metric, dim, param); + if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } + if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } + if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("algo")) { + if (conf.at("algo") == "single_cta") 
{ + param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA; + } else if (conf.at("algo") == "multi_cta") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA; + } else if (conf.at("algo") == "multi_kernel") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL; + } else if (conf.at("algo") == "auto") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO; + } else { + std::string tmp = conf.at("algo"); + THROW("Invalid value for algo: %s", tmp.c_str()); + } + } } +#endif template std::unique_ptr> create_algo(const std::string& algo, @@ -176,6 +197,13 @@ std::unique_ptr> create_algo(const std::string& algo, ann = std::make_unique>(metric, dim, param, refine_ratio); } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA + if (algo == "raft_cagra") { + typename raft::bench::ann::RaftCagra::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } #endif if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } @@ -207,6 +235,13 @@ std::unique_ptr::AnnSearchParam> create_search parse_search_param(conf, *param); return param; } +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA + if (algo == "raft_cagra") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } #endif // else throw std::runtime_error("invalid algo: '" + algo + "'"); @@ -216,4 +251,4 @@ std::unique_ptr::AnnSearchParam> create_search #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } \ No newline at end of file +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } diff --git a/cpp/bench/ann/src/raft/raft_cagra.cu b/cpp/bench/ann/src/raft/raft_cagra.cu new file mode 100644 index 0000000000..be18af7f2c --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2023, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +template class RaftCagra; +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h new file mode 100644 index 0000000000..d47de1eeac --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" +#include + +namespace raft::bench::ann { + +template +class RaftCagra : public ANN { + public: + using typename ANN::AnnSearchParam; + + struct SearchParam : public AnnSearchParam { + raft::neighbors::experimental::cagra::search_params p; + }; + + using BuildParam = raft::neighbors::cagra::index_params; + + RaftCagra(Metric metric, int dim, const BuildParam& param); + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_property() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::HostMmap; + property.query_memory_type = MemoryType::Device; + property.need_dataset_when_search = true; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + ~RaftCagra() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + + private: + raft::device_resources handle_; + BuildParam index_params_; + raft::neighbors::cagra::search_params search_params_; + std::optional> index_; + int device_; + int dimension_; + rmm::mr::pool_memory_resource mr_; +}; + +template +RaftCagra::RaftCagra(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + index_params_(param), + dimension_(dim), + mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) +{ + 
rmm::mr::set_current_device_resource(&mr_); + index_params_.metric = parse_metric_type(metric); + RAFT_CUDA_TRY(cudaGetDevice(&device_)); +} + +template +void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t) +{ + if (get_property().dataset_memory_type != MemoryType::Device) { + auto dataset_view = + raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); + index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view)); + } else { + auto dataset_view = + raft::make_device_matrix_view(dataset, IdxT(nrow), dimension_); + index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view)); + } + return; +} + +template +void RaftCagra::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + search_params_ = search_param.p; + return; +} + +template +void RaftCagra::save(const std::string& file) const +{ + raft::neighbors::cagra::serialize(handle_, file, *index_); + return; +} + +template +void RaftCagra::load(const std::string& file) +{ + index_ = raft::neighbors::cagra::deserialize(handle_, file); + return; +} + +template +void RaftCagra::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + IdxT* neighbors_IdxT; + rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); + if constexpr (std::is_same::value) { + neighbors_IdxT = neighbors; + } else { + neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage.data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = raft::make_device_matrix_view(neighbors_IdxT, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + raft::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + + if (!std::is_same::value) { + 
raft::linalg::unaryOp(neighbors, + neighbors_IdxT, + batch_size * k, + raft::cast_op(), + resource::get_cuda_stream(handle_)); + } + + handle_.sync_stream(); + return; +} +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 36b4931460..42fb9bd4a1 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -79,6 +79,8 @@ class RaftIvfFlatGpu : public ANN { void save(const std::string& file) const override; void load(const std::string&) override; + ~RaftIvfFlatGpu() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + private: raft::device_resources handle_; BuildParam index_params_; @@ -96,7 +98,9 @@ RaftIvfFlatGpu::RaftIvfFlatGpu(Metric metric, int dim, const BuildParam dimension_(dim), mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) { - index_params_.metric = parse_metric_type(metric); + index_params_.metric = parse_metric_type(metric); + index_params_.conservative_memory_allocation = true; + rmm::mr::set_current_device_resource(&mr_); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index c390d0bd7e..30bd5ab4d6 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -73,12 +73,14 @@ class RaftIvfPQ : public ANN { AlgoProperty property; property.dataset_memory_type = MemoryType::Host; property.query_memory_type = MemoryType::Device; - property.need_dataset_when_search = true; // actually it is only used during refinement + property.need_dataset_when_search = refine_ratio_ > 1.0; return property; } void save(const std::string& file) const override; void load(const std::string&) override; + ~RaftIvfPQ() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + private: raft::device_resources handle_; BuildParam 
index_params_; @@ -98,6 +100,7 @@ RaftIvfPQ::RaftIvfPQ(Metric metric, int dim, const BuildParam& param, f refine_ratio_(refine_ratio), mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) { + rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index c90886841b..e8d4739384 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -141,6 +141,7 @@ if(BUILD_PRIMS_BENCH) PATH bench/prims/neighbors/knn/brute_force_float_int64_t.cu bench/prims/neighbors/knn/brute_force_float_uint32_t.cu + bench/prims/neighbors/knn/cagra_float_uint32_t.cu bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh new file mode 100644 index 0000000000..bb405088bb --- /dev/null +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::bench::neighbors { + +struct params { + /** Size of the dataset. */ + size_t n_samples; + /** Number of dimensions in the dataset. 
*/ + int n_dims; + /** The batch size -- number of KNN searches. */ + int n_queries; + /** Number of nearest neighbours to find for every probe. */ + int k; + /** kNN graph degree*/ + int degree; + int itopk_size; + int block_size; + int search_width; + int max_iterations; +}; + +template +struct CagraBench : public fixture { + explicit CagraBench(const params& ps) + : fixture(true), + params_(ps), + queries_(make_device_matrix(handle, ps.n_queries, ps.n_dims)), + dataset_(make_device_matrix(handle, ps.n_samples, ps.n_dims)), + knn_graph_(make_device_matrix(handle, ps.n_samples, ps.degree)) + { + // Generate random dataset and queriees + raft::random::RngState state{42}; + constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); + constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); + if constexpr (std::is_integral_v) { + raft::random::uniformInt( + state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + raft::random::uniformInt( + state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + } else { + raft::random::uniform( + state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + raft::random::uniform( + state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + } + + // Generate random knn graph + + raft::random::uniformInt( + state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1, stream); + + auto metric = raft::distance::DistanceType::L2Expanded; + + index_.emplace(raft::neighbors::cagra::index( + handle, metric, make_const_mdspan(dataset_.view()), make_const_mdspan(knn_graph_.view()))); + } + + void run_benchmark(::benchmark::State& state) override + { + raft::neighbors::cagra::search_params search_params; + search_params.max_queries = 1024; + search_params.itopk_size = params_.itopk_size; + search_params.team_size = 0; + search_params.thread_block_size = params_.block_size; + 
search_params.search_width = params_.search_width; + + auto indices = make_device_matrix(handle, params_.n_queries, params_.k); + auto distances = make_device_matrix(handle, params_.n_queries, params_.k); + auto ind_v = make_device_matrix_view( + indices.data_handle(), params_.n_queries, params_.k); + auto dist_v = make_device_matrix_view( + distances.data_handle(), params_.n_queries, params_.k); + + auto queries_v = make_const_mdspan(queries_.view()); + loop_on_state(state, [&]() { + raft::neighbors::cagra::search( + this->handle, search_params, *this->index_, queries_v, ind_v, dist_v); + }); + + double data_size = params_.n_samples * params_.n_dims * sizeof(T); + double graph_size = params_.n_samples * params_.degree * sizeof(IdxT); + + int iterations = params_.max_iterations; + if (iterations == 0) { + // see search_plan_impl::adjust_search_params() + double r = params_.itopk_size / static_cast(params_.search_width); + iterations = 1 + std::min(r * 1.1, r + 10); + } + state.counters["dataset (GiB)"] = data_size / (1 << 30); + state.counters["graph (GiB)"] = graph_size / (1 << 30); + state.counters["n_rows"] = params_.n_samples; + state.counters["n_cols"] = params_.n_dims; + state.counters["degree"] = params_.degree; + state.counters["n_queries"] = params_.n_queries; + state.counters["k"] = params_.k; + state.counters["itopk_size"] = params_.itopk_size; + state.counters["block_size"] = params_.block_size; + state.counters["search_width"] = params_.search_width; + state.counters["iterations"] = iterations; + } + + private: + const params params_; + std::optional> index_; + raft::device_matrix queries_; + raft::device_matrix dataset_; + raft::device_matrix knn_graph_; +}; + +inline const std::vector generate_inputs() +{ + std::vector inputs = + raft::util::itertools::product({2000000ull}, // n_samples + {128, 256, 512, 1024}, // dataset dim + {1000}, // n_queries + {32}, // k + {64}, // knn graph degree + {64}, // itopk_size + {0}, // block_size + {1}, // 
search_width + {0} // max_iterations + ); + auto inputs2 = raft::util::itertools::product({2000000ull, 10000000ull}, // n_samples + {128}, // dataset dim + {1000}, // n_queries + {32}, // k + {64}, // knn graph degree + {64}, // itopk_size + {64, 128, 256, 512, 1024}, // block_size + {1}, // search_width + {0} // max_iterations + ); + inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + return inputs; +} + +const std::vector kCagraInputs = generate_inputs(); + +#define CAGRA_REGISTER(ValT, IdxT, inputs) \ + namespace BENCHMARK_PRIVATE_NAME(knn) { \ + using AnnCagra = CagraBench; \ + RAFT_BENCH_REGISTER(AnnCagra, #ValT "/" #IdxT, inputs); \ + } + +} // namespace raft::bench::neighbors diff --git a/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu b/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu new file mode 100644 index 0000000000..5d762f6e85 --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/cagra_float_uint32_t.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../cagra_bench.cuh" + +namespace raft::bench::neighbors { + +CAGRA_REGISTER(float, uint32_t, kCagraInputs); + +} // namespace raft::bench::neighbors diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 1948169c91..09353125b9 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "RAFT C++ API" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "23.06" +PROJECT_NUMBER = "23.08" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -893,11 +893,8 @@ EXCLUDE = ../include/raft/sparse/linalg/symmetrize.hpp \ ../include/raft/util/device_utils.cuh \ ../include/raft/core/error.hpp \ ../include/raft/core/handle.hpp \ - ../include/raft/util/integer_utils.hpp \ - ../include/raft/core/interruptible.hpp \ - ../include/raft/core/mdarray.hpp \ + ../include/raft/util/integer_utils.hpp ../include/raft/util/pow2_utils.cuh \ - ../include/raft/core/span.hpp \ ../include/raft/util/vectorized.cuh \ ../include/raft/raft.hpp \ ../include/raft/core/cudart_utils.hpp \ diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh index c4dd74f255..a962d4b7c6 100644 --- a/cpp/include/raft/cluster/detail/mst.cuh +++ b/cpp/include/raft/cluster/detail/mst.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -81,8 +81,20 @@ void connect_knn_graph( raft::sparse::COO connected_edges(stream); - raft::sparse::neighbors::connect_components( - handle, connected_edges, X, color, m, n, reduction_op); + // default row and column batch sizes are chosen for computing cross component nearest neighbors. 
+ // Reference: PR #1445 + static constexpr size_t default_row_batch_size = 4096; + static constexpr size_t default_col_batch_size = 16; + + raft::sparse::neighbors::cross_component_nn(handle, + connected_edges, + X, + color, + m, + n, + reduction_op, + min(m, default_row_batch_size), + min(n, default_col_batch_size)); rmm::device_uvector indptr2(m + 1, stream); raft::sparse::convert::sorted_coo_to_csr( @@ -192,4 +204,4 @@ void build_sorted_mst( raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream); } -}; // namespace raft::cluster::detail +}; // namespace raft::cluster::detail \ No newline at end of file diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh index ddd422a89b..848ca0357e 100644 --- a/cpp/include/raft/cluster/detail/single_linkage.cuh +++ b/cpp/include/raft/cluster/detail/single_linkage.cuh @@ -81,7 +81,7 @@ void single_linkage(raft::resources const& handle, * 2. Construct MST, sorted by weights */ rmm::device_uvector color(m, stream); - raft::sparse::neighbors::FixConnectivitiesRedOp op(color.data(), m); + raft::sparse::neighbors::FixConnectivitiesRedOp op(m); detail::build_sorted_mst(handle, X, indptr.data(), diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index 8b92ed48f7..de2a7d3415 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -28,6 +28,8 @@ #include +#include + #include #include @@ -138,50 +140,39 @@ class std_comms : public comms_iface { update_host(h_colors.data(), d_colors.data(), get_size(), stream_); update_host(h_keys.data(), d_keys.data(), get_size(), stream_); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_)); - - std::vector subcomm_ranks{}; - std::vector new_ucx_ptrs{}; + this->sync_stream(stream_); - for (int i = 0; i < get_size(); ++i) { - if (h_colors[i] == color) { - subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && 
subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } - } - } + ncclComm_t nccl_comm; + // Create a structure to allgather... ncclUniqueId id{}; - if (get_rank() == subcomm_ranks[0]) { // root of the new subcommunicator - RAFT_NCCL_TRY(ncclGetUniqueId(&id)); - std::vector requests(subcomm_ranks.size() - 1); - for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); - } - waitall(requests.size(), requests.data()); - } else { - request_t request{}; - irecv(&id, sizeof(ncclUniqueId), subcomm_ranks[0], color, &request); - waitall(1, &request); - } - // FIXME: this seems unnecessary, do more testing and remove this - barrier(); + rmm::device_uvector d_nccl_ids(get_size(), stream_); - ncclComm_t nccl_comm; - RAFT_NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); - - if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms(nccl_comm, - (ucp_worker_h)ucp_worker_, - eps_sp, - subcomm_ranks.size(), - key, - stream_, - subcomms_ucp_)); - } else { - return std::unique_ptr( - new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); - } + if (key == 0) { RAFT_NCCL_TRY(ncclGetUniqueId(&id)); } + + update_device(d_nccl_ids.data() + get_rank(), &id, 1, stream_); + + allgather(d_nccl_ids.data() + get_rank(), + d_nccl_ids.data(), + sizeof(ncclUniqueId), + datatype_t::UINT8, + stream_); + + auto offset = + std::distance(thrust::make_zip_iterator(h_colors.begin(), h_keys.begin()), + std::find_if(thrust::make_zip_iterator(h_colors.begin(), h_keys.begin()), + thrust::make_zip_iterator(h_colors.end(), h_keys.end()), + [color](auto tuple) { return thrust::get<0>(tuple) == color; })); + + auto subcomm_size = std::count(h_colors.begin(), h_colors.end(), color); + + update_host(&id, d_nccl_ids.data() + offset, 1, stream_); + + this->sync_stream(stream_); + + 
RAFT_NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_size, id, key)); + + return std::unique_ptr(new std_comms(nccl_comm, subcomm_size, key, stream_)); } void barrier() const diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp index a5f7c05493..52ac69f163 100644 --- a/cpp/include/raft/core/coo_matrix.hpp +++ b/cpp/include/raft/core/coo_matrix.hpp @@ -23,6 +23,11 @@ namespace raft { +/** + * \defgroup coo_matrix COO Matrix + * @{ + */ + template class coordinate_structure_t : public sparse_structure { public: @@ -289,4 +294,7 @@ class coo_matrix } } }; + +/** @} */ + } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/csr_matrix.hpp b/cpp/include/raft/core/csr_matrix.hpp index 95d09d3eea..1113cc2023 100644 --- a/cpp/include/raft/core/csr_matrix.hpp +++ b/cpp/include/raft/core/csr_matrix.hpp @@ -22,6 +22,11 @@ namespace raft { +/** + * \defgroup csr_matrix CSR Matrix + * @{ + */ + template class compressed_structure_t : public sparse_structure { public: @@ -301,4 +306,7 @@ class csr_matrix } } }; + +/** @} */ + } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index 390acea697..bb4207938b 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -22,6 +22,14 @@ #endif #endif +#if defined(_RAFT_HAS_CUDA) +#define CUDA_CONDITION_ELSE_TRUE(condition) condition +#define CUDA_CONDITION_ELSE_FALSE(condition) condition +#else +#define CUDA_CONDITION_ELSE_TRUE(condition) true +#define CUDA_CONDITION_ELSE_FALSE(condition) false +#endif + #ifndef _RAFT_HOST_DEVICE #if defined(_RAFT_HAS_CUDA) #define _RAFT_DEVICE __device__ @@ -40,6 +48,10 @@ #define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE #endif +#ifndef RAFT_DEVICE_INLINE_FUNCTION +#define RAFT_DEVICE_INLINE_FUNCTION _RAFT_DEVICE _RAFT_FORCEINLINE +#endif + // The RAFT_INLINE_CONDITIONAL is a conditional 
inline specifier that removes // the inline specification when RAFT_COMPILED is defined. // diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index eef981e56f..011de307db 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -164,10 +164,19 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - return container_type(n, resource::get_cuda_stream(res), resource::get_workspace_resource(res)); + if (mr_ == nullptr) { + // NB: not using the workspace resource by default! + // The workspace resource is for short-lived temporary allocations. + return container_type(n, resource::get_cuda_stream(res)); + } else { + return container_type(n, resource::get_cuda_stream(res), mr_); + } } - device_uvector_policy() = default; + constexpr device_uvector_policy() = default; + constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr) + { + } [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -181,6 +190,9 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } + + private: + rmm::mr::device_memory_resource* mr_{nullptr}; }; } // namespace raft diff --git a/cpp/include/raft/core/device_coo_matrix.hpp b/cpp/include/raft/core/device_coo_matrix.hpp index 67aa4e12f1..41da605ff0 100644 --- a/cpp/include/raft/core/device_coo_matrix.hpp +++ b/cpp/include/raft/core/device_coo_matrix.hpp @@ -23,14 +23,26 @@ namespace raft { -template +using device_coordinate_structure_view = coordinate_structure_view; + +/** + * Specialization for a sparsity-owning coordinate structure which uses device memory + */ +template typename ContainerPolicy = device_uvector_policy, - SparsityType 
sparsity_type = SparsityType::OWNING> -using device_coo_matrix = - coo_matrix; + template typename ContainerPolicy = device_uvector_policy> +using device_coordinate_structure = + coordinate_structure; /** * Specialization for a coo matrix view which uses device memory @@ -38,6 +50,15 @@ using device_coo_matrix = template using device_coo_matrix_view = coo_matrix_view; +template typename ContainerPolicy = device_uvector_policy, + SparsityType sparsity_type = SparsityType::OWNING> +using device_coo_matrix = + coo_matrix; + /** * Specialization for a sparsity-owning coo matrix which uses device memory */ @@ -62,21 +83,15 @@ using device_sparsity_preserving_coo_matrix = coo_matrix; -/** - * Specialization for a sparsity-owning coordinate structure which uses device memory - */ -template typename ContainerPolicy = device_uvector_policy> -using device_coordinate_structure = - coordinate_structure; +template +struct is_device_coo_matrix_view : std::false_type {}; -/** - * Specialization for a sparsity-preserving coordinate structure view which uses device memory - */ -template -using device_coordinate_structure_view = coordinate_structure_view; +template +struct is_device_coo_matrix_view> + : std::true_type {}; + +template +constexpr bool is_device_coo_matrix_view_v = is_device_coo_matrix_view::value; template struct is_device_coo_matrix : std::false_type {}; @@ -378,4 +393,6 @@ auto make_device_coordinate_structure_view(raft::device_span rows, return device_coordinate_structure_view(rows, cols, n_rows, n_cols); } +/** @} */ + }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_csr_matrix.hpp b/cpp/include/raft/core/device_csr_matrix.hpp index 1495609d75..da4ac117b1 100644 --- a/cpp/include/raft/core/device_csr_matrix.hpp +++ b/cpp/include/raft/core/device_csr_matrix.hpp @@ -25,6 +25,34 @@ namespace raft { +/** + * \defgroup device_csr_matrix Device CSR Matrix Types + * @{ + */ + +/** + * Specialization for a sparsity-preserving 
compressed structure view which uses device memory + */ +template +using device_compressed_structure_view = + compressed_structure_view; + +/** + * Specialization for a sparsity-owning compressed structure which uses device memory + */ +template typename ContainerPolicy = device_uvector_policy> +using device_compressed_structure = + compressed_structure; + +/** + * Specialization for a csr matrix view which uses device memory + */ +template +using device_csr_matrix_view = csr_matrix_view; + template ; +/** + * Specialization for a sparsity-preserving csr matrix which uses device memory + */ +template typename ContainerPolicy = device_uvector_policy> +using device_sparsity_preserving_csr_matrix = csr_matrix; + +template +struct is_device_csr_matrix_view : std::false_type {}; + +template +struct is_device_csr_matrix_view< + device_csr_matrix_view> : std::true_type {}; + +template +constexpr bool is_device_csr_matrix_view_v = is_device_csr_matrix_view::value; + template struct is_device_csr_matrix : std::false_type {}; @@ -70,51 +124,6 @@ template constexpr bool is_device_csr_sparsity_preserving_v = is_device_csr_matrix::value and T::get_sparsity_type() == PRESERVING; -/** - * Specialization for a csr matrix view which uses device memory - */ -template -using device_csr_matrix_view = csr_matrix_view; - -/** - * Specialization for a sparsity-preserving csr matrix which uses device memory - */ -template typename ContainerPolicy = device_uvector_policy> -using device_sparsity_preserving_csr_matrix = csr_matrix; - -/** - * Specialization for a csr matrix view which uses device memory - */ -template -using device_csr_matrix_view = csr_matrix_view; - -/** - * Specialization for a sparsity-owning compressed structure which uses device memory - */ -template typename ContainerPolicy = device_uvector_policy> -using device_compressed_structure = - compressed_structure; - -/** - * Specialization for a sparsity-preserving compressed structure view which uses device memory - */ 
-template -using device_compressed_structure_view = - compressed_structure_view; - /** * Create a sparsity-owning sparse matrix in the compressed-sparse row format. sparsity-owning * means that all of the underlying vectors (data, indptr, indices) are owned by the csr_matrix @@ -410,4 +419,6 @@ auto make_device_compressed_structure_view(raft::device_span indptr, return device_compressed_structure_view(indptr, indices, n_cols); } +/** @} */ + }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 68273db15c..fe543c97dd 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -112,7 +112,7 @@ auto make_device_mdarray(raft::resources const& handle, using mdarray_t = device_mdarray; typename mdarray_t::mapping_type layout{exts}; - typename mdarray_t::container_policy_type policy{}; + typename mdarray_t::container_policy_type policy{mr}; return mdarray_t{handle, layout, policy}; } diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index c620a688b9..cf06920a8c 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -60,12 +61,12 @@ namespace raft { class device_resources : public resources { public: device_resources(const device_resources& handle, - rmm::mr::device_memory_resource* workspace_resource) + std::shared_ptr workspace_resource, + std::optional allocation_limit = std::nullopt) : resources{handle} { // replace the resource factory for the workspace_resources - resources::add_resource_factory( - std::make_shared(workspace_resource)); + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); } device_resources(const device_resources& handle) : resources{handle} {} @@ -80,10 +81,13 @@ class device_resources : public resources { * 
@param[in] stream_pool the stream pool used (which has default of nullptr if unspecified) * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. + * @param[in] allocation_limit the total amount of memory in bytes available to the temporary + * workspace resources. */ device_resources(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + std::shared_ptr workspace_resource = {nullptr}, + std::optional allocation_limit = std::nullopt) : resources{} { resources::add_resource_factory(std::make_shared()); @@ -91,8 +95,9 @@ class device_resources : public resources { std::make_shared(stream_view)); resources::add_resource_factory( std::make_shared(stream_pool)); - resources::add_resource_factory( - std::make_shared(workspace_resource)); + if (workspace_resource) { + resource::set_workspace_resource(*this, workspace_resource, allocation_limit); + } } /** Destroys all held-up resources */ @@ -255,4 +260,4 @@ class stream_syncer { } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/core/handle.hpp b/cpp/include/raft/core/handle.hpp index 2a6b5657e2..124ab8c315 100644 --- a/cpp/include/raft/core/handle.hpp +++ b/cpp/include/raft/core/handle.hpp @@ -32,7 +32,8 @@ namespace raft { */ class handle_t : public raft::device_resources { public: - handle_t(const handle_t& handle, rmm::mr::device_memory_resource* workspace_resource) + handle_t(const handle_t& handle, + std::shared_ptr workspace_resource) : device_resources(handle, workspace_resource) { } @@ -51,9 +52,9 @@ class handle_t : public raft::device_resources { * @param[in] workspace_resource an optional resource used by some functions for allocating * temporary workspaces. 
*/ - handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, - std::shared_ptr stream_pool = {nullptr}, - rmm::mr::device_memory_resource* workspace_resource = nullptr) + handle_t(rmm::cuda_stream_view stream_view = rmm::cuda_stream_per_thread, + std::shared_ptr stream_pool = {nullptr}, + std::shared_ptr workspace_resource = {nullptr}) : device_resources{stream_view, stream_pool, workspace_resource} { } diff --git a/cpp/include/raft/core/host_coo_matrix.hpp b/cpp/include/raft/core/host_coo_matrix.hpp index 32e7a9e3c4..7a216dc8a2 100644 --- a/cpp/include/raft/core/host_coo_matrix.hpp +++ b/cpp/include/raft/core/host_coo_matrix.hpp @@ -22,14 +22,26 @@ namespace raft { -template +using host_coordinate_structure_view = coordinate_structure_view; + +/** + * Specialization for a sparsity-owning coordinate structure which uses host memory + */ +template typename ContainerPolicy = host_vector_policy, - SparsityType sparsity_type = SparsityType::OWNING> -using host_coo_matrix = - coo_matrix; + template typename ContainerPolicy = host_vector_policy> +using host_coordinate_structure = + coordinate_structure; /** * Specialization for a coo matrix view which uses host memory @@ -37,6 +49,15 @@ using host_coo_matrix = template using host_coo_matrix_view = coo_matrix_view; +template typename ContainerPolicy = host_vector_policy, + SparsityType sparsity_type = SparsityType::OWNING> +using host_coo_matrix = + coo_matrix; + /** * Specialization for a sparsity-owning coo matrix which uses host memory */ @@ -61,21 +82,15 @@ using host_sparsity_preserving_coo_matrix = coo_matrix; -/** - * Specialization for a sparsity-owning coordinate structure which uses host memory - */ -template typename ContainerPolicy = host_vector_policy> -using host_coordinate_structure = - coordinate_structure; +template +struct is_host_coo_matrix_view : std::false_type {}; -/** - * Specialization for a sparsity-preserving coordinate structure view which uses host memory - */ -template -using 
host_coordinate_structure_view = coordinate_structure_view; +template +struct is_host_coo_matrix_view> + : std::true_type {}; + +template +constexpr bool is_host_coo_matrix_view_v = is_host_coo_matrix_view::value; template struct is_host_coo_matrix : std::false_type {}; @@ -376,4 +391,6 @@ auto make_host_coordinate_structure_view(raft::host_span rows, return host_coordinate_structure_view(rows, cols, n_rows, n_cols); } +/** @} */ + }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/host_csr_matrix.hpp b/cpp/include/raft/core/host_csr_matrix.hpp index 86199335f2..f32ff1dc00 100644 --- a/cpp/include/raft/core/host_csr_matrix.hpp +++ b/cpp/include/raft/core/host_csr_matrix.hpp @@ -24,6 +24,34 @@ namespace raft { +/** + * \defgroup host_csr_matrix Host CSR Matrix + * @{ + */ + +/** + * Specialization for a sparsity-preserving compressed structure view which uses host memory + */ +template +using host_compressed_structure_view = + compressed_structure_view; + +/** + * Specialization for a sparsity-owning compressed structure which uses host memory + */ +template typename ContainerPolicy = host_vector_policy> +using host_compressed_structure = + compressed_structure; + +/** + * Specialization for a csr matrix view which uses host memory + */ +template +using host_csr_matrix_view = csr_matrix_view; + template ; +/** + * Specialization for a sparsity-preserving csr matrix which uses host memory + */ +template typename ContainerPolicy = host_vector_policy> +using host_sparsity_preserving_csr_matrix = csr_matrix; + +template +struct is_host_csr_matrix_view : std::false_type {}; + +template +struct is_host_csr_matrix_view> + : std::true_type {}; + +template +constexpr bool is_host_csr_matrix_view_v = is_host_csr_matrix_view::value; + template struct is_host_csr_matrix : std::false_type {}; @@ -66,53 +120,9 @@ constexpr bool is_host_csr_sparsity_owning_v = is_host_csr_matrix::value and T::get_sparsity_type() == OWNING; template -constexpr bool 
is_host_csr_sparsity_preserving_v = - is_host_csr_matrix::value and T::get_sparsity_type() == PRESERVING; - -/** - * Specialization for a csr matrix view which uses host memory - */ -template -using host_csr_matrix_view = csr_matrix_view; - -/** - * Specialization for a sparsity-preserving csr matrix which uses host memory - */ -template typename ContainerPolicy = host_vector_policy> -using host_sparsity_preserving_csr_matrix = csr_matrix; - -/** - * Specialization for a csr matrix view which uses host memory - */ -template -using host_csr_matrix_view = csr_matrix_view; - -/** - * Specialization for a sparsity-owning compressed structure which uses host memory - */ -template typename ContainerPolicy = host_vector_policy> -using host_compressed_structure = - compressed_structure; - -/** - * Specialization for a sparsity-preserving compressed structure view which uses host memory - */ -template -using host_compressed_structure_view = - compressed_structure_view; +constexpr bool is_host_csr_sparsity_preserving_v = std::disjunction_v< + is_host_csr_matrix_view, + std::bool_constant::value and T::get_sparsity_type() == PRESERVING>>; /** * Create a sparsity-owning sparse matrix in the compressed-sparse row format. 
sparsity-owning @@ -410,4 +420,6 @@ auto make_host_compressed_structure_view(raft::host_span indptr, return host_compressed_structure_view(indptr, indices, n_cols); } +/** @} */ + }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/host_span.hpp b/cpp/include/raft/core/host_span.hpp index 8b37414e76..36978dfca4 100644 --- a/cpp/include/raft/core/host_span.hpp +++ b/cpp/include/raft/core/host_span.hpp @@ -21,7 +21,7 @@ namespace raft { /** - * @defgroup device_span one-dimensional device span type + * @defgroup host_span one-dimensional host span type * @{ */ diff --git a/cpp/include/raft/core/interruptible.hpp b/cpp/include/raft/core/interruptible.hpp index f7351c3411..10ab22f820 100644 --- a/cpp/include/raft/core/interruptible.hpp +++ b/cpp/include/raft/core/interruptible.hpp @@ -303,7 +303,7 @@ class interruptible { }; /** - * @} + * @} // end doxygen group interruptible */ } // namespace raft diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index c5f08b84b7..56a8d78926 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -22,10 +22,15 @@ #include +#if defined(_RAFT_HAS_CUDA) +#include +#include +#endif + namespace raft { /** - * @defgroup Absolute Absolute value + * @defgroup math_functions Mathematical Functions * @{ */ template @@ -50,12 +55,7 @@ constexpr RAFT_INLINE_FUNCTION auto abs(T x) { return x < T{0} ? 
-x : x; } -/** @} */ -/** - * @defgroup Trigonometry Trigonometry functions - * @{ - */ /** Inverse cosine */ template RAFT_INLINE_FUNCTION auto acos(T x) @@ -90,7 +90,10 @@ RAFT_INLINE_FUNCTION auto atanh(T x) } /** Cosine */ -template +template && + (!std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto cos(T x) { #ifdef __CUDA_ARCH__ @@ -100,8 +103,38 @@ RAFT_INLINE_FUNCTION auto cos(T x) #endif } -/** Sine */ +#if defined(_RAFT_HAS_CUDA) template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> cos(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hcos(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +cos(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hcos(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + +/** Sine */ +template && + (!std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto sin(T x) { #ifdef __CUDA_ARCH__ @@ -111,6 +144,33 @@ RAFT_INLINE_FUNCTION auto sin(T x) #endif } +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> sin(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsin(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +sin(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsin(x); +#else + // Fail during template instantiation if the compute capability doesn't support this 
operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Sine and cosine */ template RAFT_INLINE_FUNCTION std::enable_if_t || std::is_same_v> sincos( @@ -134,14 +194,12 @@ RAFT_INLINE_FUNCTION auto tanh(T x) return std::tanh(x); #endif } -/** @} */ -/** - * @defgroup Exponential Exponential and logarithm - * @{ - */ /** Exponential function */ -template +template && + (!std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto exp(T x) { #ifdef __CUDA_ARCH__ @@ -151,8 +209,38 @@ RAFT_INLINE_FUNCTION auto exp(T x) #endif } -/** Natural logarithm */ +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> exp(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hexp(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +exp(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hexp(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + +/** Natural logarithm */ +template && + (!std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto log(T x) { #ifdef __CUDA_ARCH__ @@ -161,12 +249,36 @@ RAFT_INLINE_FUNCTION auto log(T x) return std::log(x); #endif } -/** @} */ + +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> log(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hlog(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); 
+ return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +log(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hlog(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif /** - * @defgroup Maximum Maximum of two or more values. - * - * The CUDA Math API has overloads for all combinations of float/double. We provide similar + * @brief The CUDA Math API has overloads for all combinations of float/double. We provide similar * functionality while wrapping around std::max, which only supports arguments of the same type. * However, though the CUDA Math API supports combinations of unsigned and signed integers, this is * very error-prone so we do not support that and require the user to cast instead. (e.g the max of @@ -176,7 +288,13 @@ RAFT_INLINE_FUNCTION auto log(T x) * same (and that the less-than operator be defined). 
* @{ */ -template +template < + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ @@ -208,6 +326,34 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y) #endif } +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> max(T x, + T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmax(x, y); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +max(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmax(x, y); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y, Args&&... 
args) @@ -221,10 +367,36 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x) { return x; } -/** @} */ + +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> max(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +max(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return x; +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif /** - * @defgroup Minimum Minimum of two or more values. + * @brief Minimum of two or more values. * * The CUDA Math API has overloads for all combinations of float/double. We provide similar * functionality while wrapping around std::min, which only supports arguments of the same type. @@ -236,7 +408,13 @@ constexpr RAFT_INLINE_FUNCTION auto max(const T& x) * same (and that the less-than operator be defined). 
* @{ */ -template +template < + typename T1, + typename T2, + std::enable_if_t && !std::is_same_v) || + (!std::is_same_v && !std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) { #ifdef __CUDA_ARCH__ @@ -268,6 +446,34 @@ RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y) #endif } +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> min(T x, + T y) +{ +#if (__CUDA_ARCH__ >= 530) + return ::__hmin(x, y); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +min(T x, T y) +{ +#if (__CUDA_ARCH__ >= 800) + return ::__hmin(x, y); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Many-argument overload to avoid verbose nested calls or use with variadic arguments */ template RAFT_INLINE_FUNCTION auto min(const T1& x, const T2& y, Args&&... 
args) @@ -281,12 +487,35 @@ constexpr RAFT_INLINE_FUNCTION auto min(const T& x) { return x; } -/** @} */ -/** - * @defgroup Power Power and root functions - * @{ - */ +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> min( + T x) +{ +#if (__CUDA_ARCH__ >= 530) + return x; +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +min(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return x; +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on __CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif + /** Power */ template RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) @@ -299,7 +528,10 @@ RAFT_INLINE_FUNCTION auto pow(T1 x, T2 y) } /** Square root */ -template +template && + (!std::is_same_v)))), + int> = 0> RAFT_INLINE_FUNCTION auto sqrt(T x) { #ifdef __CUDA_ARCH__ @@ -308,7 +540,33 @@ RAFT_INLINE_FUNCTION auto sqrt(T x) return std::sqrt(x); #endif } -/** @} */ + +#if defined(_RAFT_HAS_CUDA) +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, __half> sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 530) + return ::hsqrt(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "__half is only supported on __CUDA_ARCH__ >= 530"); + return T{}; +#endif +} + +template +RAFT_DEVICE_INLINE_FUNCTION typename std::enable_if_t, nv_bfloat16> +sqrt(T x) +{ +#if (__CUDA_ARCH__ >= 800) + return ::hsqrt(x); +#else + // Fail during template instantiation if the compute capability doesn't support this operation + static_assert(sizeof(T) != sizeof(T), "nv_bfloat16 is only supported on 
__CUDA_ARCH__ >= 800"); + return T{}; +#endif +} +#endif /** Sign */ template @@ -317,4 +575,6 @@ RAFT_INLINE_FUNCTION auto sgn(T val) -> int return (T(0) < val) - (val < T(0)); } +/** @} */ + } // namespace raft diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 7bd5a28a0c..2cdeb36fc8 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -34,7 +34,7 @@ namespace raft { /** - * @defgroup mdarray multi-dimensional memory-owning type + * @defgroup mdarray_apis multi-dimensional memory-owning type * @{ */ @@ -343,9 +343,7 @@ class mdarray container_type c_; }; -/** - * @} - */ +/** @} */ /** * @defgroup mdarray_reshape Row- or Col-norm computation @@ -387,8 +385,6 @@ auto reshape(const array_interface_type& mda, extents new return reshape(mda.view(), new_shape); } -/** - * }@ - */ +/** @} */ } // namespace raft diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index e87c76d82d..f1a1adb916 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -270,6 +270,13 @@ auto reshape(mdspan_type mds, extents new_shape) new_shape); } +/* @} */ + +/** + * @defgroup mdspan_unravel Unravel mdspan + * @{ + */ + /** * \brief Turns linear index into coordinate. Similar to numpy unravel_index. * @@ -303,9 +310,7 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, } } -/** - * @} - */ +/** @} */ /** * @brief Const accessor specialization for default_accessor @@ -337,6 +342,11 @@ accessor_of_const(host_device_accessor mds) mds.data_handle(), mds.mapping(), acc_c}; } +/** @} */ + } // namespace raft diff --git a/cpp/include/raft/core/mdspan_types.hpp b/cpp/include/raft/core/mdspan_types.hpp index 07c69f472c..62f95b6afc 100644 --- a/cpp/include/raft/core/mdspan_types.hpp +++ b/cpp/include/raft/core/mdspan_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,18 +24,13 @@ using std::experimental::dynamic_extent; using std::experimental::extents; /** - * @defgroup C-Contiguous layout for mdarray and mdspan. Implies row-major and contiguous memory. + * @defgroup mdspan_layout C- and F-contiguous mdspan layouts * @{ */ using std::experimental::layout_right; using layout_c_contiguous = layout_right; using row_major = layout_right; -/** @} */ -/** - * @defgroup F-Contiguous layout for mdarray and mdspan. Implies column-major and contiguous memory. - * @{ - */ using std::experimental::layout_left; using layout_f_contiguous = layout_left; using col_major = layout_left; diff --git a/cpp/include/raft/core/resource/detail/device_memory_resource.hpp b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp new file mode 100644 index 0000000000..9d3f13689d --- /dev/null +++ b/cpp/include/raft/core/resource/detail/device_memory_resource.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include +#include +#include + +namespace raft::resource::detail { + +/** + * Warn a user of the calling algorithm if they use the default non-pooled memory allocator, + * as it may hurt the performance. 
+ * + * This helper function is designed to produce the warning once for a given `user_name`. + * + * @param[in] res + * @param[in] user_name the name of the algorithm or any other identification. + * + */ +inline void warn_non_pool_workspace(resources const& res, std::string user_name) +{ + // Detect if the plain cuda memory resource is used for the workspace + if (rmm::mr::cuda_memory_resource{}.is_equal(*get_workspace_resource(res)->get_upstream())) { + static std::set notified_names{}; + static std::mutex mutex{}; + std::lock_guard guard(mutex); + auto [it, inserted] = notified_names.insert(std::move(user_name)); + if (inserted) { + RAFT_LOG_WARN( + "[%s] the default cuda resource is used for the raft workspace allocations. This may lead " + "to a significant slowdown for this algorithm. Consider using the default pool resource " + "(`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly " + "(`raft::resource::set_workspace_resource`).", + it->c_str()); + } + } +} + +} // namespace raft::resource::detail diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index ebc41e0f8e..9aa9e4fb85 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -15,24 +15,55 @@ */ #pragma once +#include #include #include +#include + #include +#include #include +#include + +#include +#include namespace raft::resource { -class device_memory_resource : public resource { + +/** + * \defgroup device_memory_resource Device memory resources + * @{ + */ + +class limiting_memory_resource : public resource { public: - device_memory_resource(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) + limiting_memory_resource(std::shared_ptr mr, + std::size_t allocation_limit, + std::optional alignment) + : upstream_(mr), mr_(make_adaptor(mr, allocation_limit, alignment)) { - if (mr_ == nullptr) { mr = 
rmm::mr::get_current_device_resource(); } } - void* get_resource() override { return mr; } - ~device_memory_resource() override {} + auto get_resource() -> void* override { return &mr_; } + + ~limiting_memory_resource() override = default; private: - rmm::mr::device_memory_resource* mr; + std::shared_ptr upstream_; + rmm::mr::limiting_resource_adaptor mr_; + + static inline auto make_adaptor(std::shared_ptr upstream, + std::size_t limit, + std::optional alignment) + -> rmm::mr::limiting_resource_adaptor + { + auto p = upstream.get(); + if (alignment.has_value()) { + return rmm::mr::limiting_resource_adaptor(p, limit, alignment.value()); + } else { + return rmm::mr::limiting_resource_adaptor(p, limit); + } + } }; /** @@ -41,36 +72,175 @@ class device_memory_resource : public resource { */ class workspace_resource_factory : public resource_factory { public: - workspace_resource_factory(rmm::mr::device_memory_resource* mr_ = nullptr) : mr(mr_) {} - resource_type get_resource_type() override { return resource_type::WORKSPACE_RESOURCE; } - resource* make_resource() override { return new device_memory_resource(mr); } + explicit workspace_resource_factory( + std::shared_ptr mr = {nullptr}, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) + : allocation_limit_(allocation_limit.value_or(default_allocation_limit())), + alignment_(alignment), + mr_(mr ? mr : default_plain_resource()) + { + } + + auto get_resource_type() -> resource_type override { return resource_type::WORKSPACE_RESOURCE; } + auto make_resource() -> resource* override + { + return new limiting_memory_resource(mr_, allocation_limit_, alignment_); + } + + /** Construct a sensible default pool memory resource. */ + static inline auto default_pool_resource(std::size_t limit) + -> std::shared_ptr + { + // Set the default granularity to 1 GiB + constexpr std::size_t kOneGb = 1024lu * 1024lu * 1024lu; + // The initial size of the pool. 
The choice of this value only affects the performance a little + // bit. Heuristics: + // 1) the pool shouldn't be too big from the beginning independently of the limit; + // 2) otherwise, set it to half the max size to avoid too many resize calls. + auto min_size = std::min(kOneGb, limit / 2lu); + // The pool is going to be place behind the limiting resource adaptor. This means the user won't + // be able to allocate more than 'limit' bytes of memory anyway. At the same time, the pool + // itself may consume a little bit more memory than the 'limit' due to memory fragmentation. + // Therefore, we look for a compromise, such that: + // 1) 'limit' is accurate - the user should be more likely to run into the limiting + // resource adaptor bad_alloc error than into the pool bad_alloc error. + // 2) The pool doesn't grab too much memory on top of the 'limit'. + auto max_size = std::min(limit + kOneGb / 2lu, limit * 3lu / 2lu); + auto upstream = rmm::mr::get_current_device_resource(); + RAFT_LOG_DEBUG( + "Setting the workspace pool resource; memory limit = %zu, initial pool size = %zu, max pool " + "size = %zu.", + limit, + min_size, + max_size); + return std::make_shared>( + upstream, min_size, max_size); + } + + /** + * Get the global memory resource wrapped into an unmanaged shared_ptr (with no deleter). + * + * Note: the lifetime of the underlying `rmm::mr::get_current_device_resource()` is managed + * somewhere else, since it's passed by a raw pointer. Hence, this shared_ptr wrapper is not + * allowed to delete the pointer on destruction. 
+ */ + static inline auto default_plain_resource() -> std::shared_ptr + { + return std::shared_ptr{rmm::mr::get_current_device_resource(), + void_op{}}; + } private: - rmm::mr::device_memory_resource* mr; + std::size_t allocation_limit_; + std::optional alignment_; + std::shared_ptr mr_; + + static inline auto default_allocation_limit() -> std::size_t + { + std::size_t free_size{}; + std::size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + // Note, the workspace does not claim all this memory from the start, so it's still usable by + // the main resource as well. + // This limit is merely an order for algorithm internals to plan the batching accordingly. + return total_size / 2; + } }; /** * Load a temp workspace resource from a resources instance (and populate it on the res * if needed). + * * @param res raft resources object for managing resources * @return device memory resource object */ -inline rmm::mr::device_memory_resource* get_workspace_resource(resources const& res) +inline auto get_workspace_resource(resources const& res) + -> rmm::mr::limiting_resource_adaptor* { if (!res.has_resource_factory(resource_type::WORKSPACE_RESOURCE)) { res.add_resource_factory(std::make_shared()); } - return res.get_resource(resource_type::WORKSPACE_RESOURCE); + return res.get_resource>( + resource_type::WORKSPACE_RESOURCE); +}; + +/** Get the total size of the workspace resource. */ +inline auto get_workspace_total_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocation_limit(); +}; + +/** Get the already allocated size of the workspace resource. */ +inline auto get_workspace_used_bytes(resources const& res) -> size_t +{ + return get_workspace_resource(res)->get_allocated_bytes(); +}; + +/** Get the available size of the workspace resource. 
*/ +inline auto get_workspace_free_bytes(resources const& res) -> size_t +{ + const auto* p = get_workspace_resource(res); + return p->get_allocation_limit() - p->get_allocated_bytes(); +}; + +/** + * Set a temporary workspace resource on a resources instance. + * + * @param res raft resources object for managing resources + * @param mr an optional RMM device_memory_resource + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + * @param alignment optional alignment requirements passed to RMM allocations + * + */ +inline void set_workspace_resource(resources const& res, + std::shared_ptr mr = {nullptr}, + std::optional allocation_limit = std::nullopt, + std::optional alignment = std::nullopt) +{ + res.add_resource_factory( + std::make_shared(mr, allocation_limit, alignment)); }; /** - * Set a temp workspace resource on a resources instance. + * Set the temporary workspace resource to a pool on top of the global memory resource + * (`rmm::mr::get_current_device_resource()`. * * @param res raft resources object for managing resources - * @param mr a valid rmm device_memory_resource + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources; + * if not provided, a last used or default limit is used. + * */ -inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_resource* mr) +inline void set_workspace_to_pool_resource( + resources const& res, std::optional allocation_limit = std::nullopt) { - res.add_resource_factory(std::make_shared(mr)); + if (!allocation_limit.has_value()) { allocation_limit = get_workspace_total_bytes(res); } + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_pool_resource(*allocation_limit), + allocation_limit, + std::nullopt)); }; + +/** + * Set the temporary workspace resource the same as the global memory resource + * (`rmm::mr::get_current_device_resource()`. 
+ * + * Note, the workspace resource is always limited; the limit here defines how much of the global + * memory resource can be consumed by the workspace allocations. + * + * @param res raft resources object for managing resources + * @param allocation_limit + * the total amount of memory in bytes available to the temporary workspace resources. + */ +inline void set_workspace_to_global_resource( + resources const& res, std::optional allocation_limit = std::nullopt) +{ + res.add_resource_factory(std::make_shared( + workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt)); +}; + +/** @} */ + } // namespace raft::resource diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp index 7070b61c54..11d2aed1e0 100644 --- a/cpp/include/raft/core/resource/sub_comms.hpp +++ b/cpp/include/raft/core/resource/sub_comms.hpp @@ -43,7 +43,7 @@ class sub_comms_resource_factory : public resource_factory { }; /** - * @defgroup resource_subcomms Subcommunicator resource functions + * @defgroup resource_sub_comms Subcommunicator resource functions * @{ */ diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp index e0f51b61b4..d5bd176d50 100644 --- a/cpp/include/raft/core/resources.hpp +++ b/cpp/include/raft/core/resources.hpp @@ -95,6 +95,11 @@ class resources { RAFT_EXPECTS(rtype != resource::resource_type::LAST_KEY, "LAST_KEY is a placeholder and not a valid resource factory type."); factories_.at(rtype) = std::make_pair(rtype, factory); + // Clear the corresponding resource, so that on next `get_resource` the new factory is used + if (resources_.at(rtype).first != resource::resource_type::LAST_KEY) { + resources_.at(rtype) = std::make_pair(resource::resource_type::LAST_KEY, + std::make_shared()); + } } /** diff --git a/cpp/include/raft/core/span.hpp b/cpp/include/raft/core/span.hpp index 22906580de..d77e0fcb40 100644 --- a/cpp/include/raft/core/span.hpp +++ 
b/cpp/include/raft/core/span.hpp @@ -280,7 +280,6 @@ auto as_writable_bytes(span s) noexcept return {reinterpret_cast(s.data()), s.size_bytes()}; } -/** - * @} - */ +/* @} */ + } // namespace raft diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp index a1432c9eb6..55da3037a9 100644 --- a/cpp/include/raft/core/sparse_types.hpp +++ b/cpp/include/raft/core/sparse_types.hpp @@ -22,6 +22,11 @@ namespace raft { +/** + * \defgroup sparse_types Sparse API vocabulary + * @{ + */ + enum SparsityType { OWNING, PRESERVING }; /** @@ -214,4 +219,7 @@ class sparse_matrix { container_policy_type cp_; container_type c_elements_; }; + +/* @} */ + } // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/core/temporary_device_buffer.hpp b/cpp/include/raft/core/temporary_device_buffer.hpp index fcb63f169c..358eeab861 100644 --- a/cpp/include/raft/core/temporary_device_buffer.hpp +++ b/cpp/include/raft/core/temporary_device_buffer.hpp @@ -27,7 +27,7 @@ namespace raft { /** - * \defgroup TemporaryDeviceBuffer `raft::temporary_device_buffer` and associated factories + * \defgroup temporary_device_buffer `raft::temporary_device_buffer` * @{ */ @@ -137,6 +137,13 @@ class temporary_device_buffer { int device_id_; }; +/**@}*/ + +/** + * \defgroup temporary_device_buffer_factories Temporary device buffer factories + * @{ + */ + /** * @brief Factory to create a `raft::temporary_device_buffer` * diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh index 9b079a8539..e121c1be9c 100644 --- a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -471,41 +471,18 @@ class GramMatrixBase { ASSERT(is_row_major_nopad || is_col_major_nopad, "Sparse linear Kernel distance does not support ld_out parameter"); - auto x1_structure = x1.structure_view(); - auto x2_structure = x2.structure_view(); - 
raft::sparse::distance::distances_config_t dist_config(handle); - - // switch a,b based on data layout + // switch a,b based on is_row_major if (is_col_major_nopad) { - dist_config.a_nrows = x2_structure.get_n_rows(); - dist_config.a_ncols = x2_structure.get_n_cols(); - dist_config.a_nnz = x2_structure.get_nnz(); - dist_config.a_indptr = const_cast(x2_structure.get_indptr().data()); - dist_config.a_indices = const_cast(x2_structure.get_indices().data()); - dist_config.a_data = const_cast(x2.get_elements().data()); - dist_config.b_nrows = x1_structure.get_n_rows(); - dist_config.b_ncols = x1_structure.get_n_cols(); - dist_config.b_nnz = x1_structure.get_nnz(); - dist_config.b_indptr = const_cast(x1_structure.get_indptr().data()); - dist_config.b_indices = const_cast(x1_structure.get_indices().data()); - dist_config.b_data = const_cast(x1.get_elements().data()); + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + raft::sparse::distance::pairwise_distance( + handle, x2, x1, out_row_major, raft::distance::DistanceType::InnerProduct, 0.0); } else { - dist_config.a_nrows = x1_structure.get_n_rows(); - dist_config.a_ncols = x1_structure.get_n_cols(); - dist_config.a_nnz = x1_structure.get_nnz(); - dist_config.a_indptr = const_cast(x1_structure.get_indptr().data()); - dist_config.a_indices = const_cast(x1_structure.get_indices().data()); - dist_config.a_data = const_cast(x1.get_elements().data()); - dist_config.b_nrows = x2_structure.get_n_rows(); - dist_config.b_ncols = x2_structure.get_n_cols(); - dist_config.b_nnz = x2_structure.get_nnz(); - dist_config.b_indptr = const_cast(x2_structure.get_indptr().data()); - dist_config.b_indices = const_cast(x2_structure.get_indices().data()); - dist_config.b_data = const_cast(x2.get_elements().data()); + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + raft::sparse::distance::pairwise_distance( + handle, x1, x2, 
out_row_major, raft::distance::DistanceType::InnerProduct, 0.0); } - - raft::sparse::distance::pairwiseDistance( - out.data_handle(), dist_config, raft::distance::DistanceType::InnerProduct, 0.0); } }; diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index 234265dbc1..f02e29c797 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -135,6 +135,17 @@ __global__ void rbf_kernel_expanded( } } +namespace { +std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) +{ + dim3 block_shape = dim3(32, 4); + const int num_blocks_x = raft::ceildiv(n1, 32); + const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); + dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); + return std::make_tuple(grid_shape, block_shape); +} +} // namespace + /** * Create a kernel matrix using polynomial kernel function. */ @@ -152,12 +163,11 @@ class PolynomialKernel : public GramMatrixBase { polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( inout, rows * cols, exponent, gain, offset); } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - polynomial_kernel<<>>(inout, ld, n1, n2, exponent, gain, offset); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + polynomial_kernel<<>>( + inout, ld, n1, n2, exponent, gain, offset); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -327,12 +337,10 @@ class TanhKernel : public GramMatrixBase { tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( inout, rows * cols, gain, offset); } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? 
rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -498,14 +506,13 @@ class RBFKernel : public GramMatrixBase { bool is_row_major, cudaStream_t stream) { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; - rbf_kernel_expanded<<>>(inout, ld, n1, n2, norm_n1, norm_n2, gain); + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + rbf_kernel_expanded<<>>( + inout, ld, n1, n2, norm_n1, norm_n2, gain); } public: @@ -576,7 +583,6 @@ class RBFKernel : public GramMatrixBase { math_t* norm_x2) { cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given rmm::device_uvector tmp_norm_x1(0, stream); rmm::device_uvector tmp_norm_x2(0, stream); diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index ccb3bd46bf..aeb862b06a 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -162,7 +162,7 @@ std::enable_if_t::value> cutlassDistanceKernel(const Da RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream)); // Launch initialized CUTLASS kernel - RAFT_CUTLASS_TRY(cutlassDist_op()); + RAFT_CUTLASS_TRY(cutlassDist_op(stream)); } }; // namespace detail diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index c426250e18..9dad96356b 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ 
-121,7 +121,7 @@ void norm(raft::resources const& handle, { RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); - auto constexpr row_major = std::is_same_v; + auto constexpr row_major = std::is_same_v; auto along_rows = apply == Apply::ALONG_ROWS; if (along_rows) { diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 7bd30e5bc6..59fcf606c8 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -135,16 +136,6 @@ void gatherImpl(const InputIteratorT in, // stencil value type typedef typename std::iterator_traits::value_type StencilValueT; - // return type of MapTransformOp, must be convertible to IndexT - typedef typename std::result_of::type MapTransformOpReturnT; - static_assert((std::is_convertible::value), - "MapTransformOp's result type must be convertible to signed integer"); - - // return type of UnaryPredicateOp, must be convertible to bool - typedef typename std::result_of::type PredicateOpReturnT; - static_assert((std::is_convertible::value), - "UnaryPredicateOp's result type must be convertible to bool type"); - IndexT len = map_length * D; constexpr int TPB = 128; const int n_sm = raft::getMultiProcessorCount(); @@ -343,6 +334,7 @@ void gather_if(const InputIteratorT in, typedef typename std::iterator_traits::value_type MapValueT; gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream); } + } // namespace detail } // namespace matrix } // namespace raft diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh new file mode 100644 index 0000000000..cc510e068b --- /dev/null +++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { +namespace detail { + +template +void gatherInplaceImpl(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + MapTransformOp transform_op, + IndexT batch_size) +{ + IndexT m = inout.extent(0); + IndexT n = inout.extent(1); + IndexT map_length = map.extent(0); + + // skip in case of 0 length input + if (map_length <= 0 || m <= 0 || n <= 0 || batch_size < 0) return; + + RAFT_EXPECTS(map_length <= m, "Length of map should be <= number of rows for inplace gather"); + + RAFT_EXPECTS(batch_size >= 0, "batch size should be >= 0"); + + // re-assign batch_size for default case + if (batch_size == 0 || batch_size > n) batch_size = n; + + auto exec_policy = resource::get_thrust_policy(handle); + + IndexT n_batches = raft::ceildiv(n, batch_size); + + auto scratch_space = raft::make_device_vector(handle, map_length * batch_size); + + for (IndexT bid = 0; bid < n_batches; bid++) { + IndexT batch_offset = bid * batch_size; + IndexT cols_per_batch = min(batch_size, n - batch_offset); + + auto gather_op = [inout = inout.data_handle(), + map = map.data_handle(), + transform_op, + batch_offset, + map_length, + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + n] __device__(auto idx) { + IndexT row = idx / cols_per_batch; + IndexT col = idx % cols_per_batch; + MapT 
map_val = map[row]; + + IndexT i_src = transform_op(map_val); + return inout[i_src * n + batch_offset + col]; + }; + raft::linalg::map_offset( + handle, + raft::make_device_vector_view(scratch_space.data_handle(), map_length * cols_per_batch), + gather_op); + + auto copy_op = [inout = inout.data_handle(), + map = map.data_handle(), + scratch_space = scratch_space.data_handle(), + batch_offset, + map_length, + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + n] __device__(auto idx) { + IndexT row = idx / cols_per_batch; + IndexT col = idx % cols_per_batch; + inout[row * n + batch_offset + col] = scratch_space[idx]; + return; + }; + auto counting = thrust::make_counting_iterator(0); + thrust::for_each(exec_policy, counting, counting + map_length * cols_per_batch, copy_op); + } +} + +template +void gather(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + MapTransformOp transform_op, + IndexT batch_size) +{ + gatherInplaceImpl(handle, inout, map, transform_op, batch_size); +} + +template +void gather(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + IndexT batch_size) +{ + gatherInplaceImpl(handle, inout, map, raft::identity_op(), batch_size); +} + +} // namespace detail +} // namespace matrix +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 6b6c00c391..48821df5b2 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -170,14 +170,14 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) */ template __global__ void slice( - const m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) + const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { idx_t 
i = idx % dm, j = idx / dm; idx_t is = i + x1, js = j + y1; - dst_d[idx] = src_d[is + js * m]; + dst_d[idx] = src_d[is + js * lda]; } } @@ -190,12 +190,16 @@ void sliceMatrix(const m_t* in, idx_t y1, idx_t x2, idx_t y2, + bool row_major, cudaStream_t stream) { - // Slicing + auto lda = row_major ? n_cols : n_rows; dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); - slice<<>>(in, n_rows, n_cols, out, x1, y1, x2, y2); + if (row_major) + slice<<>>(in, lda, out, y1, x1, y2, x2); + else + slice<<>>(in, lda, out, x1, y1, x2, y2); } /** @@ -230,52 +234,53 @@ void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, c /** * @brief Copy a vector to the diagonal of a matrix * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param m: number of rows of the matrix - * @param n: number of columns of the matrix + * @param matrix: matrix of size n_rows x n_cols (leading dimension = lda) + * @param lda: leading dimension of the matrix * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { matrix[idx + idx * m] = vec[idx]; } + if (idx < k) { matrix[idx + idx * lda] = vec[idx]; } } /** * @brief Copy matrix diagonal to vector * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param m: number of rows of the matrix - * @param n: number of columns of the matrix + * @param matrix: matrix of size n_rows x n_cols (leading dimension = lda) + * @param lda: leading dimension of the matrix * @param k: dimensionality */ template -__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t m, idx_t n, idx_t k) +__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const 
m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { vec[idx] = matrix[idx + idx * m]; } + if (idx < k) { vec[idx] = matrix[idx + idx * lda]; } } template void initializeDiagonalMatrix( - const m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) + const m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, bool row_major, cudaStream_t stream) { - idx_t k = std::min(n_rows, n_cols); + idx_t k = std::min(n_rows, n_cols); + idx_t lda = row_major ? n_cols : n_rows; dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, lda, k); } template -void getDiagonalMatrix(m_t* vec, const m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void getDiagonalMatrix( + m_t* vec, const m_t* matrix, idx_t n_rows, idx_t n_cols, bool row_major, cudaStream_t stream) { - idx_t k = std::min(n_rows, n_cols); + idx_t k = std::min(n_rows, n_cols); + idx_t lda = row_major ? n_cols : n_rows; dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorFromMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); + copyVectorFromMatrixDiagonal<<>>(vec, matrix, lda, k); } /** diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh new file mode 100644 index 0000000000..3a57c5478b --- /dev/null +++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { +namespace detail { + +/** + * @brief In-place scatter elements in a row-major matrix according to a + * map. The length of the map is equal to the number of rows. The + * map specifies the destination index for each row, i.e. in the + * resulting matrix, row map[i] is assigned to row i. For example, + * the matrix [[1, 2, 3], [4, 5, 6], [7, 8, 9]] with the map [2, 0, 1] will + * be transformed to [[4, 5, 6], [7, 8, 9], [1, 2, 3]]. Batching is done on + * columns and an additional scratch space of shape n_rows * cols_batch_size + * is created. For each batch, chunks of columns from each row are copied + * into the appropriate location in the scratch space and copied back to + * the corresponding locations in the input matrix. 
+ * + * @tparam InputIteratorT + * @tparam MapIteratorT + * @tparam IndexT + * + * @param[inout] handle raft handle + * @param[inout] inout input matrix (n_rows * n_cols) + * @param[inout] map map containing the destination index for each row (n_rows) + * @param[inout] batch_size column batch size + */ + +template +void scatterInplaceImpl( + raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + IndexT batch_size) +{ + IndexT m = inout.extent(0); + IndexT n = inout.extent(1); + IndexT map_length = map.extent(0); + + // skip in case of 0 length input + if (map_length <= 0 || m <= 0 || n <= 0 || batch_size < 0) return; + + RAFT_EXPECTS(map_length == m, + "Length of map should be equal to number of rows for inplace scatter"); + + RAFT_EXPECTS(batch_size >= 0, "batch size should be >= 0"); + + // re-assign batch_size for default case + if (batch_size == 0 || batch_size > n) batch_size = n; + + auto exec_policy = resource::get_thrust_policy(handle); + + IndexT n_batches = raft::ceildiv(n, batch_size); + + auto scratch_space = raft::make_device_vector(handle, m * batch_size); + + for (IndexT bid = 0; bid < n_batches; bid++) { + IndexT batch_offset = bid * batch_size; + IndexT cols_per_batch = min(batch_size, n - batch_offset); + + auto copy_op = [inout = inout.data_handle(), + map = map.data_handle(), + batch_offset, + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + n] __device__(auto idx) { + IndexT row = idx / cols_per_batch; + IndexT col = idx % cols_per_batch; + return inout[row * n + batch_offset + col]; + }; + raft::linalg::map_offset( + handle, + raft::make_device_vector_view(scratch_space.data_handle(), m * cols_per_batch), + copy_op); + + auto scatter_op = [inout = inout.data_handle(), + map = map.data_handle(), + scratch_space = scratch_space.data_handle(), + batch_offset, + cols_per_batch = raft::util::FastIntDiv(cols_per_batch), + n] __device__(auto idx) { + IndexT row = idx / cols_per_batch; + IndexT col 
= idx % cols_per_batch; + IndexT map_val = map[row]; + + inout[map_val * n + batch_offset + col] = scratch_space[idx]; + return; + }; + auto counting = thrust::make_counting_iterator(0); + thrust::for_each(exec_policy, counting, counting + m * cols_per_batch, scatter_op); + } +} + +template +void scatter(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + IndexT batch_size) +{ + scatterInplaceImpl(handle, inout, map, batch_size); +} + +} // end namespace detail +} // end namespace matrix +} // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh index e05c8882fe..f934d7e3b4 100644 --- a/cpp/include/raft/matrix/detail/select_k-ext.cuh +++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh @@ -18,6 +18,7 @@ #include // uint32_t #include // __half +#include #include // RAFT_EXPLICIT #include // rmm:cuda_stream_view #include // rmm::mr::device_memory_resource @@ -27,7 +28,8 @@ namespace raft::matrix::detail { template -void select_k(const T* in_val, +void select_k(raft::resources const& handle, + const T* in_val, const IdxT* in_idx, size_t batch_size, size_t len, @@ -35,24 +37,24 @@ void select_k(const T* in_val, T* out_val, IdxT* out_idx, bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + rmm::mr::device_memory_resource* mr = nullptr, + bool sorted = false) RAFT_EXPLICIT; } // namespace raft::matrix::detail #endif // RAFT_EXPLICIT_INSTANTIATE_ONLY -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - extern template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) - +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + extern template 
void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(__half, uint32_t); instantiate_raft_matrix_detail_select_k(__half, int64_t); instantiate_raft_matrix_detail_select_k(float, int64_t); diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index dba2d1d841..af5a5770fb 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -19,11 +19,16 @@ #include "select_radix.cuh" #include "select_warpsort.cuh" +#include +#include #include +#include +#include #include #include #include +#include namespace raft::matrix::detail { @@ -116,6 +121,121 @@ inline Algo choose_select_k_algorithm(size_t rows, size_t cols, int k) } } +/** + * Performs a segmented sorting of a keys array with respect to + * the segments of a values array. 
+ * @tparam KeyT + * @tparam ValT + * @param handle + * @param values + * @param keys + * @param n_segments + * @param k + * @param select_min + */ +template +void segmented_sort_by_key(raft::resources const& handle, + KeyT* keys, + ValT* values, + size_t n_segments, + size_t n_elements, + const ValT* offsets, + bool asc) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto out_inds = raft::make_device_vector(handle, n_elements); + auto out_dists = raft::make_device_vector(handle, n_elements); + + // Determine temporary device storage requirements + auto d_temp_storage = raft::make_device_vector(handle, 0); + size_t temp_storage_bytes = 0; + if (asc) { + cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(), + temp_storage_bytes, + keys, + out_dists.data_handle(), + values, + out_inds.data_handle(), + n_elements, + n_segments, + offsets, + offsets + 1, + 0, + sizeof(ValT) * 8, + stream); + } else { + cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)d_temp_storage.data_handle(), + temp_storage_bytes, + keys, + out_dists.data_handle(), + values, + out_inds.data_handle(), + n_elements, + n_segments, + offsets, + offsets + 1, + 0, + sizeof(ValT) * 8, + stream); + } + + d_temp_storage = raft::make_device_vector(handle, temp_storage_bytes); + + if (asc) { + // Run sorting operation + cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(), + temp_storage_bytes, + keys, + out_dists.data_handle(), + values, + out_inds.data_handle(), + n_elements, + n_segments, + offsets, + offsets + 1, + 0, + sizeof(ValT) * 8, + stream); + + } else { + // Run sorting operation + cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)d_temp_storage.data_handle(), + temp_storage_bytes, + keys, + out_dists.data_handle(), + values, + out_inds.data_handle(), + n_elements, + n_segments, + offsets, + offsets + 1, + 0, + sizeof(ValT) * 8, + stream); + } + + raft::copy(values, out_inds.data_handle(), out_inds.size(), stream); + 
raft::copy(keys, out_dists.data_handle(), out_dists.size(), stream); +} + +template +void segmented_sort_by_key(raft::resources const& handle, + raft::device_vector_view offsets, + raft::device_vector_view keys, + raft::device_vector_view values, + bool asc) +{ + RAFT_EXPECTS(keys.size() == values.size(), + "Keys and values must contain the same number of elements."); + segmented_sort_by_key(handle, + keys.data_handle(), + values.data_handle(), + offsets.size() - 1, + keys.size(), + offsets.data_handle(), + asc); +} + /** * Select k smallest or largest key/values from each row in the input data. * @@ -154,7 +274,8 @@ inline Algo choose_select_k_algorithm(size_t rows, size_t cols, int k) * memory pool here to avoid memory allocations within the call). */ template -void select_k(const T* in_val, +void select_k(raft::resources const& handle, + const T* in_val, const IdxT* in_idx, size_t batch_size, size_t len, @@ -162,32 +283,55 @@ void select_k(const T* in_val, T* out_val, IdxT* out_idx, bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::mr::device_memory_resource* mr = nullptr, + bool sorted = false) { common::nvtx::range fun_scope( "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); - auto algo = choose_select_k_algorithm(batch_size, len, k); + auto stream = raft::resource::get_cuda_stream(handle); + auto algo = choose_select_k_algorithm(batch_size, len, k); + switch (algo) { case Algo::kRadix11bits: - return detail::select::radix::select_k(in_val, - in_idx, - batch_size, - len, - k, - out_val, - out_idx, - select_min, - true, // fused_last_filter - stream); + detail::select::radix::select_k(in_val, + in_idx, + batch_size, + len, + k, + out_val, + out_idx, + select_min, + true, // fused_last_filter + stream, + mr); + + if (sorted) { + auto offsets = raft::make_device_vector(handle, (IdxT)(batch_size + 1)); + + raft::matrix::fill(handle, offsets.view(), (IdxT)k); + + 
thrust::exclusive_scan(raft::resource::get_thrust_policy(handle), + offsets.data_handle(), + offsets.data_handle() + offsets.size(), + offsets.data_handle(), + 0); + + auto keys = raft::make_device_vector_view(out_val, (IdxT)(batch_size * k)); + auto vals = raft::make_device_vector_view(out_idx, (IdxT)(batch_size * k)); + + segmented_sort_by_key( + handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min); + } + return; case Algo::kWarpDistributedShm: return detail::select::warpsort:: select_k_impl( - in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream); + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); case Algo::kFaissBlockSelect: return neighbors::detail::select_k( in_val, in_idx, batch_size, len, out_val, out_idx, select_min, k, stream); + default: RAFT_FAIL("K-selection Algorithm not supported."); } } } // namespace raft::matrix::detail diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh index c7a3681983..5cd2cd5c26 100644 --- a/cpp/include/raft/matrix/diagonal.cuh +++ b/cpp/include/raft/matrix/diagonal.cuh @@ -19,6 +19,8 @@ #include #include #include +#include +#include namespace raft::matrix { @@ -40,11 +42,13 @@ void set_diagonal(raft::resources const& handle, { RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)), "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)"); + constexpr auto is_row_major = std::is_same_v; detail::initializeDiagonalMatrix(vec.data_handle(), matrix.data_handle(), matrix.extent(0), matrix.extent(1), + is_row_major, resource::get_cuda_stream(handle)); } @@ -61,10 +65,12 @@ void get_diagonal(raft::resources const& handle, { RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)), "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)"); + constexpr auto is_row_major = std::is_same_v; detail::getDiagonalMatrix(vec.data_handle(), matrix.data_handle(), matrix.extent(0), 
matrix.extent(1), + is_row_major, resource::get_cuda_stream(handle)); } @@ -83,6 +89,26 @@ void invert_diagonal(raft::resources const& handle, inout.data_handle(), inout.extent(0), resource::get_cuda_stream(handle)); } +/** + * @brief create an identity matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam idx_t indexing type used for the output + * @tparam layout_t layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[out] out: output matrix + */ +template +void eye(const raft::resources& handle, raft::device_matrix_view out) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + auto diag = raft::make_device_vector(handle, min(out.extent(0), out.extent(1))); + RAFT_CUDA_TRY(cudaMemsetAsync( + out.data_handle(), 0, out.size() * sizeof(math_t), resource::get_cuda_stream(handle))); + raft::matrix::fill(handle, diag.view(), math_t(1)); + set_diagonal(handle, raft::make_const_mdspan(diag.view()), out); +} + /** @} */ // end of group matrix_diagonal } // namespace raft::matrix diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh index 89950c2e14..2fbbcfa2bb 100644 --- a/cpp/include/raft/matrix/gather.cuh +++ b/cpp/include/raft/matrix/gather.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include namespace raft::matrix { @@ -289,6 +290,46 @@ void gather_if(const raft::resources& handle, resource::get_cuda_stream(handle)); } +/** + * @brief In-place gather elements in a row-major matrix according to a + * map. The map specifies the new order in which rows of the input matrix are + * rearranged, i.e. for each output row i, read the index in the input matrix + * from map[i], apply a transformation to this input index if specified, and copy the row. + * For example, the matrix [[1, 2, 3], [4, 5, 6], [7, 8, 9]] with the + * map [2, 0, 1] will be transformed to [[7, 8, 9], [1, 2, 3], [4, 5, 6]].
+ * Batching is done on columns and an additional scratch space of + * shape n_rows * cols_batch_size is created. For each batch, chunks + * of columns from each row are copied into the appropriate location + * in the scratch space and copied back to the corresponding locations + * in the input matrix. + * + * @tparam matrix_t Matrix element type + * @tparam map_t Integer type of map elements + * @tparam map_xform_t Unary lambda expression or operator type. map_xform_t's result type must + * be convertible to idx_t. + * @tparam idx_t Integer type used for indexing + * + * @param[in] handle raft handle + * @param[inout] inout input matrix (n_rows * n_cols) + * @param[in] map Pointer to the input sequence of gather locations + * @param[in] col_batch_size (optional) column batch size. Determines the shape of the scratch space + * (map_length, col_batch_size). When set to zero (default), no batching is done and an additional + * scratch space of shape (map_length, n_cols) is created. + * @param[in] transform_op (optional) Transformation to apply to map values + */ +template +void gather(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + idx_t col_batch_size = 0, + map_xform_t transform_op = raft::identity_op()) +{ + detail::gather(handle, inout, map, transform_op, col_batch_size); +} + /** @} */ // end of group matrix_gather } // namespace raft::matrix diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index bc553011c0..63c33ff034 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -203,7 +203,7 @@ void sliceMatrix(m_t* in, idx_t y2, cudaStream_t stream) { - detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); + detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, false, stream); } /** @@ -221,9 +221,9 @@ void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStr } /** - * @brief Initialize a
diagonal matrix with a vector + * @brief Initialize a diagonal col-major matrix with a vector * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols + * @param matrix: matrix of size n_rows x n_cols (col-major) * @param n_rows: number of rows of the matrix * @param n_cols: number of columns of the matrix * @param stream: cuda stream @@ -232,7 +232,7 @@ template void initializeDiagonalMatrix( m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); + detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, false, stream); } /** diff --git a/cpp/include/raft/matrix/scatter.cuh b/cpp/include/raft/matrix/scatter.cuh new file mode 100644 index 0000000000..cd2d76a863 --- /dev/null +++ b/cpp/include/raft/matrix/scatter.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { +/** + * @brief In-place scatter elements in a row-major matrix according to a + * map. The map specifies the new order in which rows of the input matrix are + * rearranged, i.e. read the destination index from the map, and copy the row. For example, + * the matrix [[1, 2, 3], [4, 5, 6], [7, 8, 9]] with the map [2, 0, 1] will + * be transformed to [[4, 5, 6], [7, 8, 9], [1, 2, 3]]. 
Batching is done on + * columns and an additional scratch space of shape n_rows * cols_batch_size + * is created. For each batch, chunks of columns from each row are copied + * into the appropriate location in the scratch space and copied back to + * the corresponding locations in the input matrix. + * Note: in-place scatter is not thread safe if the values in the map are not unique. + * Users must ensure that the map indices are unique and in the range [0, n_rows). + * + * @tparam matrix_t Matrix element type + * @tparam idx_t Integer type used for indexing + * + * @param[in] handle raft handle + * @param[inout] inout input matrix (n_rows * n_cols) + * @param[in] map Pointer to the input sequence of scatter locations. The length of the map should + * be equal to the number of rows in the input matrix. Map indices should be unique and in the range + * [0, n_rows). The map represents a complete permutation of indices. + * @param[in] col_batch_size (optional) column batch size. Determines the shape of the scratch space + * (n_rows, col_batch_size). When set to zero (default), no batching is done and an additional + * scratch space of shape (n_rows, n_cols) is created. + */ +template +void scatter(raft::resources const& handle, + raft::device_matrix_view inout, + raft::device_vector_view map, + idx_t col_batch_size = 0) +{ + detail::scatter(handle, inout, map, col_batch_size); +} + +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh index 8e6dbaafa8..37a36cbf6b 100644 --- a/cpp/include/raft/matrix/select_k.cuh +++ b/cpp/include/raft/matrix/select_k.cuh @@ -58,7 +58,7 @@ namespace raft::matrix { * @tparam IdxT * the index type (what is being selected together with the keys). * - * @param[in] handle + * @param[in] handle container of reusable resources * @param[in] in_val * inputs values [batch_size, len]; * these are compared and selected. 
@@ -74,14 +74,17 @@ namespace raft::matrix { * the payload selected together with `out_val`. * @param[in] select_min * whether to select k smallest (true) or largest (false) keys. + * @param[in] sorted + * whether to make sure selected pairs are sorted by value */ template -void select_k(const resources& handle, +void select_k(raft::resources const& handle, raft::device_matrix_view in_val, std::optional> in_idx, raft::device_matrix_view out_val, raft::device_matrix_view out_idx, - bool select_min) + bool select_min, + bool sorted = false) { RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits::max()), "output k must fit the int type."); @@ -95,7 +98,9 @@ void select_k(const resources& handle, RAFT_EXPECTS(len == in_idx->extent(1), "value and index input lengths must be equal"); } RAFT_EXPECTS(int64_t(k) == out_idx.extent(1), "value and index output lengths must be equal"); - return detail::select_k(in_val.data_handle(), + + return detail::select_k(handle, + in_val.data_handle(), in_idx.has_value() ? 
in_idx->data_handle() : nullptr, batch_size, len, @@ -103,7 +108,8 @@ void select_k(const resources& handle, out_val.data_handle(), out_idx.data_handle(), select_min, - resource::get_cuda_stream(handle)); + nullptr, + sorted); } /** @} */ // end of group select_k diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index b739f1c732..e81c186960 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ b/cpp/include/raft/matrix/slice.cuh @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft::matrix { @@ -45,17 +46,18 @@ struct slice_coordinates { * @tparam m_t type of matrix elements * @tparam idx_t integer type used for indexing * @param[in] handle: raft handle - * @param[in] in: input matrix (column-major) - * @param[out] out: output matrix (column-major) + * @param[in] in: input matrix + * @param[out] out: output matrix * @param[in] coords: coordinates of the wanted slice * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice(handle, in, out, {0, 1, 4, 3}); */ -template +template void slice(raft::resources const& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, + raft::device_matrix_view in, + raft::device_matrix_view out, slice_coordinates coords) { + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Matrix layout must be row- or column-major"); RAFT_EXPECTS(coords.row2 > coords.row1, "row2 must be > row1"); RAFT_EXPECTS(coords.col2 > coords.col1, "col2 must be > col1"); RAFT_EXPECTS(coords.row1 >= 0, "row1 must be >= 0"); @@ -72,6 +74,7 @@ void slice(raft::resources const& handle, coords.col1, coords.row2, coords.col2, + raft::is_row_major(in), resource::get_cuda_stream(handle)); } diff --git a/cpp/include/raft/neighbors/brute_force-inl.cuh b/cpp/include/raft/neighbors/brute_force-inl.cuh index b4de76037a..bc9e09e5b0 100644 --- a/cpp/include/raft/neighbors/brute_force-inl.cuh +++ b/cpp/include/raft/neighbors/brute_force-inl.cuh @@ -90,10 +90,14 @@ inline void knn_merge_parts( 
RAFT_EXPECTS(in_keys.extent(1) == in_values.extent(1) && in_keys.extent(0) == in_values.extent(0), "in_keys and in_values must have the same shape."); RAFT_EXPECTS( - out_keys.extent(0) == out_values.extent(0) == n_samples, + out_keys.extent(0) == out_values.extent(0) && out_keys.extent(0) == n_samples, "Number of rows in output keys and val matrices must equal number of rows in search matrix."); - RAFT_EXPECTS(out_keys.extent(1) == out_values.extent(1) == in_keys.extent(1), - "Number of columns in output indices and distances matrices must be equal to k"); + RAFT_EXPECTS( + out_keys.extent(1) == out_values.extent(1) && out_keys.extent(1) == in_keys.extent(1), + "Number of columns in output indices and distances matrices must be equal to k"); + + idx_t* translations_ptr = nullptr; + if (translations.has_value()) { translations_ptr = translations.value().data_handle(); } auto n_parts = in_keys.extent(0) / n_samples; detail::knn_merge_parts(in_keys.data_handle(), @@ -104,7 +108,7 @@ inline void knn_merge_parts( n_parts, in_keys.extent(1), resource::get_cuda_stream(handle), - translations.value_or(nullptr)); + translations_ptr); } /** diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index 9905f2abae..6bb7beca55 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -27,7 +27,7 @@ #include #include -namespace raft::neighbors::experimental::cagra { +namespace raft::neighbors::cagra { /** * @defgroup cagra CUDA ANN Graph-based nearest neighbor search @@ -57,14 +57,15 @@ namespace raft::neighbors::experimental::cagra { * auto knn_graph = raft::make_host_matrix(dataset.extent(0), 128); * // create knn graph * cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params); - * auto pruned_gaph = raft::make_host_matrix(dataset.extent(0), 64); - * cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view()); - * // Construct an index from dataset and pruned knn_graph 
- * auto index = cagra::index(res, build_params.metric(), dataset, pruned_graph.view()); + * auto optimized_graph = raft::make_host_matrix(dataset.extent(0), 64); + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); * @endcode * - * @tparam T data element type - * @tparam IdxT type of the indices in the source dataset + * @tparam DataT data element type + * @tparam IdxT type of the dataset vector indices * * @param[in] res raft resources * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] @@ -75,31 +76,31 @@ namespace raft::neighbors::experimental::cagra { */ template void build_knn_graph(raft::resources const& res, - mdspan, row_major, accessor> dataset, - raft::host_matrix_view knn_graph, + mdspan, row_major, accessor> dataset, + raft::host_matrix_view knn_graph, std::optional refine_rate = std::nullopt, std::optional build_params = std::nullopt, std::optional search_params = std::nullopt) { using internal_IdxT = typename std::make_unsigned::type; - auto knn_graph_internal = make_host_matrix_view( + auto knn_graph_internal = make_host_matrix_view( reinterpret_cast(knn_graph.data_handle()), knn_graph.extent(0), knn_graph.extent(1)); - auto dataset_internal = mdspan, row_major, accessor>( + auto dataset_internal = mdspan, row_major, accessor>( dataset.data_handle(), dataset.extent(0), dataset.extent(1)); - detail::build_knn_graph( + cagra::detail::build_knn_graph( res, dataset_internal, knn_graph_internal, refine_rate, build_params, search_params); } /** * @brief Sort a KNN graph index. - * Preprocessing step for `cagra::prune`: If a KNN graph is not built using + * Preprocessing step for `cagra::optimize`: If a KNN graph is not built using * `cagra::build_knn_graph`, then it is necessary to call this function before calling - * `cagra::prune`.
If the graph is built by `cagra::build_knn_graph`, it is already sorted and you - * do not need to call this function. + * `cagra::optimize`. If the graph is built by `cagra::build_knn_graph`, it is already sorted and + * you do not need to call this function. * * Usage example: * @code{.cpp} @@ -110,14 +111,15 @@ void build_knn_graph(raft::resources const& res, * // build(knn_graph, dataset, ...); * // sort graph index * sort_knn_graph(res, dataset.view(), knn_graph.view()); - * // prune graph - * cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view()); - * // Construct an index from dataset and pruned knn_graph - * auto index = cagra::index(res, build_params.metric(), dataset, pruned_graph.view()); + * // optimize graph + * cagra::optimize(res, dataset, knn_graph.view(), optimized_graph.view()); + * // Construct an index from dataset and optimized knn_graph + * auto index = cagra::index(res, build_params.metric(), dataset, + * optimized_graph.view()); * @endcode * * @tparam DataT type of the data in the source dataset - * @tparam IdxT type of the indices in the source dataset + * @tparam IdxT type of the dataset vector indices * * @param[in] res raft resources * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim] @@ -131,23 +133,23 @@ template , memory_type::host>> void sort_knn_graph(raft::resources const& res, - mdspan, row_major, d_accessor> dataset, - mdspan, row_major, g_accessor> knn_graph) + mdspan, row_major, d_accessor> dataset, + mdspan, row_major, g_accessor> knn_graph) { using internal_IdxT = typename std::make_unsigned::type; using g_accessor_internal = host_device_accessor, g_accessor::mem_type>; auto knn_graph_internal = - mdspan, row_major, g_accessor_internal>( + mdspan, row_major, g_accessor_internal>( reinterpret_cast(knn_graph.data_handle()), knn_graph.extent(0), knn_graph.extent(1)); - auto dataset_internal = mdspan, row_major, d_accessor>( + auto dataset_internal = mdspan, row_major, d_accessor>( 
dataset.data_handle(), dataset.extent(0), dataset.extent(1)); - detail::graph::sort_knn_graph(res, dataset_internal, knn_graph_internal); + cagra::detail::graph::sort_knn_graph(res, dataset_internal, knn_graph_internal); } /** @@ -162,18 +164,18 @@ void sort_knn_graph(raft::resources const& res, * @param[in] res raft resources * @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows, * knn_graph_degree] - * @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree] + * @param[out] new_graph a host matrix view of the optimized knn graph [n_rows, graph_degree] */ template , memory_type::host>> -void prune(raft::resources const& res, - mdspan, row_major, g_accessor> knn_graph, - raft::host_matrix_view new_graph) +void optimize(raft::resources const& res, + mdspan, row_major, g_accessor> knn_graph, + raft::host_matrix_view new_graph) { using internal_IdxT = typename std::make_unsigned::type; - auto new_graph_internal = raft::make_host_matrix_view( + auto new_graph_internal = raft::make_host_matrix_view( reinterpret_cast(new_graph.data_handle()), new_graph.extent(0), new_graph.extent(1)); @@ -181,26 +183,26 @@ void prune(raft::resources const& res, using g_accessor_internal = host_device_accessor, memory_type::host>; auto knn_graph_internal = - mdspan, row_major, g_accessor_internal>( + mdspan, row_major, g_accessor_internal>( reinterpret_cast(knn_graph.data_handle()), knn_graph.extent(0), knn_graph.extent(1)); - detail::graph::prune(res, knn_graph_internal, new_graph_internal); + cagra::detail::graph::optimize(res, knn_graph_internal, new_graph_internal); } /** * @brief Build the index from the dataset for efficient search. * - * The build consist of two steps: build an intermediate knn-graph, and prune it to + * The build consists of two steps: build an intermediate knn-graph, and optimize it to * create the final graph. The index_params struct controls the node degree of these * graphs.
* - * It is required that dataset and the pruned graph fit the GPU memory. + * It is required that dataset and the optimized graph fit the GPU memory. * * To customize the parameters for knn-graph building and pruning, and to reuse the * intermediate results, you could build the index in two steps using - * [cagra::build_knn_graph](#cagra::build_knn_graph) and [cagra::prune](#cagra::prune). + * [cagra::build_knn_graph](#cagra::build_knn_graph) and [cagra::optimize](#cagra::optimize). * * The following distance metrics are supported: * - L2 @@ -235,28 +237,35 @@ template , memory_type::host>> index build(raft::resources const& res, const index_params& params, - mdspan, row_major, Accessor> dataset) + mdspan, row_major, Accessor> dataset) { - size_t degree = params.intermediate_graph_degree; - if (degree >= static_cast(dataset.extent(0))) { + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + if (intermediate_degree >= static_cast(dataset.extent(0))) { RAFT_LOG_WARN( "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu", dataset.extent(0)); - degree = dataset.extent(0) - 1; + intermediate_degree = dataset.extent(0) - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; } - RAFT_EXPECTS(degree >= params.graph_degree, - "Intermediate graph degree cannot be smaller than final graph degree"); - auto knn_graph = raft::make_host_matrix(dataset.extent(0), degree); + auto knn_graph = raft::make_host_matrix(dataset.extent(0), intermediate_degree); build_knn_graph(res, dataset, knn_graph.view()); - auto cagra_graph = raft::make_host_matrix(dataset.extent(0), params.graph_degree); + auto cagra_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); - prune(res, knn_graph.view(), 
cagra_graph.view()); + optimize(res, knn_graph.view(), cagra_graph.view()); - // Construct an index from dataset and pruned knn graph. - return index(res, params.metric, dataset, cagra_graph.view()); + // Construct an index from dataset and optimized knn graph. + return index(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view())); } /** @@ -280,9 +289,9 @@ template void search(raft::resources const& res, const search_params& params, const index& idx, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances) + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { RAFT_EXPECTS( queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), @@ -290,23 +299,31 @@ void search(raft::resources const& res, RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), "Number of columns in output neighbors and distances matrices must equal k"); - RAFT_EXPECTS(queries.extent(1) == idx.dim(), "Number of query dimensions should equal number of dimensions in the index."); using internal_IdxT = typename std::make_unsigned::type; - auto queries_internal = raft::make_device_matrix_view( + auto queries_internal = raft::make_device_matrix_view( queries.data_handle(), queries.extent(0), queries.extent(1)); - auto neighbors_internal = raft::make_device_matrix_view( + auto neighbors_internal = raft::make_device_matrix_view( reinterpret_cast(neighbors.data_handle()), neighbors.extent(0), neighbors.extent(1)); - auto distances_internal = raft::make_device_matrix_view( + auto distances_internal = raft::make_device_matrix_view( distances.data_handle(), distances.extent(0), distances.extent(1)); - detail::search_main( + cagra::detail::search_main( res, params, idx, queries_internal, neighbors_internal, distances_internal); } /** @} */ // end group cagra +} // namespace raft::neighbors::cagra + +// TODO: Remove deprecated experimental namespace 
in 23.12 release +namespace raft::neighbors::experimental::cagra { +using raft::neighbors::cagra::build; +using raft::neighbors::cagra::build_knn_graph; +using raft::neighbors::cagra::optimize; +using raft::neighbors::cagra::search; +using raft::neighbors::cagra::sort_knn_graph; } // namespace raft::neighbors::experimental::cagra diff --git a/cpp/include/raft/neighbors/cagra_serialize.cuh b/cpp/include/raft/neighbors/cagra_serialize.cuh index 8d1771a301..2242629409 100644 --- a/cpp/include/raft/neighbors/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/cagra_serialize.cuh @@ -18,7 +18,7 @@ #include "detail/cagra/cagra_serialize.cuh" -namespace raft::neighbors::experimental::cagra { +namespace raft::neighbors::cagra { /** * \defgroup cagra_serialize CAGRA Serialize @@ -110,7 +110,7 @@ void serialize(raft::resources const& handle, * @param[in] handle the raft handle * @param[in] is input stream * - * @return raft::neighbors::cagra::index + * @return raft::neighbors::experimental::cagra::index */ template index deserialize(raft::resources const& handle, std::istream& is) @@ -141,7 +141,7 @@ index deserialize(raft::resources const& handle, std::istream& is) * @param[in] handle the raft handle * @param[in] filename the name of the file that stores the index * - * @return raft::neighbors::cagra::index + * @return raft::neighbors::experimental::cagra::index */ template index deserialize(raft::resources const& handle, const std::string& filename) @@ -151,4 +151,11 @@ index deserialize(raft::resources const& handle, const std::string& fil /**@}*/ -} // namespace raft::neighbors::experimental::cagra +} // namespace raft::neighbors::cagra + +// TODO: Remove deprecated experimental namespace in 23.12 release +namespace raft::neighbors::experimental::cagra { +using raft::neighbors::cagra::deserialize; +using raft::neighbors::cagra::serialize; + +} // namespace raft::neighbors::experimental::cagra \ No newline at end of file diff --git 
a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 87405ae9fb..01d6a92235 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -33,7 +33,8 @@ #include #include -namespace raft::neighbors::experimental::cagra { +#include +namespace raft::neighbors::cagra { /** * @ingroup cagra * @{ @@ -54,8 +55,8 @@ enum class search_algo { enum class hash_mode { HASH, SMALL, AUTO }; struct search_params : ann::search_params { - /** Maximum number of queries to search at the same time (batch size). */ - size_t max_queries = 1; + /** Maximum number of queries to search at the same time (batch size). Auto select when 0.*/ + size_t max_queries = 0; /** Number of intermediate search results retained during the search. * @@ -78,12 +79,10 @@ struct search_params : ann::search_params { /*/ Number of graph nodes to select as the starting point for the search in each iteration. aka * search width?*/ - size_t num_parents = 1; + size_t search_width = 1; /** Lower limit of search iterations. */ size_t min_iterations = 0; - /** Bit length for reading the dataset vectors. 0, 64 or 128. Auto selection when 0. */ - size_t load_bit_length = 0; /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */ size_t thread_block_size = 0; /** Hashmap type. Auto selection when AUTO. */ @@ -108,7 +107,7 @@ static_assert(std::is_aggregate_v); * The index stores the dataset and a kNN graph in device memory. * * @tparam T data element type - * @tparam IdxT type of the indices in the source dataset + * @tparam IdxT type of the vector indices (represent dataset.extent(0)) * */ template @@ -123,36 +122,35 @@ struct index : ann::index { return metric_; } - // /** Total length of the index. */ - [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return dataset_.extent(0); } + // /** Total length of the index (number of vectors). 
*/ + [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT + { + return dataset_view_.extent(0); + } /** Dimensionality of the data. */ [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t { - return dataset_.extent(1); + return dataset_view_.extent(1); } /** Graph degree */ [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t { - return graph_.extent(1); + return graph_view_.extent(1); } /** Dataset [size, dim] */ - [[nodiscard]] inline auto dataset() const noexcept -> device_matrix_view + [[nodiscard]] inline auto dataset() const noexcept + -> device_matrix_view { - return dataset_.view(); + return dataset_view_; } /** neighborhood graph [size, graph-degree] */ - inline auto graph() noexcept -> device_matrix_view - { - return graph_.view(); - } - [[nodiscard]] inline auto graph() const noexcept - -> device_matrix_view + -> device_matrix_view { - return graph_.view(); + return graph_view_; } // Don't allow copying the index for performance reasons (try avoiding copying data) @@ -166,41 +164,192 @@ struct index : ann::index { index(raft::resources const& res) : ann::index(), metric_(raft::distance::DistanceType::L2Expanded), - dataset_(make_device_matrix(res, 0, 0)), - graph_(make_device_matrix(res, 0, 0)) + dataset_(make_device_matrix(res, 0, 0)), + graph_(make_device_matrix(res, 0, 0)) { } - /** Construct an index from dataset and knn_graph arrays */ + /** Construct an index from dataset and knn_graph arrays + * + * If the dataset and graph is already in GPU memory, then the index is just a thin wrapper around + * these that stores a non-owning a reference to the arrays. + * + * The constructor also accepts host arrays. In that case they are copied to the device, and the + * device arrays will be owned by the index. + * + * In case the dasates rows are not 16 bytes aligned, then we create a padded copy in device + * memory to ensure alignment for vectorized load. 
+ * + * Usage examples: + * + * - Cagra index is normally created by the cagra::build + * @code{.cpp} + * using namespace raft::neighbors::experimental; + * auto dataset = raft::make_host_matrix(n_rows, n_cols); + * load_dataset(dataset.view()); + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * // use default search parameters + * cagra::search_params search_params; + * // search K nearest neighbours + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * cagra::search(res, search_params, index, queries, neighbors, distances); + * @endcode + * In the above example, we have passed a host dataset to build. The returned index will own a + * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a + * device_mdspan to build, then it will only store a reference to it. + * + * - Constructing index using existing knn-graph + * @code{.cpp} + * using namespace raft::neighbors::experimental; + * + * auto dataset = raft::make_device_matrix(res, n_rows, n_cols); + * auto knn_graph = raft::make_device_matrix(res, n_rows, graph_degree); + * + * // custom loading and graph creation + * // load_dataset(dataset.view()); + * // create_knn_graph(knn_graph.view()); + * + * // Wrap the existing device arrays into an index structure + * cagra::index index(res, metric, raft::make_const_mdspan(dataset.view()), + * raft::make_const_mdspan(knn_graph.view())); + * + * // Both knn_graph and dataset objects have to be in scope while the index is used because + * // the index only stores a reference to these. 
+ * cagra::search(res, search_params, index, queries, neighbors, distances); + * @endcode + * + */ template index(raft::resources const& res, raft::distance::DistanceType metric, - mdspan, row_major, data_accessor> dataset, - mdspan, row_major, graph_accessor> knn_graph) + mdspan, row_major, data_accessor> dataset, + mdspan, row_major, graph_accessor> knn_graph) : ann::index(), metric_(metric), - dataset_(make_device_matrix(res, dataset.extent(0), dataset.extent(1))), - graph_(make_device_matrix(res, knn_graph.extent(0), knn_graph.extent(1))) + dataset_(make_device_matrix(res, 0, 0)), + graph_(make_device_matrix(res, 0, 0)) { RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0), "Dataset and knn_graph must have equal number of rows"); - raft::copy(dataset_.data_handle(), - dataset.data_handle(), - dataset.size(), - resource::get_cuda_stream(res)); + update_dataset(res, dataset); + update_graph(res, knn_graph); + resource::sync_stream(res); + } + + /** + * Replace the dataset with a new dataset. + * + * If the new dataset rows are aligned on 16 bytes, then only a reference is stored to the + * dataset. It is the caller's responsibility to ensure that dataset stays alive as long as the + * index. + */ + void update_dataset(raft::resources const& res, + raft::device_matrix_view dataset) + { + if (dataset.extent(1) * sizeof(T) % 16 != 0) { + RAFT_LOG_DEBUG("Creating a padded copy of CAGRA dataset in device memory"); + copy_padded(res, dataset); + } else { + dataset_view_ = make_device_strided_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1), dataset.extent(1)); + } + } + + /** + * Replace the dataset with a new dataset. + * + * We create a copy of the dataset on the device. The index manages the lifetime of this copy. 
+ */ + void update_dataset(raft::resources const& res, + raft::host_matrix_view dataset) + { + RAFT_LOG_DEBUG("Copying CAGRA dataset from host to device"); + copy_padded(res, dataset); + } + + /** + * Replace the graph with a new graph. + * + * Since the new graph is a device array, we store a reference to that, and it is + * the caller's responsibility to ensure that knn_graph stays alive as long as the index. + */ + void update_graph(raft::resources const& res, + raft::device_matrix_view knn_graph) + { + graph_view_ = knn_graph; + } + + /** + * Replace the graph with a new graph. + * + * We create a copy of the graph on the device. The index manages the lifetime of this copy. + */ + void update_graph(raft::resources const& res, + raft::host_matrix_view knn_graph) + { + RAFT_LOG_DEBUG("Copying CAGRA knn graph from host to device"); + graph_ = make_device_matrix(res, knn_graph.extent(0), knn_graph.extent(1)); raft::copy(graph_.data_handle(), knn_graph.data_handle(), knn_graph.size(), resource::get_cuda_stream(res)); - resource::sync_stream(res); + graph_view_ = graph_.view(); } private: + /** Create a device copy of the dataset, and pad it if necessary. 
*/ + template + void copy_padded(raft::resources const& res, + mdspan, row_major, data_accessor> dataset) + { + size_t padded_dim = round_up_safe(dataset.extent(1) * sizeof(T), 16) / sizeof(T); + dataset_ = make_device_matrix(res, dataset.extent(0), padded_dim); + if (dataset_.extent(1) == dataset.extent(1)) { + raft::copy(dataset_.data_handle(), + dataset.data_handle(), + dataset.size(), + resource::get_cuda_stream(res)); + } else { + // copy with padding + RAFT_CUDA_TRY(cudaMemsetAsync( + dataset_.data_handle(), 0, dataset_.size() * sizeof(T), resource::get_cuda_stream(res))); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(dataset_.data_handle(), + sizeof(T) * dataset_.extent(1), + dataset.data_handle(), + sizeof(T) * dataset.extent(1), + sizeof(T) * dataset.extent(1), + dataset.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + } + dataset_view_ = make_device_strided_matrix_view( + dataset_.data_handle(), dataset_.extent(0), dataset.extent(1), dataset_.extent(1)); + RAFT_LOG_DEBUG("CAGRA dataset strided matrix view %zux%zu, stride %zu", + static_cast(dataset_view_.extent(0)), + static_cast(dataset_view_.extent(1)), + static_cast(dataset_view_.stride(0))); + } + raft::distance::DistanceType metric_; - raft::device_matrix dataset_; - raft::device_matrix graph_; + raft::device_matrix dataset_; + raft::device_matrix graph_; + raft::device_matrix_view dataset_view_; + raft::device_matrix_view graph_view_; }; /** @} */ +} // namespace raft::neighbors::cagra + +// TODO: Remove deprecated experimental namespace in 23.12 release +namespace raft::neighbors::experimental::cagra { +using raft::neighbors::cagra::hash_mode; +using raft::neighbors::cagra::index; +using raft::neighbors::cagra::index_params; +using raft::neighbors::cagra::search_algo; +using raft::neighbors::cagra::search_params; } // namespace raft::neighbors::experimental::cagra diff --git a/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp b/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp index 
45aff99421..9fca7f8ebd 100644 --- a/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp @@ -18,7 +18,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace bitonic { namespace detail { @@ -223,4 +223,4 @@ __device__ void warp_sort(K k[N], V v[N], const bool asc = true) } } // namespace bitonic -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh index 693ab9029d..d19d7e7904 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh @@ -36,20 +36,16 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { template void build_knn_graph(raft::resources const& res, - mdspan, row_major, accessor> dataset, - raft::host_matrix_view knn_graph, + mdspan, row_major, accessor> dataset, + raft::host_matrix_view knn_graph, std::optional refine_rate = std::nullopt, std::optional build_params = std::nullopt, std::optional search_params = std::nullopt) { - RAFT_EXPECTS( - dataset.extent(1) * sizeof(DataT) % 8 == 0, - "Dataset rows are expected to have at least 8 bytes alignment. Try padding feature dims."); - RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded, "Currently only L2Expanded metric is supported"); @@ -112,7 +108,6 @@ void build_knn_graph(raft::resources const& res, max_batch_size, search_params->n_probes); - // TODO(tfeher): shall we use uint32_t? 
auto distances = raft::make_device_matrix(res, max_batch_size, gpu_top_k); auto neighbors = raft::make_device_matrix(res, max_batch_size, gpu_top_k); auto refined_distances = raft::make_device_matrix(res, max_batch_size, top_k); @@ -139,7 +134,12 @@ void build_knn_graph(raft::resources const& res, resource::get_cuda_stream(res), device_memory); + size_t next_report_offset = 0; + size_t d_report_offset = dataset.extent(0) / 100; // Report progress in 1% steps. + for (const auto& batch : vec_batches) { + // Map int64_t to uint32_t because ivf_pq requires the latter. + // TODO(tfeher): remove this mapping once ivf_pq accepts mdspan with int64_t index type auto queries_view = raft::make_device_matrix_view( batch.data(), batch.size(), batch.row_width()); auto neighbors_view = make_device_matrix_view( @@ -148,7 +148,6 @@ void build_knn_graph(raft::resources const& res, distances.data_handle(), batch.size(), distances.extent(1)); ivf_pq::search(res, *search_params, index, queries_view, neighbors_view, distances_view); - if constexpr (is_host_mdspan_v) { raft::copy(neighbors_host.data_handle(), neighbors.data_handle(), @@ -168,7 +167,7 @@ void build_knn_graph(raft::resources const& res, refined_distances_host.data_handle(), batch.size(), top_k); resource::sync_stream(res); - raft::neighbors::detail::refine_host( // res, + raft::neighbors::detail::refine_host( dataset, queries_host_view, neighbors_host_view, @@ -216,21 +215,27 @@ void build_knn_graph(raft::resources const& res, size_t num_queries_done = batch.offset() + batch.size(); const auto end_clock = std::chrono::system_clock::now(); - const auto time = - std::chrono::duration_cast(end_clock - start_clock).count() * 1e-6; - const auto throughput = num_queries_done / time; - RAFT_LOG_DEBUG( - "# Search %12lu / %12lu (%3.2f %%), %e queries/sec, %.2f minutes ETA, self included = " - "%3.2f %% \r", - num_queries_done, - dataset.extent(0), - num_queries_done / static_cast(dataset.extent(0)) * 100, - throughput, - 
(num_queries - num_queries_done) / throughput / 60, - static_cast(num_self_included) / num_queries_done * 100.); + if (batch.offset() > next_report_offset) { + next_report_offset += d_report_offset; + const auto time = + std::chrono::duration_cast(end_clock - start_clock).count() * + 1e-6; + const auto throughput = num_queries_done / time; + + RAFT_LOG_DEBUG( + "# Search %12lu / %12lu (%3.2f %%), %e queries/sec, %.2f minutes ETA, self included = " + "%3.2f %% \r", + num_queries_done, + dataset.extent(0), + num_queries_done / static_cast(dataset.extent(0)) * 100, + throughput, + (num_queries - num_queries_done) / throughput / 60, + static_cast(num_self_included) / num_queries_done * 100.); + } first = false; } + if (!first) RAFT_LOG_DEBUG("# Finished building kNN graph"); } -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index d3b24dc861..8190817b5b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -27,12 +27,10 @@ #include #include "factory.cuh" -#include "search_multi_cta.cuh" -#include "search_multi_kernel.cuh" #include "search_plan.cuh" #include "search_single_cta.cuh" -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { /** * @brief Search ANN using the constructed index. @@ -40,7 +38,9 @@ namespace raft::neighbors::experimental::cagra::detail { * See the [build](#build) documentation for a usage example. * * @tparam T data element type - * @tparam IdxT type of the indices + * @tparam IdxT type of database vector indices + * @tparam internal_IdxT during search we map IdxT to internal_IdxT, this way we do not need + * separate kernels for int/uint. 
* * @param[in] handle * @param[in] params configure the search @@ -56,9 +56,9 @@ template & index, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances) + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n", static_cast(index.dataset().extent(0)), @@ -67,7 +67,9 @@ void search_main(raft::resources const& res, static_cast(queries.extent(0)), static_cast(queries.extent(1))); RAFT_EXPECTS(queries.extent(1) == index.dim(), "Querise and index dim must match"); - uint32_t topk = neighbors.extent(1); + const uint32_t topk = neighbors.extent(1); + + if (params.max_queries == 0) { params.max_queries = queries.extent(0); } std::unique_ptr> plan = factory::create( @@ -76,8 +78,8 @@ void search_main(raft::resources const& res, plan->check(neighbors.extent(1)); RAFT_LOG_DEBUG("Cagra search"); - uint32_t max_queries = plan->max_queries; - uint32_t query_dim = queries.extent(1); + const uint32_t max_queries = plan->max_queries; + const uint32_t query_dim = queries.extent(1); for (unsigned qid = 0; qid < queries.extent(0); qid += max_queries) { const uint32_t n_queries = std::min(max_queries, queries.extent(0) - qid); @@ -92,13 +94,15 @@ void search_main(raft::resources const& res, : nullptr; uint32_t* _num_executed_iterations = nullptr; - auto dataset_internal = raft::make_device_matrix_view( - index.dataset().data_handle(), index.dataset().extent(0), index.dataset().extent(1)); - auto graph_internal = - raft::make_device_matrix_view( - reinterpret_cast(index.graph().data_handle()), - index.graph().extent(0), - index.graph().extent(1)); + auto dataset_internal = + make_device_strided_matrix_view(index.dataset().data_handle(), + index.dataset().extent(0), + index.dataset().extent(1), + index.dataset().stride(0)); + auto graph_internal = raft::make_device_matrix_view( + reinterpret_cast(index.graph().data_handle()), 
+ index.graph().extent(0), + index.graph().extent(1)); (*plan)(res, dataset_internal, @@ -130,4 +134,4 @@ void search_main(raft::resources const& res, } /** @} */ // end group cagra -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh index 04d0bb350f..8d040c352b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh @@ -22,10 +22,10 @@ #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { // Serialization version 1. -constexpr int serialization_version = 1; +constexpr int serialization_version = 2; // NB: we wrap this check in a struct, so that the updated RealSize is easy to see in the error // message. @@ -36,7 +36,8 @@ struct check_index_layout { "paste in the new size and consider updating the serialization logic"); }; -template struct check_index_layout), 136>; +constexpr size_t expected_size = 200; +template struct check_index_layout), expected_size>; /** * Save the index to file. 
@@ -59,7 +60,19 @@ void serialize(raft::resources const& res, std::ostream& os, const index(dataset.extent(0), dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + dataset.data_handle(), + sizeof(T) * dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + dataset.extent(0), + cudaMemcpyDefault, + resource::get_cuda_stream(res))); + resource::sync_stream(res); + serialize_mdspan(res, os, host_dataset.view()); serialize_mdspan(res, os, index_.graph()); } @@ -98,13 +111,13 @@ auto deserialize(raft::resources const& res, std::istream& is) -> index auto graph_degree = deserialize_scalar(res, is); auto metric = deserialize_scalar(res, is); - auto dataset = raft::make_host_matrix(n_rows, dim); - auto graph = raft::make_host_matrix(n_rows, graph_degree); - + auto dataset = raft::make_host_matrix(n_rows, dim); + auto graph = raft::make_host_matrix(n_rows, graph_degree); deserialize_mdspan(res, is, dataset.view()); deserialize_mdspan(res, is, graph.view()); - return index(res, metric, raft::make_const_mdspan(dataset.view()), graph.view()); + return index( + res, metric, raft::make_const_mdspan(dataset.view()), raft::make_const_mdspan(graph.view())); } template @@ -120,4 +133,4 @@ auto deserialize(raft::resources const& res, const std::string& filename) -> ind return index; } -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp index fd66735cf6..2758148942 100644 --- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp @@ -22,7 +22,7 @@ #include "utils.hpp" #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace device { // using LOAD_256BIT_T = ulonglong4; @@ -56,6 +56,7 @@ 
_RAFT_DEVICE void compute_distance_to_random_nodes( const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, + const std::size_t dataset_ld, const std::size_t num_pickup, const unsigned num_distilation, const uint64_t rand_xor_mask, @@ -93,7 +94,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes( for (uint32_t e = 0; e < nelem; e++) { const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen; if (k >= dataset_dim) break; - dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_dim * seed_index)))[0]; + dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_ld * seed_index)))[0]; } #pragma unroll for (uint32_t e = 0; e < nelem; e++) { @@ -146,6 +147,7 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in // [dataset_dim, dataset_size] const DATA_T* const dataset_ptr, const std::size_t dataset_dim, + const std::size_t dataset_ld, // [knn_k, dataset_size] const INDEX_T* const knn_graph, const std::uint32_t knn_k, @@ -153,13 +155,13 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in INDEX_T* const visited_hashmap_ptr, const std::uint32_t hash_bitlen, const INDEX_T* const parent_indices, - const std::uint32_t num_parents) + const std::uint32_t search_width) { const INDEX_T invalid_index = utils::get_max_value(); // Read child indices of parents from knn graph and check if the distance // computaiton is necessary. 
- for (uint32_t i = threadIdx.x; i < knn_k * num_parents; i += BLOCK_SIZE) { + for (uint32_t i = threadIdx.x; i < knn_k * search_width; i += BLOCK_SIZE) { const INDEX_T parent_id = parent_indices[i / knn_k]; INDEX_T child_id = invalid_index; if (parent_id != invalid_index) { @@ -201,10 +203,10 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in __syncthreads(); // Compute the distance to child nodes - std::uint32_t max_i = knn_k * num_parents; + std::uint32_t max_i = knn_k * search_width; if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); } for (std::uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += BLOCK_SIZE / TEAM_SIZE) { - const bool valid_i = (i < (knn_k * num_parents)); + const bool valid_i = (i < (knn_k * search_width)); INDEX_T child_id = invalid_index; if (valid_i) { child_id = result_child_indices_ptr[i]; } @@ -215,7 +217,7 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in for (unsigned e = 0; e < nelem; e++) { const unsigned k = (lane_id + (TEAM_SIZE * e)) * vlen; if (k >= dataset_dim) break; - dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_dim * child_id)))[0]; + dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_ld * child_id)))[0]; } #pragma unroll for (unsigned e = 0; e < nelem; e++) { @@ -252,4 +254,4 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_in } } // namespace device -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/device_common.hpp b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp index f9c81f3d25..b1a2207a4e 100644 --- a/cpp/include/raft/neighbors/detail/cagra/device_common.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp @@ -21,7 +21,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace 
raft::neighbors::cagra::detail { namespace device { // warpSize for compile time calculation @@ -49,4 +49,4 @@ _RAFT_DEVICE inline T swizzling(T x) } } // namespace device -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/factory.cuh b/cpp/include/raft/neighbors/detail/cagra/factory.cuh index 7d4cfee0b9..625040194b 100644 --- a/cpp/include/raft/neighbors/detail/cagra/factory.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/factory.cuh @@ -21,7 +21,7 @@ #include "search_plan.cuh" #include "search_single_cta.cuh" -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { template class factory { @@ -86,4 +86,4 @@ class factory { } } }; -}; // namespace raft::neighbors::experimental::cagra::detail +}; // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/fragment.hpp b/cpp/include/raft/neighbors/detail/cagra/fragment.hpp index c423ac12c2..e124b3fc8c 100644 --- a/cpp/include/raft/neighbors/detail/cagra/fragment.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/fragment.hpp @@ -20,7 +20,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace device { namespace detail { @@ -208,4 +208,4 @@ _RAFT_DEVICE void print_fragment(const device::fragment& a) } } // namespace device -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index feb9b76b2d..0558d7ea39 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -31,11 +31,12 @@ #include #include +#include #include #include "utils.hpp" -namespace raft::neighbors::experimental::cagra::detail { +namespace 
raft::neighbors::cagra::detail { namespace graph { // unnamed namespace to avoid multiple definition error @@ -67,7 +68,7 @@ __device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool a return false; } -template +template __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, dataset_dim] const IdxT dataset_size, const uint32_t dataset_dim, @@ -75,25 +76,23 @@ __global__ void kern_sort(const DATA_T* const dataset, // [dataset_chunk_size, const uint32_t graph_size, const uint32_t graph_degree) { - __shared__ float smem_keys[blockDim_x * numElementsPerThread]; - __shared__ IdxT smem_vals[blockDim_x * numElementsPerThread]; - - const IdxT srcNode = blockIdx.x; + const IdxT srcNode = (blockDim.x * blockIdx.x + threadIdx.x) / raft::WarpSize; if (srcNode >= graph_size) { return; } - const uint32_t num_warps = blockDim_x / 32; - const uint32_t warp_id = threadIdx.x / 32; - const uint32_t lane_id = threadIdx.x % 32; + const uint32_t lane_id = threadIdx.x % raft::WarpSize; + + float my_keys[numElementsPerThread]; + IdxT my_vals[numElementsPerThread]; // Compute distance from a src node to its neighbors - for (int k = warp_id; k < graph_degree; k += num_warps) { - const IdxT dstNode = knn_graph[k + ((uint64_t)graph_degree * srcNode)]; + for (int k = 0; k < graph_degree; k++) { + const IdxT dstNode = knn_graph[k + static_cast(graph_degree) * srcNode]; float dist = 0.0; - for (int d = lane_id; d < dataset_dim; d += 32) { + for (int d = lane_id; d < dataset_dim; d += raft::WarpSize) { float diff = spatial::knn::detail::utils::mapping{}( - dataset[d + ((uint64_t)dataset_dim * srcNode)]) - + dataset[d + static_cast(dataset_dim) * srcNode]) - spatial::knn::detail::utils::mapping{}( - dataset[d + ((uint64_t)dataset_dim * dstNode)]); + dataset[d + static_cast(dataset_dim) * dstNode]); dist += diff * diff; } dist += __shfl_xor_sync(0xffffffff, dist, 1); @@ -101,91 +100,24 @@ __global__ void kern_sort(const DATA_T* const dataset, // 
[dataset_chunk_size, dist += __shfl_xor_sync(0xffffffff, dist, 4); dist += __shfl_xor_sync(0xffffffff, dist, 8); dist += __shfl_xor_sync(0xffffffff, dist, 16); - if (lane_id == 0) { - smem_keys[k] = dist; - smem_vals[k] = dstNode; - } - } - __syncthreads(); - - float my_keys[numElementsPerThread]; - IdxT my_vals[numElementsPerThread]; - for (int i = 0; i < numElementsPerThread; i++) { - const int k = i + (numElementsPerThread * threadIdx.x); - if (k < graph_degree) { - my_keys[i] = smem_keys[k]; - my_vals[i] = smem_vals[k]; - } else { - my_keys[i] = FLT_MAX; - my_vals[i] = utils::get_max_value(); + if (lane_id == (k % raft::WarpSize)) { + my_keys[k / raft::WarpSize] = dist; + my_vals[k / raft::WarpSize] = dstNode; } } - __syncthreads(); - - // Sorting by thread - uint32_t mask = 1; - const bool ascending = ((threadIdx.x & mask) == 0); - for (int j = 0; j < numElementsPerThread; j += 2) { -#pragma unroll - for (int i = 0; i < numElementsPerThread; i += 2) { - swap_if_needed( - my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending); - } -#pragma unroll - for (int i = 1; i < numElementsPerThread - 1; i += 2) { - swap_if_needed( - my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending); + for (int k = graph_degree; k < raft::WarpSize * numElementsPerThread; k++) { + if (lane_id == k % raft::WarpSize) { + my_keys[k / raft::WarpSize] = utils::get_max_value(); + my_vals[k / raft::WarpSize] = utils::get_max_value(); } } - // Bitonic Sorting - while (mask < blockDim_x) { - const uint32_t next_mask = mask << 1; - - for (uint32_t curr_mask = mask; curr_mask > 0; curr_mask >>= 1) { - const bool ascending = ((threadIdx.x & curr_mask) == 0) == ((threadIdx.x & next_mask) == 0); - if (mask >= 32) { - // inter warp - __syncthreads(); -#pragma unroll - for (int i = 0; i < numElementsPerThread; i++) { - smem_keys[threadIdx.x + (blockDim_x * i)] = my_keys[i]; - smem_vals[threadIdx.x + (blockDim_x * i)] = my_vals[i]; - } - __syncthreads(); -#pragma unroll - for 
(int i = 0; i < numElementsPerThread; i++) { - float opp_key = smem_keys[(threadIdx.x ^ curr_mask) + (blockDim_x * i)]; - IdxT opp_val = smem_vals[(threadIdx.x ^ curr_mask) + (blockDim_x * i)]; - swap_if_needed(my_keys[i], opp_key, my_vals[i], opp_val, ascending); - } - } else { -// intra warp -#pragma unroll - for (int i = 0; i < numElementsPerThread; i++) { - float opp_key = __shfl_xor_sync(0xffffffff, my_keys[i], curr_mask); - IdxT opp_val = __shfl_xor_sync(0xffffffff, my_vals[i], curr_mask); - swap_if_needed(my_keys[i], opp_key, my_vals[i], opp_val, ascending); - } - } - } - - const bool ascending = ((threadIdx.x & next_mask) == 0); -#pragma unroll - for (uint32_t curr_mask = numElementsPerThread / 2; curr_mask > 0; curr_mask >>= 1) { -#pragma unroll - for (int i = 0; i < numElementsPerThread; i++) { - int j = i ^ curr_mask; - if (i > j) continue; - swap_if_needed(my_keys[i], my_keys[j], my_vals[i], my_vals[j], ascending); - } - } - mask = next_mask; - } + // Sort by RAFT bitonic sort + raft::util::bitonic(true).sort(my_keys, my_vals); // Update knn_graph for (int i = 0; i < numElementsPerThread; i++) { - const int k = i + (numElementsPerThread * threadIdx.x); + const int k = i * raft::WarpSize + lane_id; if (k < graph_degree) { knn_graph[k + (static_cast(graph_degree) * srcNode)] = my_vals[i]; } @@ -299,8 +231,8 @@ template , memory_type::host>> void sort_knn_graph(raft::resources const& res, - mdspan, row_major, d_accessor> dataset, - mdspan, row_major, g_accessor> knn_graph) + mdspan, row_major, d_accessor> dataset, + mdspan, row_major, g_accessor> knn_graph) { RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0), "dataset size is expected to have the same number of graph index size"); @@ -320,7 +252,7 @@ void sort_knn_graph(raft::resources const& res, const double time_sort_start = cur_time(); RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs "); - auto d_dataset = raft::make_device_matrix(res, dataset_size, dataset_dim); + auto d_dataset = 
raft::make_device_matrix(res, dataset_size, dataset_dim); raft::copy(d_dataset.data_handle(), dataset_ptr, dataset_size * dataset_dim, @@ -333,35 +265,37 @@ void sort_knn_graph(raft::resources const& res, void (*kernel_sort)( const DataT* const, const IdxT, const uint32_t, IdxT* const, const uint32_t, const uint32_t); - constexpr int numElementsPerThread = 4; - dim3 threads_sort(1, 1, 1); - if (input_graph_degree <= numElementsPerThread * 32) { - constexpr int blockDim_x = 32; - kernel_sort = kern_sort; - threads_sort.x = blockDim_x; - } else if (input_graph_degree <= numElementsPerThread * 64) { - constexpr int blockDim_x = 64; - kernel_sort = kern_sort; - threads_sort.x = blockDim_x; - } else if (input_graph_degree <= numElementsPerThread * 128) { - constexpr int blockDim_x = 128; - kernel_sort = kern_sort; - threads_sort.x = blockDim_x; - } else if (input_graph_degree <= numElementsPerThread * 256) { - constexpr int blockDim_x = 256; - kernel_sort = kern_sort; - threads_sort.x = blockDim_x; + if (input_graph_degree <= 32) { + constexpr int numElementsPerThread = 1; + kernel_sort = kern_sort; + } else if (input_graph_degree <= 64) { + constexpr int numElementsPerThread = 2; + kernel_sort = kern_sort; + } else if (input_graph_degree <= 128) { + constexpr int numElementsPerThread = 4; + kernel_sort = kern_sort; + } else if (input_graph_degree <= 256) { + constexpr int numElementsPerThread = 8; + kernel_sort = kern_sort; + } else if (input_graph_degree <= 512) { + constexpr int numElementsPerThread = 16; + kernel_sort = kern_sort; + } else if (input_graph_degree <= 1024) { + constexpr int numElementsPerThread = 32; + kernel_sort = kern_sort; } else { - RAFT_LOG_ERROR( - "[ERROR] The degree of input knn graph is too large (%u). " - "It must be equal to or small than %d.\n", + RAFT_FAIL( + "The degree of input knn graph is too large (%u). 
" + "It must be equal to or smaller than %d.", input_graph_degree, - numElementsPerThread * 256); - exit(-1); + 1024); } - dim3 blocks_sort(graph_size, 1, 1); + const auto block_size = 256; + const auto num_warps_per_block = block_size / raft::WarpSize; + const auto grid_size = (graph_size + num_warps_per_block - 1) / num_warps_per_block; + RAFT_LOG_DEBUG("."); - kernel_sort<<>>( + kernel_sort<<>>( d_dataset.data_handle(), dataset_size, dataset_dim, @@ -383,9 +317,9 @@ void sort_knn_graph(raft::resources const& res, template , memory_type::host>> -void prune(raft::resources const& res, - mdspan, row_major, g_accessor> knn_graph, - raft::host_matrix_view new_graph) +void optimize(raft::resources const& res, + mdspan, row_major, g_accessor> knn_graph, + raft::host_matrix_view new_graph) { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); @@ -400,23 +334,24 @@ void prune(raft::resources const& res, auto output_graph_ptr = new_graph.data_handle(); const IdxT graph_size = new_graph.extent(0); - auto pruned_graph = raft::make_host_matrix(graph_size, output_graph_degree); + auto pruned_graph = raft::make_host_matrix(graph_size, output_graph_degree); { // // Prune kNN graph // - auto d_input_graph = raft::make_device_matrix(res, graph_size, input_graph_degree); + auto d_input_graph = + raft::make_device_matrix(res, graph_size, input_graph_degree); - auto detour_count = raft::make_host_matrix(graph_size, input_graph_degree); + auto detour_count = raft::make_host_matrix(graph_size, input_graph_degree); auto d_detour_count = - raft::make_device_matrix(res, graph_size, input_graph_degree); + raft::make_device_matrix(res, graph_size, input_graph_degree); RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), 0xff, graph_size * input_graph_degree * sizeof(uint8_t), resource::get_cuda_stream(res))); - auto d_num_no_detour_edges = raft::make_device_vector(res, graph_size); + auto d_num_no_detour_edges = 
raft::make_device_vector(res, graph_size); RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(), 0x00, graph_size * sizeof(uint32_t), @@ -459,12 +394,11 @@ void prune(raft::resources const& res, if (input_graph_degree <= MAX_DEGREE) { kernel_prune = kern_prune; } else { - RAFT_LOG_ERROR( - "[ERROR] The degree of input knn graph is too large (%u). " - "It must be equal to or small than %d.\n", + RAFT_FAIL( + "The degree of input knn graph is too large (%u). " + "It must be equal to or smaller than %d.", input_graph_degree, 1024); - exit(-1); } const uint32_t batch_size = std::min(static_cast(graph_size), static_cast(256 * 1024)); @@ -535,8 +469,8 @@ void prune(raft::resources const& res, (double)num_full / graph_size * 100); } - auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); - auto rev_graph_count = raft::make_host_vector(graph_size); + auto rev_graph = raft::make_host_matrix(graph_size, output_graph_degree); + auto rev_graph_count = raft::make_host_vector(graph_size); { // @@ -544,20 +478,21 @@ void prune(raft::resources const& res, // const double time_make_start = cur_time(); - auto d_rev_graph = raft::make_device_matrix(res, graph_size, output_graph_degree); + auto d_rev_graph = + raft::make_device_matrix(res, graph_size, output_graph_degree); RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), 0xff, graph_size * output_graph_degree * sizeof(IdxT), resource::get_cuda_stream(res))); - auto d_rev_graph_count = raft::make_device_vector(res, graph_size); + auto d_rev_graph_count = raft::make_device_vector(res, graph_size); RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(), 0x00, graph_size * sizeof(uint32_t), resource::get_cuda_stream(res))); - auto dest_nodes = raft::make_host_vector(graph_size); - auto d_dest_nodes = raft::make_device_vector(res, graph_size); + auto dest_nodes = raft::make_host_vector(graph_size); + auto d_dest_nodes = raft::make_device_vector(res, graph_size); for (uint64_t k = 0; k < 
output_graph_degree; k++) { #pragma omp parallel for @@ -655,4 +590,4 @@ void prune(raft::resources const& res, } } // namespace graph -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp index cd2c8ec491..346bbeaa9e 100644 --- a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp @@ -18,11 +18,12 @@ #include "utils.hpp" #include #include +#include // #pragma GCC diagnostic push // #pragma GCC diagnostic ignored // #pragma GCC diagnostic pop -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace hashmap { _RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return 1U << bitlen; } @@ -84,4 +85,4 @@ _RAFT_DEVICE inline uint32_t insert(IdxT* const table, const uint32_t bitlen, co } } // namespace hashmap -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh index f9a0fef2fe..3fd4fca0f3 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh @@ -33,6 +33,7 @@ #include "compute_distance.hpp" #include "device_common.hpp" #include "hashmap.hpp" +#include "search_multi_cta_kernel.cuh" #include "search_plan.cuh" #include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible #include "utils.hpp" @@ -40,383 +41,9 @@ #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace multi_cta_search { -// #define _CLK_BREAKDOWN - 
-template -__device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [num_parents] - const uint32_t num_parents, - INDEX_T* const itopk_indices, // [num_itopk] - const size_t num_itopk, - uint32_t* const terminate_flag) -{ - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - const unsigned warp_id = threadIdx.x / 32; - if (warp_id > 0) { return; } - const unsigned lane_id = threadIdx.x % 32; - for (uint32_t i = lane_id; i < num_parents; i += 32) { - next_parent_indices[i] = utils::get_max_value(); - } - uint32_t max_itopk = num_itopk; - if (max_itopk % 32) { max_itopk += 32 - (max_itopk % 32); } - uint32_t num_new_parents = 0; - for (uint32_t j = lane_id; j < max_itopk; j += 32) { - INDEX_T index; - int new_parent = 0; - if (j < num_itopk) { - index = itopk_indices[j]; - if ((index & index_msb_1_mask) == 0) { // check if most significant bit is set - new_parent = 1; - } - } - const uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent); - if (new_parent) { - const auto i = __popc(ballot_mask & ((1 << lane_id) - 1)) + num_new_parents; - if (i < num_parents) { - next_parent_indices[i] = index; - itopk_indices[j] |= index_msb_1_mask; // set most significant bit as used node - } - } - num_new_parents += __popc(ballot_mask); - if (num_new_parents >= num_parents) { break; } - } - if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; } -} - -template -__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] - INDEX_T* indices, // [num_elements] - const uint32_t num_elements, - const uint32_t num_itopk // num_itopk <= num_elements -) -{ - const unsigned warp_id = threadIdx.x / 32; - if (warp_id > 0) { return; } - const unsigned lane_id = threadIdx.x % 32; - constexpr unsigned N = (MAX_ELEMENTS + 31) / 32; - float key[N]; - INDEX_T val[N]; - for (unsigned i = 0; i < N; i++) { - unsigned j = lane_id + (32 * i); - if (j < num_elements) { - key[i] = distances[j]; - val[i] = indices[j]; - } 
else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Warp Sort */ - bitonic::warp_sort(key, val); - /* Store itopk sorted results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; - if (j < num_itopk) { - distances[j] = key[i]; - indices[j] = val[i]; - } - } -} - -// -// multiple CTAs per single query -// -template -__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel( - INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] - DISTANCE_T* const result_distances_ptr, // [num_queries, num_cta_per_query, itopk_size] - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] - const size_t dataset_dim, - const size_t dataset_size, - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const INDEX_T* const knn_graph, // [dataset_size, graph_degree] - const uint32_t graph_degree, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] - const uint32_t num_seeds, - INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] - const uint32_t hash_bitlen, - const uint32_t itopk_size, - const uint32_t num_parents, - const uint32_t min_iteration, - const uint32_t max_iteration, - uint32_t* const num_executed_iterations /* stats */ -) -{ - assert(blockDim.x == BLOCK_SIZE); - assert(dataset_dim <= MAX_DATASET_DIM); - - // const auto num_queries = gridDim.y; - const auto query_id = blockIdx.y; - const auto num_cta_per_query = gridDim.x; - const auto cta_id = blockIdx.x; // local CTA ID - -#ifdef _CLK_BREAKDOWN - uint64_t clk_init = 0; - uint64_t clk_compute_1st_distance = 0; - uint64_t clk_topk = 0; - uint64_t clk_pickup_parents = 0; - uint64_t clk_compute_distance = 0; - uint64_t clk_start; -#define _CLK_START() clk_start = clock64() -#define _CLK_REC(V) V += clock64() - clk_start; -#else -#define _CLK_START() -#define _CLK_REC(V) -#endif - _CLK_START(); - - extern __shared__ 
uint32_t smem[]; - - // Layout of result_buffer - // +----------------+------------------------------+---------+ - // | internal_top_k | neighbors of parent nodes | padding | - // | | | upto 32 | - // +----------------+------------------------------+---------+ - // |<--- result_buffer_size --->| - uint32_t result_buffer_size = itopk_size + (num_parents * graph_degree); - uint32_t result_buffer_size_32 = result_buffer_size; - if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } - assert(result_buffer_size_32 <= MAX_ELEMENTS); - - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + MAX_DATASET_DIM); - auto result_distances_buffer = - reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto parent_indices_buffer = - reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto terminate_flag = reinterpret_cast(parent_indices_buffer + num_parents); - -#if 0 - /* debug */ - for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += BLOCK_SIZE) { - result_indices_buffer[i] = utils::get_max_value(); - result_distances_buffer[i] = utils::get_max_value(); - } -#endif - - const DATA_T* const query_ptr = queries_ptr + (dataset_dim * query_id); - for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) { - unsigned j = device::swizzling(i); - if (i < dataset_dim) { - query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); - } else { - query_buffer[j] = 0.0; - } - } - if (threadIdx.x == 0) { terminate_flag[0] = 0; } - INDEX_T* const local_visited_hashmap_ptr = - visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id); - __syncthreads(); - _CLK_REC(clk_init); - - // compute distance to randomly selecting nodes - _CLK_START(); - const INDEX_T* const local_seed_ptr = seed_ptr ? 
seed_ptr + (num_seeds * query_id) : nullptr; - device::compute_distance_to_random_nodes( - result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_ptr, - dataset_dim, - dataset_size, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen, - cta_id, - num_cta_per_query); - __syncthreads(); - _CLK_REC(clk_compute_1st_distance); - - uint32_t iter = 0; - while (1) { - // topk with bitonic sort - _CLK_START(); - topk_by_bitonic_sort(result_distances_buffer, - result_indices_buffer, - itopk_size + (num_parents * graph_degree), - itopk_size); - _CLK_REC(clk_topk); - - if (iter + 1 == max_iteration) { - __syncthreads(); - break; - } - - // pick up next parents - _CLK_START(); - pickup_next_parents( - parent_indices_buffer, num_parents, result_indices_buffer, itopk_size, terminate_flag); - _CLK_REC(clk_pickup_parents); - - __syncthreads(); - if (*terminate_flag && iter >= min_iteration) { break; } - - // compute the norms between child nodes and query node - _CLK_START(); - // constexpr unsigned max_n_frags = 16; - constexpr unsigned max_n_frags = 0; - device:: - compute_distance_to_child_nodes( - result_indices_buffer + itopk_size, - result_distances_buffer + itopk_size, - query_buffer, - dataset_ptr, - dataset_dim, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_indices_buffer, - num_parents); - _CLK_REC(clk_compute_distance); - __syncthreads(); - - iter++; - } - - for (uint32_t i = threadIdx.x; i < itopk_size; i += BLOCK_SIZE) { - uint32_t j = i + (itopk_size * (cta_id + (num_cta_per_query * query_id))); - if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[i]; } - - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - - result_indices_ptr[j] = - result_indices_buffer[i] & ~index_msb_1_mask; // clear most significant bit - } - - if (threadIdx.x == 0 && cta_id == 0 && 
num_executed_iterations != nullptr) { - num_executed_iterations[query_id] = iter + 1; - } - -#ifdef _CLK_BREAKDOWN - if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && (blockIdx.x == 0) && - ((query_id * 3) % gridDim.y < 3)) { - RAFT_LOG_DEBUG( - "query, %d, thread, %d" - ", init, %d" - ", 1st_distance, %lu" - ", topk, %lu" - ", pickup_parents, %lu" - ", distance, %lu" - "\n", - query_id, - threadIdx.x, - clk_init, - clk_compute_1st_distance, - clk_topk, - clk_pickup_parents, - clk_compute_distance); - } -#endif -} - -#define SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, LOAD_T) \ - kernel = search_kernel; - -#define SET_MC_KERNEL_2(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS) \ - if (load_bit_length == 128) { \ - SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, device::LOAD_128BIT_T) \ - } else if (load_bit_length == 64) { \ - SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, device::LOAD_64BIT_T) \ - } - -#define SET_MC_KERNEL_1(MAX_ELEMENTS) \ - /* if ( block_size == 32 ) { \ - SET_MC_KERNEL_2( 32, 32, MAX_ELEMENTS ) \ - } else */ \ - if (block_size == 64) { \ - SET_MC_KERNEL_2(64, 16, MAX_ELEMENTS) \ - } else if (block_size == 128) { \ - SET_MC_KERNEL_2(128, 8, MAX_ELEMENTS) \ - } else if (block_size == 256) { \ - SET_MC_KERNEL_2(256, 4, MAX_ELEMENTS) \ - } else if (block_size == 512) { \ - SET_MC_KERNEL_2(512, 2, MAX_ELEMENTS) \ - } else { \ - SET_MC_KERNEL_2(1024, 1, MAX_ELEMENTS) \ - } - -#define SET_MC_KERNEL \ - typedef void (*search_kernel_t)(INDEX_T* const result_indices_ptr, \ - DISTANCE_T* const result_distances_ptr, \ - const DATA_T* const dataset_ptr, \ - const size_t dataset_dim, \ - const size_t dataset_size, \ - const DATA_T* const queries_ptr, \ - const INDEX_T* const knn_graph, \ - const uint32_t graph_degree, \ - const unsigned num_distilation, \ - const uint64_t rand_xor_mask, \ - const INDEX_T* seed_ptr, \ - const uint32_t num_seeds, \ - INDEX_T* const visited_hashmap_ptr, \ - const uint32_t hash_bitlen, \ - const 
uint32_t itopk_size, \ - const uint32_t num_parents, \ - const uint32_t min_iteration, \ - const uint32_t max_iteration, \ - uint32_t* const num_executed_iterations); \ - search_kernel_t kernel; \ - if (result_buffer_size <= 64) { \ - SET_MC_KERNEL_1(64) \ - } else if (result_buffer_size <= 128) { \ - SET_MC_KERNEL_1(128) \ - } else if (result_buffer_size <= 256) { \ - SET_MC_KERNEL_1(256) \ - } - -template -__global__ void set_value_batch_kernel(T* const dev_ptr, - const std::size_t ld, - const T val, - const std::size_t count, - const std::size_t batch_size) -{ - const auto tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= count * batch_size) { return; } - const auto batch_id = tid / count; - const auto elem_id = tid % count; - dev_ptr[elem_id + ld * batch_id] = val; -} - -template -void set_value_batch(T* const dev_ptr, - const std::size_t ld, - const T val, - const std::size_t count, - const std::size_t batch_size, - cudaStream_t cuda_stream) -{ - constexpr std::uint32_t block_size = 256; - const auto grid_size = (count * batch_size + block_size - 1) / block_size; - set_value_batch_kernel - <<>>(dev_ptr, ld, val, count, batch_size); -} - template { using search_plan_impl::itopk_size; using search_plan_impl::algo; using search_plan_impl::team_size; - using search_plan_impl::num_parents; + using search_plan_impl::search_width; using search_plan_impl::min_iterations; using search_plan_impl::max_iterations; - using search_plan_impl::load_bit_length; using search_plan_impl::thread_block_size; using search_plan_impl::hashmap_mode; using search_plan_impl::hashmap_min_bitlen; @@ -453,7 +79,6 @@ struct search : public search_plan_impl { using search_plan_impl::result_buffer_size; using search_plan_impl::smem_size; - using search_plan_impl::load_bit_lenght; using search_plan_impl::hashmap; using search_plan_impl::num_executed_iterations; @@ -477,15 +102,15 @@ struct search : public search_plan_impl { topk_workspace(0, resource::get_cuda_stream(res)) { - 
set_params(res); + set_params(res, params); } - void set_params(raft::resources const& res) + void set_params(raft::resources const& res, const search_params& params) { this->itopk_size = 32; - num_parents = 1; - num_cta_per_query = max(num_parents, itopk_size / 32); - result_buffer_size = itopk_size + num_parents * graph_degree; + search_width = 1; + num_cta_per_query = max(params.search_width, params.itopk_size / 32); + result_buffer_size = itopk_size + search_width * graph_degree; typedef raft::Pow2<32> AlignBytes; unsigned result_buffer_size_32 = AlignBytes::roundUp(result_buffer_size); // constexpr unsigned max_result_buffer_size = 256; @@ -493,7 +118,7 @@ struct search : public search_plan_impl { smem_size = sizeof(float) * max_dim + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + - sizeof(uint32_t) * num_parents + sizeof(uint32_t); + sizeof(uint32_t) * search_width + sizeof(uint32_t); RAFT_LOG_DEBUG("# smem_size: %u", smem_size); // @@ -518,7 +143,7 @@ struct search : public search_plan_impl { cudaDeviceProp deviceProp = resource::get_device_properties(res); RAFT_LOG_DEBUG("# multiProcessorCount: %d", deviceProp.multiProcessorCount); while ((block_size < max_block_size) && - (graph_degree * num_parents * team_size >= block_size * 2) && + (graph_degree * search_width * team_size >= block_size * 2) && (num_cta_per_query * max_queries <= (1024 / (block_size * 2)) * deviceProp.multiProcessorCount)) { block_size *= 2; @@ -533,30 +158,14 @@ struct search : public search_plan_impl { max_block_size); thread_block_size = block_size; - // - // Determine load bit length - // - const uint32_t total_bit_length = dim * sizeof(DATA_T) * 8; - if (load_bit_length == 0) { - load_bit_length = 128; - while (total_bit_length % load_bit_length) { - load_bit_length /= 2; - } - } - RAFT_LOG_DEBUG("# load_bit_length: %u (%u loads per vector)", - load_bit_length, - total_bit_length / load_bit_length); - RAFT_EXPECTS(total_bit_length % load_bit_length == 0, - 
"load_bit_length must be a divisor of dim*sizeof(data_t)*8=%u", - total_bit_length); - RAFT_EXPECTS(load_bit_length >= 64, "load_bit_lenght cannot be less than 64"); - // // Allocate memory for intermediate buffer and workspace. // uint32_t num_intermediate_results = num_cta_per_query * itopk_size; - intermediate_indices.resize(num_intermediate_results, resource::get_cuda_stream(res)); - intermediate_distances.resize(num_intermediate_results, resource::get_cuda_stream(res)); + intermediate_indices.resize(num_intermediate_results * max_queries, + resource::get_cuda_stream(res)); + intermediate_distances.resize(num_intermediate_results * max_queries, + resource::get_cuda_stream(res)); hashmap.resize(hashmap_size, resource::get_cuda_stream(res)); @@ -569,8 +178,8 @@ struct search : public search_plan_impl { ~search() {} void operator()(raft::resources const& res, - raft::device_matrix_view dataset, - raft::device_matrix_view graph, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, INDEX_T* const topk_indices_ptr, // [num_queries, topk] DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -580,42 +189,31 @@ struct search : public search_plan_impl { uint32_t topk) { cudaStream_t stream = resource::get_cuda_stream(res); - uint32_t block_size = thread_block_size; - - SET_MC_KERNEL; - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - // Initialize hash table - const uint32_t hash_size = hashmap::get_size(hash_bitlen); - set_value_batch( - hashmap.data(), hash_size, utils::get_max_value(), hash_size, num_queries, stream); - dim3 block_dims(block_size, 1, 1); - dim3 grid_dims(num_cta_per_query, num_queries, 1); - RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %lu smem", - block_size, - num_cta_per_query, - num_queries, - smem_size); - kernel<<>>(intermediate_indices.data(), - intermediate_distances.data(), - 
dataset.data_handle(), - dataset.extent(1), - dataset.extent(0), - queries_ptr, - graph.data_handle(), - graph.extent(1), - num_random_samplings, - rand_xor_mask, - dev_seed_ptr, - num_seeds, - hashmap.data(), - hash_bitlen, - itopk_size, - num_parents, - min_iterations, - max_iterations, - num_executed_iterations); + select_and_run( + dataset, + graph, + intermediate_indices.data(), + intermediate_distances.data(), + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + topk, + thread_block_size, + result_buffer_size, + smem_size, + hash_bitlen, + hashmap.data(), + num_cta_per_query, + num_random_samplings, + rand_xor_mask, + num_seeds, + itopk_size, + search_width, + min_iterations, + max_iterations, + stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); // Select the top-k results from the intermediate results @@ -639,4 +237,4 @@ struct search : public search_plan_impl { }; } // namespace multi_cta_search -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh new file mode 100644 index 0000000000..de83acbb64 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include // RAFT_EXPLICIT + +namespace raft::neighbors::cagra::detail { +namespace multi_cta_search { + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +template +void select_and_run(raft::device_matrix_view dataset, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, + DISTANCE_T* const topk_distances_ptr, + const DATA_T* const queries_ptr, + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, + uint32_t* const num_executed_iterations, + uint32_t topk, + uint32_t block_size, + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + INDEX_T* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + cudaStream_t stream) RAFT_EXPLICIT; +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + extern template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, float, uint32_t, float); +instantiate_kernel_selection(8, 128, float, uint32_t, float); +instantiate_kernel_selection(16, 256, float, uint32_t, float); +instantiate_kernel_selection(32, 512, float, uint32_t, float); 
+instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float); +instantiate_kernel_selection(8, 128, int8_t, uint32_t, float); +instantiate_kernel_selection(16, 256, int8_t, uint32_t, float); +instantiate_kernel_selection(32, 512, int8_t, uint32_t, float); +instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float); +instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float); +instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float); +instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float); + +#undef instantiate_kernel_selection +} // namespace multi_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh new file mode 100644 index 0000000000..0015b4a791 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bitonic.hpp" +#include "compute_distance.hpp" +#include "device_common.hpp" +#include "hashmap.hpp" +#include "search_plan.cuh" +#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk if possible +#include "utils.hpp" +#include +#include +#include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp + +namespace raft::neighbors::cagra::detail { +namespace multi_cta_search { + +// #define _CLK_BREAKDOWN + +template +__device__ void pickup_next_parents(INDEX_T* const next_parent_indices, // [search_width] + const uint32_t search_width, + INDEX_T* const itopk_indices, // [num_itopk] + const size_t num_itopk, + uint32_t* const terminate_flag) +{ + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + const unsigned warp_id = threadIdx.x / 32; + if (warp_id > 0) { return; } + const unsigned lane_id = threadIdx.x % 32; + for (uint32_t i = lane_id; i < search_width; i += 32) { + next_parent_indices[i] = utils::get_max_value(); + } + uint32_t max_itopk = num_itopk; + if (max_itopk % 32) { max_itopk += 32 - (max_itopk % 32); } + uint32_t num_new_parents = 0; + for (uint32_t j = lane_id; j < max_itopk; j += 32) { + INDEX_T index; + int new_parent = 0; + if (j < num_itopk) { + index = itopk_indices[j]; + if ((index & index_msb_1_mask) == 0) { // check if most significant bit is set + new_parent = 1; + } + } + const uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent); + if (new_parent) { + const auto i = __popc(ballot_mask & ((1 << lane_id) - 1)) + num_new_parents; + if (i < search_width) { + next_parent_indices[i] = index; + itopk_indices[j] |= index_msb_1_mask; // set most significant bit as used node + } + } + num_new_parents += __popc(ballot_mask); + if (num_new_parents >= search_width) { break; } + } + if (threadIdx.x == 0 && 
(num_new_parents == 0)) { *terminate_flag = 1; } +} + +template +__device__ inline void topk_by_bitonic_sort(float* distances, // [num_elements] + INDEX_T* indices, // [num_elements] + const uint32_t num_elements, + const uint32_t num_itopk // num_itopk <= num_elements +) +{ + const unsigned warp_id = threadIdx.x / 32; + if (warp_id > 0) { return; } + const unsigned lane_id = threadIdx.x % 32; + constexpr unsigned N = (MAX_ELEMENTS + 31) / 32; + float key[N]; + INDEX_T val[N]; + for (unsigned i = 0; i < N; i++) { + unsigned j = lane_id + (32 * i); + if (j < num_elements) { + key[i] = distances[j]; + val[i] = indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Warp Sort */ + bitonic::warp_sort(key, val); + /* Store itopk sorted results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; + if (j < num_itopk) { + distances[j] = key[i]; + indices[j] = val[i]; + } + } +} + +// +// multiple CTAs per single query +// +template +__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel( + INDEX_T* const result_indices_ptr, // [num_queries, num_cta_per_query, itopk_size] + DISTANCE_T* const result_distances_ptr, // [num_queries, num_cta_per_query, itopk_size] + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const size_t dataset_dim, + const size_t dataset_size, + const size_t dataset_ld, + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const uint32_t graph_degree, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const uint32_t num_seeds, + INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + const uint32_t hash_bitlen, + const uint32_t itopk_size, + const uint32_t search_width, + const uint32_t min_iteration, + const uint32_t max_iteration, + uint32_t* const num_executed_iterations /* stats 
*/ +) +{ + assert(blockDim.x == BLOCK_SIZE); + assert(dataset_dim <= MAX_DATASET_DIM); + + const auto num_queries = gridDim.y; + const auto query_id = blockIdx.y; + const auto num_cta_per_query = gridDim.x; + const auto cta_id = blockIdx.x; // local CTA ID + +#ifdef _CLK_BREAKDOWN + uint64_t clk_init = 0; + uint64_t clk_compute_1st_distance = 0; + uint64_t clk_topk = 0; + uint64_t clk_pickup_parents = 0; + uint64_t clk_compute_distance = 0; + uint64_t clk_start; +#define _CLK_START() clk_start = clock64() +#define _CLK_REC(V) V += clock64() - clk_start; +#else +#define _CLK_START() +#define _CLK_REC(V) +#endif + _CLK_START(); + + extern __shared__ uint32_t smem[]; + + // Layout of result_buffer + // +----------------+------------------------------+---------+ + // | internal_top_k | neighbors of parent nodes | padding | + // | | | upto 32 | + // +----------------+------------------------------+---------+ + // |<--- result_buffer_size --->| + uint32_t result_buffer_size = itopk_size + (search_width * graph_degree); + uint32_t result_buffer_size_32 = result_buffer_size; + if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } + assert(result_buffer_size_32 <= MAX_ELEMENTS); + + auto query_buffer = reinterpret_cast(smem); + auto result_indices_buffer = reinterpret_cast(query_buffer + MAX_DATASET_DIM); + auto result_distances_buffer = + reinterpret_cast(result_indices_buffer + result_buffer_size_32); + auto parent_indices_buffer = + reinterpret_cast(result_distances_buffer + result_buffer_size_32); + auto terminate_flag = reinterpret_cast(parent_indices_buffer + search_width); + +#if 0 + /* debug */ + for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += BLOCK_SIZE) { + result_indices_buffer[i] = utils::get_max_value(); + result_distances_buffer[i] = utils::get_max_value(); + } +#endif + const DATA_T* const query_ptr = queries_ptr + (dataset_dim * query_id); + for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += 
BLOCK_SIZE) { + unsigned j = device::swizzling(i); + if (i < dataset_dim) { + query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); + } else { + query_buffer[j] = 0.0; + } + } + if (threadIdx.x == 0) { terminate_flag[0] = 0; } + INDEX_T* const local_visited_hashmap_ptr = + visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id); + __syncthreads(); + _CLK_REC(clk_init); + + // compute distance to randomly selecting nodes + _CLK_START(); + const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr; + uint32_t block_id = cta_id + (num_cta_per_query * query_id); + uint32_t num_blocks = num_cta_per_query * num_queries; + device::compute_distance_to_random_nodes( + result_indices_buffer, + result_distances_buffer, + query_buffer, + dataset_ptr, + dataset_dim, + dataset_size, + dataset_ld, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen, + block_id, + num_blocks); + __syncthreads(); + _CLK_REC(clk_compute_1st_distance); + + uint32_t iter = 0; + while (1) { + // topk with bitonic sort + _CLK_START(); + topk_by_bitonic_sort(result_distances_buffer, + result_indices_buffer, + itopk_size + (search_width * graph_degree), + itopk_size); + _CLK_REC(clk_topk); + + if (iter + 1 == max_iteration) { + __syncthreads(); + break; + } + + // pick up next parents + _CLK_START(); + pickup_next_parents( + parent_indices_buffer, search_width, result_indices_buffer, itopk_size, terminate_flag); + _CLK_REC(clk_pickup_parents); + + __syncthreads(); + if (*terminate_flag && iter >= min_iteration) { break; } + + // compute the norms between child nodes and query node + _CLK_START(); + // constexpr unsigned max_n_frags = 16; + constexpr unsigned max_n_frags = 0; + device:: + compute_distance_to_child_nodes( + result_indices_buffer + itopk_size, + result_distances_buffer + itopk_size, + query_buffer, + dataset_ptr, + dataset_dim, + dataset_ld, + 
knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_indices_buffer, + search_width); + _CLK_REC(clk_compute_distance); + __syncthreads(); + + iter++; + } + + for (uint32_t i = threadIdx.x; i < itopk_size; i += BLOCK_SIZE) { + uint32_t j = i + (itopk_size * (cta_id + (num_cta_per_query * query_id))); + if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[i]; } + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + + result_indices_ptr[j] = + result_indices_buffer[i] & ~index_msb_1_mask; // clear most significant bit + } + + if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) { + num_executed_iterations[query_id] = iter + 1; + } + +#ifdef _CLK_BREAKDOWN + if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && (blockIdx.x == 0) && + ((query_id * 3) % gridDim.y < 3)) { + RAFT_LOG_DEBUG( + "query, %d, thread, %d" + ", init, %d" + ", 1st_distance, %lu" + ", topk, %lu" + ", pickup_parents, %lu" + ", distance, %lu" + "\n", + query_id, + threadIdx.x, + clk_init, + clk_compute_1st_distance, + clk_topk, + clk_pickup_parents, + clk_compute_distance); + } +#endif +} + +template +__global__ void set_value_batch_kernel(T* const dev_ptr, + const std::size_t ld, + const T val, + const std::size_t count, + const std::size_t batch_size) +{ + const auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= count * batch_size) { return; } + const auto batch_id = tid / count; + const auto elem_id = tid % count; + dev_ptr[elem_id + ld * batch_id] = val; +} + +template +void set_value_batch(T* const dev_ptr, + const std::size_t ld, + const T val, + const std::size_t count, + const std::size_t batch_size, + cudaStream_t cuda_stream) +{ + constexpr std::uint32_t block_size = 256; + const auto grid_size = (count * batch_size + block_size - 1) / block_size; + set_value_batch_kernel + <<>>(dev_ptr, ld, val, count, batch_size); +} + +template +struct search_kernel_config { + // 
Search kernel function type. Note that the actual values for the template value + // parameters do not matter, because they are not part of the function signature. The + // second to fourth value parameters will be selected by the choose_* functions below. + using kernel_t = decltype(&search_kernel); + + static auto choose_buffer_size(unsigned result_buffer_size, unsigned block_size) -> kernel_t + { + if (result_buffer_size <= 64) { + return choose_max_elements<64>(block_size); + } else if (result_buffer_size <= 128) { + return choose_max_elements<128>(block_size); + } else if (result_buffer_size <= 256) { + return choose_max_elements<256>(block_size); + } + THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256); + } + + template + // Todo: rename this to choose block_size + static auto choose_max_elements(unsigned block_size) -> kernel_t + { + if (block_size == 64) { + return search_kernel; + } else if (block_size == 128) { + return search_kernel; + } else if (block_size == 256) { + return search_kernel; + } else if (block_size == 512) { + return search_kernel; + } else { + return search_kernel; + } + } +}; + +template +void select_and_run( // raft::resources const& res, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* const num_executed_iterations, // [num_queries,] + uint32_t topk, + // multi_cta_search (params struct) + uint32_t block_size, // + uint32_t result_buffer_size, + uint32_t smem_size, + int64_t hash_bitlen, + INDEX_T* hashmap_ptr, + uint32_t num_cta_per_query, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t 
max_iterations, + cudaStream_t stream) +{ + auto kernel = search_kernel_config:: + choose_buffer_size(result_buffer_size, block_size); + + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + // Initialize hash table + const uint32_t hash_size = hashmap::get_size(hash_bitlen); + set_value_batch( + hashmap_ptr, hash_size, utils::get_max_value(), hash_size, num_queries, stream); + + dim3 block_dims(block_size, 1, 1); + dim3 grid_dims(num_cta_per_query, num_queries, 1); + RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %lu smem", + block_size, + num_cta_per_query, + num_queries, + smem_size); + kernel<<>>(topk_indices_ptr, + topk_distances_ptr, + dataset.data_handle(), + dataset.extent(1), + dataset.extent(0), + dataset.stride(0), + queries_ptr, + graph.data_handle(), + graph.extent(1), + num_random_samplings, + rand_xor_mask, + dev_seed_ptr, + num_seeds, + hashmap_ptr, + hash_bitlen, + itopk_size, + search_width, + min_iterations, + max_iterations, + num_executed_iterations); +} + +} // namespace multi_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel.cuh new file mode 100644 index 0000000000..e003907292 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel.cuh @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "search_multi_cta_kernel-inl.cuh" +#endif + +#ifdef RAFT_COMPILED +#include "search_multi_cta_kernel-ext.cuh" +#endif diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index 8fbd5d8f03..e664764721 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -40,7 +40,7 @@ #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace multi_kernel_search { template @@ -93,6 +93,7 @@ __global__ void random_pickup_kernel( const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] const std::size_t dataset_dim, const std::size_t dataset_size, + const std::size_t dataset_ld, const DATA_T* const queries_ptr, // [num_queries, dataset_dim] const std::size_t num_pickup, const unsigned num_distilation, @@ -125,7 +126,7 @@ __global__ void random_pickup_kernel( } device::fragment random_data_frag; device::load_vector_sync( - random_data_frag, dataset_ptr + (dataset_dim * seed_index), dataset_dim); + random_data_frag, dataset_ptr + (dataset_ld * seed_index), dataset_dim); // Compute the norm of two data const auto norm2 = device::norm2( @@ -163,6 +164,7 @@ template >>(dataset_ptr, dataset_dim, dataset_size, + dataset_ld, queries_ptr, num_pickup, num_distilation, @@ -305,11 +308,12 @@ template __global__ void compute_distance_to_child_nodes_kernel( - const INDEX_T* const parent_node_list, // [num_queries, num_parents] - const std::uint32_t num_parents, + const INDEX_T* const parent_node_list, // [num_queries, search_width] + const std::uint32_t search_width, 
const DATA_T* const dataset_ptr, // [dataset_size, data_dim] const std::uint32_t data_dim, const std::uint32_t dataset_size, + const std::uint32_t dataset_ld, const INDEX_T* const neighbor_graph_ptr, // [dataset_size, graph_degree] const std::uint32_t graph_degree, const DATA_T* query_ptr, // [num_queries, data_dim] @@ -317,16 +321,16 @@ __global__ void compute_distance_to_child_nodes_kernel( const std::uint32_t hash_bitlen, INDEX_T* const result_indices_ptr, // [num_queries, ldd] DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] - const std::uint32_t ldd // (*) ldd >= num_parents * graph_degree + const std::uint32_t ldd // (*) ldd >= search_width * graph_degree ) { const uint32_t ldb = hashmap::get_size(hash_bitlen); const auto tid = threadIdx.x + blockDim.x * blockIdx.x; const auto global_team_id = tid / TEAM_SIZE; - if (global_team_id >= num_parents * graph_degree) { return; } + if (global_team_id >= search_width * graph_degree) { return; } const std::size_t parent_index = - parent_node_list[global_team_id / graph_degree + (num_parents * blockIdx.y)]; + parent_node_list[global_team_id / graph_degree + (search_width * blockIdx.y)]; if (parent_index == utils::get_max_value()) { result_distances_ptr[ldd * blockIdx.y + global_team_id] = utils::get_max_value(); return; @@ -338,7 +342,7 @@ __global__ void compute_distance_to_child_nodes_kernel( if (hashmap::insert( visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id)) { device::fragment frag_target; - device::load_vector_sync(frag_target, dataset_ptr + (data_dim * child_id), data_dim); + device::load_vector_sync(frag_target, dataset_ptr + (dataset_ld * child_id), data_dim); device::fragment frag_query; device::load_vector_sync(frag_query, query_ptr + blockIdx.y * data_dim, data_dim); @@ -365,11 +369,12 @@ template void compute_distance_to_child_nodes( - const INDEX_T* const parent_node_list, // [num_queries, num_parents] - const uint32_t num_parents, + const INDEX_T* const parent_node_list, // 
[num_queries, search_width] + const uint32_t search_width, const DATA_T* const dataset_ptr, // [dataset_size, data_dim] const std::uint32_t data_dim, const std::uint32_t dataset_size, + const std::uint32_t dataset_ld, const INDEX_T* const neighbor_graph_ptr, // [dataset_size, graph_degree] const std::uint32_t graph_degree, const DATA_T* query_ptr, // [num_queries, data_dim] @@ -378,19 +383,20 @@ void compute_distance_to_child_nodes( const std::uint32_t hash_bitlen, INDEX_T* const result_indices_ptr, // [num_queries, ldd] DISTANCE_T* const result_distances_ptr, // [num_queries, ldd] - const std::uint32_t ldd, // (*) ldd >= num_parents * graph_degree + const std::uint32_t ldd, // (*) ldd >= search_width * graph_degree cudaStream_t cuda_stream = 0) { const auto block_size = 128; const dim3 grid_size( - (num_parents * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE), + (search_width * graph_degree + (block_size / TEAM_SIZE) - 1) / (block_size / TEAM_SIZE), num_queries); compute_distance_to_child_nodes_kernel <<>>(parent_node_list, - num_parents, + search_width, dataset_ptr, data_dim, dataset_size, + dataset_ld, neighbor_graph_ptr, graph_degree, query_ptr, @@ -493,7 +499,7 @@ void set_value_batch(T* const dev_ptr, // result_buffer (work buffer) for "multi-kernel" // +--------------------+------------------------------+-------------------+ // | internal_top_k (A) | neighbors of internal_top_k | internal_topk (B) | -// | | | | +// | | | | // +--------------------+------------------------------+-------------------+ // |<--- result_buffer_allocation_size --->| // |<--- result_buffer_size --->| // Double buffer (A) @@ -508,10 +514,9 @@ struct search : search_plan_impl { using search_plan_impl::itopk_size; using search_plan_impl::algo; using search_plan_impl::team_size; - using search_plan_impl::num_parents; + using search_plan_impl::search_width; using search_plan_impl::min_iterations; using search_plan_impl::max_iterations; - using 
search_plan_impl::load_bit_length; using search_plan_impl::thread_block_size; using search_plan_impl::hashmap_mode; using search_plan_impl::hashmap_min_bitlen; @@ -533,7 +538,6 @@ struct search : search_plan_impl { using search_plan_impl::result_buffer_size; using search_plan_impl::smem_size; - using search_plan_impl::load_bit_lenght; using search_plan_impl::hashmap; using search_plan_impl::num_executed_iterations; @@ -569,14 +573,14 @@ struct search : search_plan_impl { // // Allocate memory for intermediate buffer and workspace. // - result_buffer_size = itopk_size + (num_parents * graph_degree); + result_buffer_size = itopk_size + (search_width * graph_degree); result_buffer_allocation_size = result_buffer_size + itopk_size; result_indices.resize(result_buffer_allocation_size * max_queries, resource::get_cuda_stream(res)); result_distances.resize(result_buffer_allocation_size * max_queries, resource::get_cuda_stream(res)); - parent_node_list.resize(max_queries * num_parents, resource::get_cuda_stream(res)); + parent_node_list.resize(max_queries * search_width, resource::get_cuda_stream(res)); topk_hint.resize(max_queries, resource::get_cuda_stream(res)); size_t topk_workspace_size = _cuann_find_topk_bufferSize( @@ -590,8 +594,8 @@ struct search : search_plan_impl { ~search() {} void operator()(raft::resources const& res, - raft::device_matrix_view dataset, - raft::device_matrix_view graph, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, INDEX_T* const topk_indices_ptr, // [num_queries, topk] DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -613,6 +617,7 @@ struct search : search_plan_impl { dataset.data_handle(), dataset.extent(1), dataset.extent(0), + dataset.stride(0), queries_ptr, num_queries, result_buffer_size, @@ -665,8 +670,8 @@ struct search : search_plan_impl { hash_bitlen, _small_hash_bitlen, parent_node_list.data(), - num_parents, - num_parents, + 
search_width, + search_width, terminate_flag.data(), stream); @@ -679,10 +684,11 @@ struct search : search_plan_impl { // Compute distance to child nodes that are adjacent to the parent node compute_distance_to_child_nodes( parent_node_list.data(), - num_parents, + search_width, dataset.data_handle(), dataset.extent(1), dataset.extent(0), + dataset.stride(0), graph.data_handle(), graph.extent(1), queries_ptr, @@ -732,4 +738,4 @@ struct search : search_plan_impl { }; } // namespace multi_kernel_search -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh index 3bed100a70..bc2102b9b0 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh @@ -26,7 +26,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { struct search_plan_impl_base : public search_params { int64_t max_dim; @@ -53,7 +53,6 @@ struct search_plan_impl_base : public search_params { max_dim = 128; while (max_dim < dim && max_dim <= 1024) max_dim *= 2; - if (team_size != 0) { RAFT_LOG_WARN("Overriding team size parameter."); } // To keep binary size in check we limit only one team size specialization for each max_dim. // TODO(tfeher): revise this decision. 
switch (max_dim) { @@ -77,7 +76,6 @@ struct search_plan_impl : public search_plan_impl_base { uint32_t result_buffer_size; uint32_t smem_size; - uint32_t load_bit_lenght; uint32_t topk; uint32_t num_seeds; @@ -107,8 +105,8 @@ struct search_plan_impl : public search_plan_impl_base { virtual ~search_plan_impl() {} virtual void operator()(raft::resources const& res, - raft::device_matrix_view dataset, - raft::device_matrix_view graph, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const result_distances_ptr, // [num_queries, topk] const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -125,7 +123,7 @@ struct search_plan_impl : public search_plan_impl_base { _max_iterations = 1 + std::min(32 * 1.1, 32 + 10.0); // TODO(anaruse) } else { _max_iterations = - 1 + std::min((itopk_size / num_parents) * 1.1, (itopk_size / num_parents) + 10.0); + 1 + std::min((itopk_size / search_width) * 1.1, (itopk_size / search_width) + 10.0); } } if (max_iterations < min_iterations) { _max_iterations = min_iterations; } @@ -149,14 +147,14 @@ struct search_plan_impl : public search_plan_impl_base { { // for multipel CTA search uint32_t mc_num_cta_per_query = 0; - uint32_t mc_num_parents = 0; + uint32_t mc_search_width = 0; uint32_t mc_itopk_size = 0; if (algo == search_algo::MULTI_CTA) { mc_itopk_size = 32; - mc_num_parents = 1; - mc_num_cta_per_query = max(num_parents, itopk_size / 32); + mc_search_width = 1; + mc_num_cta_per_query = max(search_width, itopk_size / 32); RAFT_LOG_DEBUG("# mc_itopk_size: %u", mc_itopk_size); - RAFT_LOG_DEBUG("# mc_num_parents: %u", mc_num_parents); + RAFT_LOG_DEBUG("# mc_search_width: %u", mc_search_width); RAFT_LOG_DEBUG("# mc_num_cta_per_query: %u", mc_num_cta_per_query); } @@ -174,7 +172,7 @@ struct search_plan_impl : public search_plan_impl_base { // be determined based on the internal topk size and the number of nodes // visited per iteration. 
// - const auto max_visited_nodes = itopk_size + (num_parents * graph_degree * 1); + const auto max_visited_nodes = itopk_size + (search_width * graph_degree * 1); unsigned min_bitlen = 8; // 256 unsigned max_bitlen = 13; // 8K if (min_bitlen < hashmap_min_bitlen) { min_bitlen = hashmap_min_bitlen; } @@ -188,11 +186,9 @@ struct search_plan_impl : public search_plan_impl_base { hash_bitlen = 0; break; } else { - RAFT_LOG_DEBUG( - "[CAGRA Error]" + RAFT_FAIL( "small-hash cannot be used because the required hash size exceeds the limit (%u)", hashmap::get_size(max_bitlen)); - exit(-1); } } small_hash_bitlen = hash_bitlen; @@ -205,7 +201,7 @@ struct search_plan_impl : public search_plan_impl_base { small_hash_reset_interval = 1; while (1) { const auto max_visited_nodes = - itopk_size + (num_parents * graph_degree * (small_hash_reset_interval + 1)); + itopk_size + (search_width * graph_degree * (small_hash_reset_interval + 1)); if (max_visited_nodes > hashmap::get_size(hash_bitlen) * max_fill_rate) { break; } small_hash_reset_interval += 1; } @@ -217,9 +213,9 @@ struct search_plan_impl : public search_plan_impl_base { // nodes that may be visited before the search is completed and the // maximum fill rate of the hash table. 
// - uint32_t max_visited_nodes = itopk_size + (num_parents * graph_degree * max_iterations); + uint32_t max_visited_nodes = itopk_size + (search_width * graph_degree * max_iterations); if (algo == search_algo::MULTI_CTA) { - max_visited_nodes = mc_itopk_size + (mc_num_parents * graph_degree * max_iterations); + max_visited_nodes = mc_itopk_size + (mc_search_width * graph_degree * max_iterations); max_visited_nodes *= mc_num_cta_per_query; } unsigned min_bitlen = 11; // 2K @@ -232,7 +228,7 @@ struct search_plan_impl : public search_plan_impl_base { } RAFT_LOG_DEBUG("# internal topK = %lu", itopk_size); - RAFT_LOG_DEBUG("# parent size = %lu", num_parents); + RAFT_LOG_DEBUG("# parent size = %lu", search_width); RAFT_LOG_DEBUG("# min_iterations = %lu", min_iterations); RAFT_LOG_DEBUG("# max_iterations = %lu", max_iterations); RAFT_LOG_DEBUG("# max_queries = %lu", max_queries); @@ -258,7 +254,7 @@ struct search_plan_impl : public search_plan_impl_base { { RAFT_EXPECTS(topk <= itopk_size, "topk must be smaller than itopk_size = %lu", itopk_size); if (algo == search_algo::MULTI_CTA) { - uint32_t mc_num_cta_per_query = max(num_parents, itopk_size / 32); + uint32_t mc_num_cta_per_query = max(search_width, itopk_size / 32); RAFT_EXPECTS(mc_num_cta_per_query * 32 >= topk, "`mc_num_cta_per_query` (%u) * 32 must be equal to or greater than " "`topk` /%u) when 'search_mode' is \"multi-cta\"", @@ -286,14 +282,10 @@ struct search_plan_impl : public search_plan_impl_base { error_message += "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given."; } - if (load_bit_length != 0 && load_bit_length != 64 && load_bit_length != 128) { - error_message += "`load_bit_length` must be 0, 64 or 128. 
" + - std::to_string(load_bit_length) + " has been given."; - } if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 && thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) { error_message += "`thread_block_size` must be 0, 64, 128, 256 or 512. " + - std::to_string(load_bit_length) + " has been given."; + std::to_string(thread_block_size) + " has been given."; } if (hashmap_min_bitlen > 20) { error_message += "`hashmap_min_bitlen` must be equal to or smaller than 20. " + @@ -332,4 +324,4 @@ struct search_plan_impl : public search_plan_impl_base { // }; /** @} */ // end group cagra -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh index 9400a16c36..96de83369d 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh @@ -34,902 +34,17 @@ #include "device_common.hpp" #include "hashmap.hpp" #include "search_plan.cuh" +#include "search_single_cta_kernel.cuh" +#include "topk_by_radix.cuh" #include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk #include "utils.hpp" #include #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace single_cta_search { -// #define _CLK_BREAKDOWN - -template -__device__ void pickup_next_parents(std::uint32_t* const terminate_flag, - INDEX_T* const next_parent_indices, - INDEX_T* const internal_topk_indices, - const std::size_t internal_topk_size, - const std::size_t dataset_size, - const std::uint32_t num_parents) -{ - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - // if (threadIdx.x >= 32) return; - 
- for (std::uint32_t i = threadIdx.x; i < num_parents; i += 32) { - next_parent_indices[i] = utils::get_max_value(); - } - std::uint32_t itopk_max = internal_topk_size; - if (itopk_max % 32) { itopk_max += 32 - (itopk_max % 32); } - std::uint32_t num_new_parents = 0; - for (std::uint32_t j = threadIdx.x; j < itopk_max; j += 32) { - std::uint32_t jj = j; - if (TOPK_BY_BITONIC_SORT) { jj = device::swizzling(j); } - INDEX_T index; - int new_parent = 0; - if (j < internal_topk_size) { - index = internal_topk_indices[jj]; - if ((index & index_msb_1_mask) == 0) { // check if most significant bit is set - new_parent = 1; - } - } - const std::uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent); - if (new_parent) { - const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents; - if (i < num_parents) { - next_parent_indices[i] = index; - // set most significant bit as used node - internal_topk_indices[jj] |= index_msb_1_mask; - } - } - num_new_parents += __popc(ballot_mask); - if (num_new_parents >= num_parents) { break; } - } - if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; } -} - -template -struct topk_by_radix_sort_base { - static constexpr std::uint32_t smem_size = MAX_INTERNAL_TOPK * 2 + 2048 + 8; - static constexpr std::uint32_t state_bit_lenght = 0; - static constexpr std::uint32_t vecLen = 2; // TODO -}; -template -struct topk_by_radix_sort : topk_by_radix_sort_base {}; - -template -struct topk_by_radix_sort> - : topk_by_radix_sort_base { - __device__ void operator()(uint32_t topk, - uint32_t batch_size, - uint32_t len_x, - const uint32_t* _x, - const IdxT* _in_vals, - uint32_t* _y, - IdxT* _out_vals, - uint32_t* work, - uint32_t* _hints, - bool sort, - uint32_t* _smem) - { - std::uint8_t* const state = reinterpret_cast(work); - topk_cta_11_core::state_bit_lenght, - topk_by_radix_sort_base::vecLen, - 64, - 32, - IdxT>(topk, len_x, _x, _in_vals, _y, _out_vals, state, _hints, sort, _smem); - } -}; - -#define 
TOP_FUNC_PARTIAL_SPECIALIZATION(V) \ - template \ - struct topk_by_radix_sort< \ - MAX_INTERNAL_TOPK, \ - BLOCK_SIZE, \ - IdxT, \ - std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>> \ - : topk_by_radix_sort_base { \ - __device__ void operator()(uint32_t topk, \ - uint32_t batch_size, \ - uint32_t len_x, \ - const uint32_t* _x, \ - const IdxT* _in_vals, \ - uint32_t* _y, \ - IdxT* _out_vals, \ - uint32_t* work, \ - uint32_t* _hints, \ - bool sort, \ - uint32_t* _smem) \ - { \ - assert(BLOCK_SIZE >= V / 4); \ - std::uint8_t* state = (std::uint8_t*)work; \ - topk_cta_11_core::state_bit_lenght, \ - topk_by_radix_sort_base::vecLen, \ - V, \ - V / 4, \ - IdxT>( \ - topk, len_x, _x, _in_vals, _y, _out_vals, state, _hints, sort, _smem); \ - } \ - }; -TOP_FUNC_PARTIAL_SPECIALIZATION(128); -TOP_FUNC_PARTIAL_SPECIALIZATION(256); -TOP_FUNC_PARTIAL_SPECIALIZATION(512); -TOP_FUNC_PARTIAL_SPECIALIZATION(1024); - -template -__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - const std::uint32_t num_itopk) -{ - const unsigned lane_id = threadIdx.x % 32; - const unsigned warp_id = threadIdx.x / 32; - if (MULTI_WARPS == 0) { - if (warp_id > 0) { return; } - constexpr unsigned N = (MAX_CANDIDATES + 31) / 32; - float key[N]; - IdxT val[N]; - /* Candidates -> Reg */ - for (unsigned i = 0; i < N; i++) { - unsigned j = lane_id + (32 * i); - if (j < num_candidates) { - key[i] = candidate_distances[j]; - val[i] = candidate_indices[j]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Sort */ - bitonic::warp_sort(key, val); - /* Reg -> Temp_itopk */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; - if (j < num_candidates && j < num_itopk) { - candidate_distances[device::swizzling(j)] = key[i]; - candidate_indices[device::swizzling(j)] = val[i]; - } - } - } else { - // 
Use two warps (64 threads) - constexpr unsigned max_candidates_per_warp = (MAX_CANDIDATES + 1) / 2; - constexpr unsigned N = (max_candidates_per_warp + 31) / 32; - float key[N]; - IdxT val[N]; - if (warp_id < 2) { - /* Candidates -> Reg */ - for (unsigned i = 0; i < N; i++) { - unsigned jl = lane_id + (32 * i); - unsigned j = jl + (max_candidates_per_warp * warp_id); - if (j < num_candidates) { - key[i] = candidate_distances[j]; - val[i] = candidate_indices[j]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Sort */ - bitonic::warp_sort(key, val); - /* Reg -> Temp_candidates */ - for (unsigned i = 0; i < N; i++) { - unsigned jl = (N * lane_id) + i; - unsigned j = jl + (max_candidates_per_warp * warp_id); - if (j < num_candidates && jl < num_itopk) { - candidate_distances[device::swizzling(j)] = key[i]; - candidate_indices[device::swizzling(j)] = val[i]; - } - } - } - __syncthreads(); - - unsigned num_warps_used = (num_itopk + max_candidates_per_warp - 1) / max_candidates_per_warp; - if (warp_id < num_warps_used) { - /* Temp_candidates -> Reg */ - for (unsigned i = 0; i < N; i++) { - unsigned jl = (N * lane_id) + i; - unsigned kl = max_candidates_per_warp - 1 - jl; - unsigned j = jl + (max_candidates_per_warp * warp_id); - unsigned k = MAX_CANDIDATES - 1 - j; - if (j >= num_candidates || k >= num_candidates || kl >= num_itopk) continue; - float temp_key = candidate_distances[device::swizzling(k)]; - if (key[i] == temp_key) continue; - if ((warp_id == 0) == (key[i] > temp_key)) { - key[i] = temp_key; - val[i] = candidate_indices[device::swizzling(k)]; - } - } - } - if (num_warps_used > 1) { __syncthreads(); } - if (warp_id < num_warps_used) { - /* Merge */ - bitonic::warp_merge(key, val, 32); - /* Reg -> Temp_itopk */ - for (unsigned i = 0; i < N; i++) { - unsigned jl = (N * lane_id) + i; - unsigned j = jl + (max_candidates_per_warp * warp_id); - if (j < num_candidates && j < num_itopk) { - 
candidate_distances[device::swizzling(j)] = key[i]; - candidate_indices[device::swizzling(j)] = val[i]; - } - } - } - if (num_warps_used > 1) { __syncthreads(); } - } -} - -template -__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first) -{ - const unsigned lane_id = threadIdx.x % 32; - const unsigned warp_id = threadIdx.x / 32; - if (MULTI_WARPS == 0) { - if (warp_id > 0) { return; } - constexpr unsigned N = (MAX_ITOPK + 31) / 32; - float key[N]; - IdxT val[N]; - if (first) { - /* Load itopk results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = lane_id + (32 * i); - if (j < num_itopk) { - key[i] = itopk_distances[j]; - val[i] = itopk_indices[j]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Warp Sort */ - bitonic::warp_sort(key, val); - } else { - /* Load itopk results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; - if (j < num_itopk) { - key[i] = itopk_distances[device::swizzling(j)]; - val[i] = itopk_indices[device::swizzling(j)]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - } - /* Merge candidates */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; // [0:MAX_ITOPK-1] - unsigned k = MAX_ITOPK - 1 - j; - if (k >= num_itopk || k >= num_candidates) continue; - float candidate_key = candidate_distances[device::swizzling(k)]; - if (key[i] > candidate_key) { - key[i] = candidate_key; - val[i] = candidate_indices[device::swizzling(k)]; - } - } - /* Warp Merge */ - bitonic::warp_merge(key, val, 32); - /* Store new itopk results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * lane_id) + i; - if (j < num_itopk) { - 
itopk_distances[device::swizzling(j)] = key[i]; - itopk_indices[device::swizzling(j)] = val[i]; - } - } - } else { - // Use two warps (64 threads) or more - constexpr unsigned max_itopk_per_warp = (MAX_ITOPK + 1) / 2; - constexpr unsigned N = (max_itopk_per_warp + 31) / 32; - float key[N]; - IdxT val[N]; - if (first) { - /* Load itop results (not sorted) */ - if (warp_id < 2) { - for (unsigned i = 0; i < N; i++) { - unsigned j = lane_id + (32 * i) + (max_itopk_per_warp * warp_id); - if (j < num_itopk) { - key[i] = itopk_distances[j]; - val[i] = itopk_indices[j]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Warp Sort */ - bitonic::warp_sort(key, val); - /* Store intermedidate results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * threadIdx.x) + i; - if (j >= num_itopk) continue; - itopk_distances[device::swizzling(j)] = key[i]; - itopk_indices[device::swizzling(j)] = val[i]; - } - } - __syncthreads(); - if (warp_id < 2) { - /* Load intermedidate results */ - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * threadIdx.x) + i; - unsigned k = MAX_ITOPK - 1 - j; - if (k >= num_itopk) continue; - float temp_key = itopk_distances[device::swizzling(k)]; - if (key[i] == temp_key) continue; - if ((warp_id == 0) == (key[i] > temp_key)) { - key[i] = temp_key; - val[i] = itopk_indices[device::swizzling(k)]; - } - } - /* Warp Merge */ - bitonic::warp_merge(key, val, 32); - } - __syncthreads(); - /* Store itopk results (sorted) */ - if (warp_id < 2) { - for (unsigned i = 0; i < N; i++) { - unsigned j = (N * threadIdx.x) + i; - if (j >= num_itopk) continue; - itopk_distances[device::swizzling(j)] = key[i]; - itopk_indices[device::swizzling(j)] = val[i]; - } - } - } - const uint32_t num_itopk_div2 = num_itopk / 2; - if (threadIdx.x < 3) { - // work_buf is used to obtain turning points in 1st and 2nd half of itopk afer merge. 
- work_buf[threadIdx.x] = num_itopk_div2; - } - __syncthreads(); - - // Merge candidates (using whole threads) - for (unsigned k = threadIdx.x; k < min(num_candidates, num_itopk); k += blockDim.x) { - const unsigned j = num_itopk - 1 - k; - const float itopk_key = itopk_distances[device::swizzling(j)]; - const float candidate_key = candidate_distances[device::swizzling(k)]; - if (itopk_key > candidate_key) { - itopk_distances[device::swizzling(j)] = candidate_key; - itopk_indices[device::swizzling(j)] = candidate_indices[device::swizzling(k)]; - if (j < num_itopk_div2) { - atomicMin(work_buf + 2, j); - } else { - atomicMin(work_buf + 1, j - num_itopk_div2); - } - } - } - __syncthreads(); - - // Merge 1st and 2nd half of itopk (using whole threads) - for (unsigned j = threadIdx.x; j < num_itopk_div2; j += blockDim.x) { - const unsigned k = j + num_itopk_div2; - float key_0 = itopk_distances[device::swizzling(j)]; - float key_1 = itopk_distances[device::swizzling(k)]; - if (key_0 > key_1) { - itopk_distances[device::swizzling(j)] = key_1; - itopk_distances[device::swizzling(k)] = key_0; - IdxT val_0 = itopk_indices[device::swizzling(j)]; - IdxT val_1 = itopk_indices[device::swizzling(k)]; - itopk_indices[device::swizzling(j)] = val_1; - itopk_indices[device::swizzling(k)] = val_0; - atomicMin(work_buf + 0, j); - } - } - if (threadIdx.x == blockDim.x - 1) { - if (work_buf[2] < num_itopk_div2) { work_buf[1] = work_buf[2]; } - } - __syncthreads(); - // if ((blockIdx.x == 0) && (threadIdx.x == 0)) { - // RAFT_LOG_DEBUG( "work_buf: %u, %u, %u\n", work_buf[0], work_buf[1], work_buf[2] ); - // } - - // Warp-0 merges 1st half of itopk, warp-1 does 2nd half. 
- if (warp_id < 2) { - // Load intermedidate itopk results - const uint32_t turning_point = work_buf[warp_id]; // turning_point <= num_itopk_div2 - for (unsigned i = 0; i < N; i++) { - unsigned k = num_itopk; - unsigned j = (N * lane_id) + i; - if (j < turning_point) { - k = j + (num_itopk_div2 * warp_id); - } else if (j >= (MAX_ITOPK / 2 - num_itopk_div2)) { - j -= (MAX_ITOPK / 2 - num_itopk_div2); - if ((turning_point <= j) && (j < num_itopk_div2)) { k = j + (num_itopk_div2 * warp_id); } - } - if (k < num_itopk) { - key[i] = itopk_distances[device::swizzling(k)]; - val[i] = itopk_indices[device::swizzling(k)]; - } else { - key[i] = utils::get_max_value(); - val[i] = utils::get_max_value(); - } - } - /* Warp Merge */ - bitonic::warp_merge(key, val, 32); - /* Store new itopk results */ - for (unsigned i = 0; i < N; i++) { - const unsigned j = (N * lane_id) + i; - if (j < num_itopk_div2) { - unsigned k = j + (num_itopk_div2 * warp_id); - itopk_distances[device::swizzling(k)] = key[i]; - itopk_indices[device::swizzling(k)] = val[i]; - } - } - } - } -} - -template -__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] - IdxT* itopk_indices, // [num_itopk] - const std::uint32_t num_itopk, - float* candidate_distances, // [num_candidates] - IdxT* candidate_indices, // [num_candidates] - const std::uint32_t num_candidates, - std::uint32_t* work_buf, - const bool first) -{ - // The results in candidate_distances/indices are sorted by bitonic sort. - topk_by_bitonic_sort_1st( - candidate_distances, candidate_indices, num_candidates, num_itopk); - - // The results sorted above are merged with the internal intermediate top-k - // results so far using bitonic merge. 
- topk_by_bitonic_sort_2nd(itopk_distances, - itopk_indices, - num_itopk, - candidate_distances, - candidate_indices, - num_candidates, - work_buf, - first); -} - -template -__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, - const size_t hashmap_bitlen, - const INDEX_T* itopk_indices, - uint32_t itopk_size) -{ - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - - if (threadIdx.x < FIRST_TID || threadIdx.x >= LAST_TID) return; - for (unsigned i = threadIdx.x - FIRST_TID; i < itopk_size; i += LAST_TID - FIRST_TID) { - auto key = itopk_indices[i] & ~index_msb_1_mask; // clear most significant bit - hashmap::insert(hashmap_ptr, hashmap_bitlen, key); - } -} - -template -__device__ inline void set_value_device(T* const ptr, const T fill, const std::uint32_t count) -{ - for (std::uint32_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - ptr[i] = fill; - } -} - -// One query one thread block -template -__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ - void search_kernel(INDEX_T* const result_indices_ptr, // [num_queries, top_k] - DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] - const std::uint32_t top_k, - const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] - const std::size_t dataset_dim, - const std::size_t dataset_size, - const DATA_T* const queries_ptr, // [num_queries, dataset_dim] - const INDEX_T* const knn_graph, // [dataset_size, graph_degree] - const std::uint32_t graph_degree, - const unsigned num_distilation, - const uint64_t rand_xor_mask, - const INDEX_T* seed_ptr, // [num_queries, num_seeds] - const uint32_t num_seeds, - INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] - const std::uint32_t internal_topk, - const std::uint32_t num_parents, - const std::uint32_t min_iteration, - const std::uint32_t max_iteration, - std::uint32_t* const num_executed_iterations, // [num_queries] - const std::uint32_t hash_bitlen, - const std::uint32_t small_hash_bitlen, - const 
std::uint32_t small_hash_reset_interval) -{ - const auto query_id = blockIdx.y; - -#ifdef _CLK_BREAKDOWN - std::uint64_t clk_init = 0; - std::uint64_t clk_compute_1st_distance = 0; - std::uint64_t clk_topk = 0; - std::uint64_t clk_reset_hash = 0; - std::uint64_t clk_pickup_parents = 0; - std::uint64_t clk_restore_hash = 0; - std::uint64_t clk_compute_distance = 0; - std::uint64_t clk_start; -#define _CLK_START() clk_start = clock64() -#define _CLK_REC(V) V += clock64() - clk_start; -#else -#define _CLK_START() -#define _CLK_REC(V) -#endif - _CLK_START(); - - extern __shared__ std::uint32_t smem[]; - - // Layout of result_buffer - // +----------------------+------------------------------+---------+ - // | internal_top_k | neighbors of internal_top_k | padding | - // | | | upto 32 | - // +----------------------+------------------------------+---------+ - // |<--- result_buffer_size --->| - std::uint32_t result_buffer_size = internal_topk + (num_parents * graph_degree); - std::uint32_t result_buffer_size_32 = result_buffer_size; - if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } - const auto small_hash_size = hashmap::get_size(small_hash_bitlen); - auto query_buffer = reinterpret_cast(smem); - auto result_indices_buffer = reinterpret_cast(query_buffer + MAX_DATASET_DIM); - auto result_distances_buffer = - reinterpret_cast(result_indices_buffer + result_buffer_size_32); - auto visited_hash_buffer = - reinterpret_cast(result_distances_buffer + result_buffer_size_32); - auto parent_list_buffer = reinterpret_cast(visited_hash_buffer + small_hash_size); - auto topk_ws = reinterpret_cast(parent_list_buffer + num_parents); - auto terminate_flag = reinterpret_cast(topk_ws + 3); - auto smem_working_ptr = reinterpret_cast(terminate_flag + 1); - - const DATA_T* const query_ptr = queries_ptr + query_id * dataset_dim; - for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) { - unsigned j = device::swizzling(i); - if (i < 
dataset_dim) { - query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); - } else { - query_buffer[j] = 0.0; - } - } - if (threadIdx.x == 0) { - terminate_flag[0] = 0; - topk_ws[0] = ~0u; - } - - // Init hashmap - INDEX_T* local_visited_hashmap_ptr; - if (small_hash_bitlen) { - local_visited_hashmap_ptr = visited_hash_buffer; - } else { - local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id); - } - hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - __syncthreads(); - _CLK_REC(clk_init); - - // compute distance to randomly selecting nodes - _CLK_START(); - const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr; - device::compute_distance_to_random_nodes( - result_indices_buffer, - result_distances_buffer, - query_buffer, - dataset_ptr, - dataset_dim, - dataset_size, - result_buffer_size, - num_distilation, - rand_xor_mask, - local_seed_ptr, - num_seeds, - local_visited_hashmap_ptr, - hash_bitlen); - __syncthreads(); - _CLK_REC(clk_compute_1st_distance); - - std::uint32_t iter = 0; - while (1) { - // sort - if (TOPK_BY_BITONIC_SORT) { - // [Notice] - // It is good to use multiple warps in topk_by_bitonic_sort() when - // batch size is small (short-latency), but it might not be always good - // when batch size is large (high-throughput). - // topk_by_bitonic_sort() consists of two operations: - // if MAX_CANDIDATES is greater than 128, the first operation uses two warps; - // if MAX_ITOPK is greater than 256, the second operation used two warps. - constexpr unsigned multi_warps_1 = ((BLOCK_SIZE >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; - constexpr unsigned multi_warps_2 = ((BLOCK_SIZE >= 64) && (MAX_ITOPK > 256)) ? 1 : 0; - - // reset small-hash table. 
- if ((iter + 1) % small_hash_reset_interval == 0) { - // Depending on the block size and the number of warps used in - // topk_by_bitonic_sort(), determine which warps are used to reset - // the small hash and whether they are performed in overlap with - // topk_by_bitonic_sort(). - _CLK_START(); - if (BLOCK_SIZE == 32) { - hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - } else if (BLOCK_SIZE == 64) { - if (multi_warps_1 || multi_warps_2) { - hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - } else { - hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - } - } else { - if (multi_warps_1 || multi_warps_2) { - hashmap::init<64, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - } else { - hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - } - } - _CLK_REC(clk_reset_hash); - } - - // topk with bitonic sort - _CLK_START(); - topk_by_bitonic_sort( - result_distances_buffer, - result_indices_buffer, - internal_topk, - result_distances_buffer + internal_topk, - result_indices_buffer + internal_topk, - num_parents * graph_degree, - topk_ws, - (iter == 0)); - _CLK_REC(clk_topk); - - } else { - _CLK_START(); - // topk with radix block sort - topk_by_radix_sort{}( - internal_topk, - gridDim.x, - result_buffer_size, - reinterpret_cast(result_distances_buffer), - result_indices_buffer, - reinterpret_cast(result_distances_buffer), - result_indices_buffer, - nullptr, - topk_ws, - true, - reinterpret_cast(smem_working_ptr)); - _CLK_REC(clk_topk); - - // reset small-hash table - if ((iter + 1) % small_hash_reset_interval == 0) { - _CLK_START(); - hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); - _CLK_REC(clk_reset_hash); - } - } - __syncthreads(); - - if (iter + 1 == max_iteration) { break; } - - // pick up next parents - if (threadIdx.x < 32) { - _CLK_START(); - pickup_next_parents(terminate_flag, - parent_list_buffer, - result_indices_buffer, - internal_topk, - 
dataset_size, - num_parents); - _CLK_REC(clk_pickup_parents); - } - - // restore small-hash table by putting internal-topk indices in it - if ((iter + 1) % small_hash_reset_interval == 0) { - constexpr unsigned first_tid = ((BLOCK_SIZE <= 32) ? 0 : 32); - _CLK_START(); - hashmap_restore( - local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk); - _CLK_REC(clk_restore_hash); - } - __syncthreads(); - - if (*terminate_flag && iter >= min_iteration) { break; } - - // compute the norms between child nodes and query node - _CLK_START(); - constexpr unsigned max_n_frags = 16; - device:: - compute_distance_to_child_nodes( - result_indices_buffer + internal_topk, - result_distances_buffer + internal_topk, - query_buffer, - dataset_ptr, - dataset_dim, - knn_graph, - graph_degree, - local_visited_hashmap_ptr, - hash_bitlen, - parent_list_buffer, - num_parents); - __syncthreads(); - _CLK_REC(clk_compute_distance); - - iter++; - } - for (std::uint32_t i = threadIdx.x; i < top_k; i += BLOCK_SIZE) { - unsigned j = i + (top_k * query_id); - unsigned ii = i; - if (TOPK_BY_BITONIC_SORT) { ii = device::swizzling(i); } - if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[ii]; } - - constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; - - result_indices_ptr[j] = - result_indices_buffer[ii] & ~index_msb_1_mask; // clear most significant bit - } - if (threadIdx.x == 0 && num_executed_iterations != nullptr) { - num_executed_iterations[query_id] = iter + 1; - } -#ifdef _CLK_BREAKDOWN - if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && ((query_id * 3) % gridDim.y < 3)) { - RAFT_LOG_DEBUG( - "query, %d, thread, %d" - ", init, %d" - ", 1st_distance, %lu" - ", topk, %lu" - ", reset_hash, %lu" - ", pickup_parents, %lu" - ", restore_hash, %lu" - ", distance, %lu" - "\n", - query_id, - threadIdx.x, - clk_init, - clk_compute_1st_distance, - clk_topk, - clk_reset_hash, - clk_pickup_parents, - clk_restore_hash, 
- clk_compute_distance); - } -#endif -} - -#define SET_KERNEL_3( \ - BLOCK_SIZE, BLOCK_COUNT, MAX_ITOPK, MAX_CANDIDATES, TOPK_BY_BITONIC_SORT, LOAD_T) \ - kernel = search_kernel; - -#define SET_KERNEL_2(BLOCK_SIZE, BLOCK_COUNT, MAX_ITOPK, MAX_CANDIDATES, TOPK_BY_BITONIC_SORT) \ - if (load_bit_length == 128) { \ - SET_KERNEL_3(BLOCK_SIZE, \ - BLOCK_COUNT, \ - MAX_ITOPK, \ - MAX_CANDIDATES, \ - TOPK_BY_BITONIC_SORT, \ - device::LOAD_128BIT_T) \ - } else if (load_bit_length == 64) { \ - SET_KERNEL_3(BLOCK_SIZE, \ - BLOCK_COUNT, \ - MAX_ITOPK, \ - MAX_CANDIDATES, \ - TOPK_BY_BITONIC_SORT, \ - device::LOAD_64BIT_T) \ - } - -#define SET_KERNEL_1B(MAX_ITOPK, MAX_CANDIDATES) \ - /* if ( block_size == 32 ) { \ - SET_KERNEL_2( 32, 20, MAX_ITOPK, MAX_CANDIDATES, 1 ) \ - } else */ \ - if (block_size == 64) { \ - SET_KERNEL_2(64, 16 /*20*/, MAX_ITOPK, MAX_CANDIDATES, 1) \ - } else if (block_size == 128) { \ - SET_KERNEL_2(128, 8, MAX_ITOPK, MAX_CANDIDATES, 1) \ - } else if (block_size == 256) { \ - SET_KERNEL_2(256, 4, MAX_ITOPK, MAX_CANDIDATES, 1) \ - } else if (block_size == 512) { \ - SET_KERNEL_2(512, 2, MAX_ITOPK, MAX_CANDIDATES, 1) \ - } else { \ - SET_KERNEL_2(1024, 1, MAX_ITOPK, MAX_CANDIDATES, 1) \ - } - -#define SET_KERNEL_1R(MAX_ITOPK, MAX_CANDIDATES) \ - if (block_size == 256) { \ - SET_KERNEL_2(256, 4, MAX_ITOPK, MAX_CANDIDATES, 0) \ - } else if (block_size == 512) { \ - SET_KERNEL_2(512, 2, MAX_ITOPK, MAX_CANDIDATES, 0) \ - } else { \ - SET_KERNEL_2(1024, 1, MAX_ITOPK, MAX_CANDIDATES, 0) \ - } - -#define SET_KERNEL \ - typedef void (*search_kernel_t)(INDEX_T* const result_indices_ptr, \ - DISTANCE_T* const result_distances_ptr, \ - const std::uint32_t top_k, \ - const DATA_T* const dataset_ptr, \ - const std::size_t dataset_dim, \ - const std::size_t dataset_size, \ - const DATA_T* const queries_ptr, \ - const INDEX_T* const knn_graph, \ - const std::uint32_t graph_degree, \ - const unsigned num_distilation, \ - const uint64_t rand_xor_mask, \ - const INDEX_T* 
seed_ptr, \ - const uint32_t num_seeds, \ - INDEX_T* const visited_hashmap_ptr, \ - const std::uint32_t itopk_size, \ - const std::uint32_t num_parents, \ - const std::uint32_t min_iteration, \ - const std::uint32_t max_iteration, \ - std::uint32_t* const num_executed_iterations, \ - const std::uint32_t hash_bitlen, \ - const std::uint32_t small_hash_bitlen, \ - const std::uint32_t small_hash_reset_interval); \ - search_kernel_t kernel; \ - if (num_itopk_candidates <= 64) { \ - constexpr unsigned max_candidates = 64; \ - if (itopk_size <= 64) { \ - SET_KERNEL_1B(64, max_candidates) \ - } else if (itopk_size <= 128) { \ - SET_KERNEL_1B(128, max_candidates) \ - } else if (itopk_size <= 256) { \ - SET_KERNEL_1B(256, max_candidates) \ - } else if (itopk_size <= 512) { \ - SET_KERNEL_1B(512, max_candidates) \ - } \ - } else if (num_itopk_candidates <= 128) { \ - constexpr unsigned max_candidates = 128; \ - if (itopk_size <= 64) { \ - SET_KERNEL_1B(64, max_candidates) \ - } else if (itopk_size <= 128) { \ - SET_KERNEL_1B(128, max_candidates) \ - } else if (itopk_size <= 256) { \ - SET_KERNEL_1B(256, max_candidates) \ - } else if (itopk_size <= 512) { \ - SET_KERNEL_1B(512, max_candidates) \ - } \ - } else if (num_itopk_candidates <= 256) { \ - constexpr unsigned max_candidates = 256; \ - if (itopk_size <= 64) { \ - SET_KERNEL_1B(64, max_candidates) \ - } else if (itopk_size <= 128) { \ - SET_KERNEL_1B(128, max_candidates) \ - } else if (itopk_size <= 256) { \ - SET_KERNEL_1B(256, max_candidates) \ - } else if (itopk_size <= 512) { \ - SET_KERNEL_1B(512, max_candidates) \ - } \ - } else { \ - /* Radix-based topk is used */ \ - if (itopk_size <= 256) { \ - SET_KERNEL_1R(256, /*to avoid build failure*/ 32) \ - } else if (itopk_size <= 512) { \ - SET_KERNEL_1R(512, /*to avoid build failure*/ 32) \ - } \ - } - template { using search_plan_impl::itopk_size; using search_plan_impl::algo; using search_plan_impl::team_size; - using search_plan_impl::num_parents; + using 
search_plan_impl::search_width; using search_plan_impl::min_iterations; using search_plan_impl::max_iterations; - using search_plan_impl::load_bit_length; using search_plan_impl::thread_block_size; using search_plan_impl::hashmap_mode; using search_plan_impl::hashmap_min_bitlen; @@ -965,7 +79,6 @@ struct search : search_plan_impl { using search_plan_impl::result_buffer_size; using search_plan_impl::smem_size; - using search_plan_impl::load_bit_lenght; using search_plan_impl::hashmap; using search_plan_impl::num_executed_iterations; @@ -988,7 +101,7 @@ struct search : search_plan_impl { inline void set_params(raft::resources const& res) { - num_itopk_candidates = num_parents * graph_degree; + num_itopk_candidates = search_width * graph_degree; result_buffer_size = itopk_size + num_itopk_candidates; typedef raft::Pow2<32> AlignBytes; @@ -1009,7 +122,7 @@ struct search : search_plan_impl { const std::uint32_t topk_ws_size = 3; const std::uint32_t base_smem_size = sizeof(float) * max_dim + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + - sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * num_parents + + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); smem_size = base_smem_size; if (num_itopk_candidates > 256) { @@ -1052,7 +165,7 @@ struct search : search_plan_impl { cudaDeviceProp deviceProp = resource::get_device_properties(res); RAFT_LOG_DEBUG("# multiProcessorCount: %d", deviceProp.multiProcessorCount); while ((block_size < max_block_size) && - (graph_degree * num_parents * team_size >= block_size * 2) && + (graph_degree * search_width * team_size >= block_size * 2) && (max_queries <= (1024 / (block_size * 2)) * deviceProp.multiProcessorCount)) { block_size *= 2; } @@ -1066,22 +179,6 @@ struct search : search_plan_impl { max_block_size); thread_block_size = block_size; - // Determine load bit length - const uint32_t 
total_bit_length = dim * sizeof(DATA_T) * 8; - if (load_bit_length == 0) { - load_bit_length = 128; - while (total_bit_length % load_bit_length) { - load_bit_length /= 2; - } - } - RAFT_LOG_DEBUG("# load_bit_length: %u (%u loads per vector)", - load_bit_length, - total_bit_length / load_bit_length); - RAFT_EXPECTS(total_bit_length % load_bit_length == 0, - "load_bit_length must be a divisor of dim*sizeof(data_t)*8=%u", - total_bit_length); - RAFT_EXPECTS(load_bit_length >= 64, "load_bit_lenght cannot be less than 64"); - if (num_itopk_candidates <= 256) { RAFT_LOG_DEBUG("# bitonic-sort based topk routine is used"); } else { @@ -1129,8 +226,8 @@ struct search : search_plan_impl { } void operator()(raft::resources const& res, - raft::device_matrix_view dataset, - raft::device_matrix_view graph, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, INDEX_T* const result_indices_ptr, // [num_queries, topk] DISTANCE_T* const result_distances_ptr, // [num_queries, topk] const DATA_T* const queries_ptr, // [num_queries, dataset_dim] @@ -1140,39 +237,33 @@ struct search : search_plan_impl { uint32_t topk) { cudaStream_t stream = resource::get_cuda_stream(res); - uint32_t block_size = thread_block_size; - SET_KERNEL; - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); - dim3 thread_dims(block_size, 1, 1); - dim3 block_dims(1, num_queries, 1); - RAFT_LOG_DEBUG( - "Launching kernel with %u threads, %u block %lu smem", block_size, num_queries, smem_size); - kernel<<>>(result_indices_ptr, - result_distances_ptr, - topk, - dataset.data_handle(), - dataset.extent(1), - dataset.extent(0), - queries_ptr, - graph.data_handle(), - graph.extent(1), - num_random_samplings, - rand_xor_mask, - dev_seed_ptr, - num_seeds, - hashmap.data(), - itopk_size, - num_parents, - min_iterations, - max_iterations, - num_executed_iterations, - hash_bitlen, - small_hash_bitlen, - small_hash_reset_interval); - 
RAFT_CUDA_TRY(cudaPeekAtLastError()); + select_and_run( + dataset, + graph, + result_indices_ptr, + result_distances_ptr, + queries_ptr, + num_queries, + dev_seed_ptr, + num_executed_iterations, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + hashmap.data(), + small_hash_bitlen, + small_hash_reset_interval, + num_random_samplings, + rand_xor_mask, + num_seeds, + itopk_size, + search_width, + min_iterations, + max_iterations, + stream); } }; } // namespace single_cta_search -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh new file mode 100644 index 0000000000..f7c43fe11c --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include // RAFT_EXPLICIT +namespace raft::neighbors::cagra::detail { +namespace single_cta_search { + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +template +void select_and_run( // raft::resources const& res, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* const num_executed_iterations, // [num_queries,] + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + INDEX_T* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + cudaStream_t stream) RAFT_EXPLICIT; + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + extern template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + 
+instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float); +instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float); +instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float); +instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float); +instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float); +instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float); +instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float); +instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float); +instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float); +instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float); +instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float); +instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace single_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh new file mode 100644 index 0000000000..31d9c9fffa --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bitonic.hpp" +#include "compute_distance.hpp" +#include "device_common.hpp" +#include "hashmap.hpp" +#include "search_plan.cuh" +#include "topk_by_radix.cuh" +#include "topk_for_cagra/topk_core.cuh" // TODO replace with raft topk +#include "utils.hpp" +#include +#include +#include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp + +namespace raft::neighbors::cagra::detail { +namespace single_cta_search { + +// #define _CLK_BREAKDOWN + +template +__device__ void pickup_next_parents(std::uint32_t* const terminate_flag, + INDEX_T* const next_parent_indices, + INDEX_T* const internal_topk_indices, + const std::size_t internal_topk_size, + const std::size_t dataset_size, + const std::uint32_t search_width) +{ + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + // if (threadIdx.x >= 32) return; + + for (std::uint32_t i = threadIdx.x; i < search_width; i += 32) { + next_parent_indices[i] = utils::get_max_value(); + } + std::uint32_t itopk_max = internal_topk_size; + if (itopk_max % 32) { itopk_max += 32 - (itopk_max % 32); } + std::uint32_t num_new_parents = 0; + for (std::uint32_t j = threadIdx.x; j < itopk_max; j += 32) { + std::uint32_t jj = j; + if (TOPK_BY_BITONIC_SORT) { jj = device::swizzling(j); } + INDEX_T index; + int new_parent = 0; + if (j < internal_topk_size) { + index = internal_topk_indices[jj]; + if ((index & index_msb_1_mask) == 0) { // check if most significant bit is set + new_parent = 1; + } + } + const std::uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent); + if (new_parent) { + const auto i = __popc(ballot_mask & ((1 << threadIdx.x) - 1)) + num_new_parents; + if (i < search_width) { + next_parent_indices[i] = index; + // set most significant bit as used node + internal_topk_indices[jj] |= index_msb_1_mask; 
+ } + } + num_new_parents += __popc(ballot_mask); + if (num_new_parents >= search_width) { break; } + } + if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; } +} + +template +__device__ inline void topk_by_bitonic_sort_1st(float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + const std::uint32_t num_itopk) +{ + const unsigned lane_id = threadIdx.x % 32; + const unsigned warp_id = threadIdx.x / 32; + if (MULTI_WARPS == 0) { + if (warp_id > 0) { return; } + constexpr unsigned N = (MAX_CANDIDATES + 31) / 32; + float key[N]; + IdxT val[N]; + /* Candidates -> Reg */ + for (unsigned i = 0; i < N; i++) { + unsigned j = lane_id + (32 * i); + if (j < num_candidates) { + key[i] = candidate_distances[j]; + val[i] = candidate_indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Sort */ + bitonic::warp_sort(key, val); + /* Reg -> Temp_itopk */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; + if (j < num_candidates && j < num_itopk) { + candidate_distances[device::swizzling(j)] = key[i]; + candidate_indices[device::swizzling(j)] = val[i]; + } + } + } else { + // Use two warps (64 threads) + constexpr unsigned max_candidates_per_warp = (MAX_CANDIDATES + 1) / 2; + constexpr unsigned N = (max_candidates_per_warp + 31) / 32; + float key[N]; + IdxT val[N]; + if (warp_id < 2) { + /* Candidates -> Reg */ + for (unsigned i = 0; i < N; i++) { + unsigned jl = lane_id + (32 * i); + unsigned j = jl + (max_candidates_per_warp * warp_id); + if (j < num_candidates) { + key[i] = candidate_distances[j]; + val[i] = candidate_indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Sort */ + bitonic::warp_sort(key, val); + /* Reg -> Temp_candidates */ + for (unsigned i = 0; i < N; i++) { + unsigned jl = (N * lane_id) + i; + unsigned j = jl + (max_candidates_per_warp * 
warp_id); + if (j < num_candidates && jl < num_itopk) { + candidate_distances[device::swizzling(j)] = key[i]; + candidate_indices[device::swizzling(j)] = val[i]; + } + } + } + __syncthreads(); + + unsigned num_warps_used = (num_itopk + max_candidates_per_warp - 1) / max_candidates_per_warp; + if (warp_id < num_warps_used) { + /* Temp_candidates -> Reg */ + for (unsigned i = 0; i < N; i++) { + unsigned jl = (N * lane_id) + i; + unsigned kl = max_candidates_per_warp - 1 - jl; + unsigned j = jl + (max_candidates_per_warp * warp_id); + unsigned k = MAX_CANDIDATES - 1 - j; + if (j >= num_candidates || k >= num_candidates || kl >= num_itopk) continue; + float temp_key = candidate_distances[device::swizzling(k)]; + if (key[i] == temp_key) continue; + if ((warp_id == 0) == (key[i] > temp_key)) { + key[i] = temp_key; + val[i] = candidate_indices[device::swizzling(k)]; + } + } + } + if (num_warps_used > 1) { __syncthreads(); } + if (warp_id < num_warps_used) { + /* Merge */ + bitonic::warp_merge(key, val, 32); + /* Reg -> Temp_itopk */ + for (unsigned i = 0; i < N; i++) { + unsigned jl = (N * lane_id) + i; + unsigned j = jl + (max_candidates_per_warp * warp_id); + if (j < num_candidates && j < num_itopk) { + candidate_distances[device::swizzling(j)] = key[i]; + candidate_indices[device::swizzling(j)] = val[i]; + } + } + } + if (num_warps_used > 1) { __syncthreads(); } + } +} + +template +__device__ inline void topk_by_bitonic_sort_2nd(float* itopk_distances, // [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first) +{ + const unsigned lane_id = threadIdx.x % 32; + const unsigned warp_id = threadIdx.x / 32; + if (MULTI_WARPS == 0) { + if (warp_id > 0) { return; } + constexpr unsigned N = (MAX_ITOPK + 31) / 32; + float key[N]; + IdxT val[N]; + if (first) { + /* Load 
itopk results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = lane_id + (32 * i); + if (j < num_itopk) { + key[i] = itopk_distances[j]; + val[i] = itopk_indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Warp Sort */ + bitonic::warp_sort(key, val); + } else { + /* Load itopk results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; + if (j < num_itopk) { + key[i] = itopk_distances[device::swizzling(j)]; + val[i] = itopk_indices[device::swizzling(j)]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + } + /* Merge candidates */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; // [0:MAX_ITOPK-1] + unsigned k = MAX_ITOPK - 1 - j; + if (k >= num_itopk || k >= num_candidates) continue; + float candidate_key = candidate_distances[device::swizzling(k)]; + if (key[i] > candidate_key) { + key[i] = candidate_key; + val[i] = candidate_indices[device::swizzling(k)]; + } + } + /* Warp Merge */ + bitonic::warp_merge(key, val, 32); + /* Store new itopk results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * lane_id) + i; + if (j < num_itopk) { + itopk_distances[device::swizzling(j)] = key[i]; + itopk_indices[device::swizzling(j)] = val[i]; + } + } + } else { + // Use two warps (64 threads) or more + constexpr unsigned max_itopk_per_warp = (MAX_ITOPK + 1) / 2; + constexpr unsigned N = (max_itopk_per_warp + 31) / 32; + float key[N]; + IdxT val[N]; + if (first) { + /* Load itop results (not sorted) */ + if (warp_id < 2) { + for (unsigned i = 0; i < N; i++) { + unsigned j = lane_id + (32 * i) + (max_itopk_per_warp * warp_id); + if (j < num_itopk) { + key[i] = itopk_distances[j]; + val[i] = itopk_indices[j]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Warp Sort */ + bitonic::warp_sort(key, val); + /* Store intermedidate results */ + for (unsigned i = 0; i < N; i++) { + unsigned 
j = (N * threadIdx.x) + i; + if (j >= num_itopk) continue; + itopk_distances[device::swizzling(j)] = key[i]; + itopk_indices[device::swizzling(j)] = val[i]; + } + } + __syncthreads(); + if (warp_id < 2) { + /* Load intermedidate results */ + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * threadIdx.x) + i; + unsigned k = MAX_ITOPK - 1 - j; + if (k >= num_itopk) continue; + float temp_key = itopk_distances[device::swizzling(k)]; + if (key[i] == temp_key) continue; + if ((warp_id == 0) == (key[i] > temp_key)) { + key[i] = temp_key; + val[i] = itopk_indices[device::swizzling(k)]; + } + } + /* Warp Merge */ + bitonic::warp_merge(key, val, 32); + } + __syncthreads(); + /* Store itopk results (sorted) */ + if (warp_id < 2) { + for (unsigned i = 0; i < N; i++) { + unsigned j = (N * threadIdx.x) + i; + if (j >= num_itopk) continue; + itopk_distances[device::swizzling(j)] = key[i]; + itopk_indices[device::swizzling(j)] = val[i]; + } + } + } + const uint32_t num_itopk_div2 = num_itopk / 2; + if (threadIdx.x < 3) { + // work_buf is used to obtain turning points in 1st and 2nd half of itopk afer merge. 
+ work_buf[threadIdx.x] = num_itopk_div2; + } + __syncthreads(); + + // Merge candidates (using whole threads) + for (unsigned k = threadIdx.x; k < min(num_candidates, num_itopk); k += blockDim.x) { + const unsigned j = num_itopk - 1 - k; + const float itopk_key = itopk_distances[device::swizzling(j)]; + const float candidate_key = candidate_distances[device::swizzling(k)]; + if (itopk_key > candidate_key) { + itopk_distances[device::swizzling(j)] = candidate_key; + itopk_indices[device::swizzling(j)] = candidate_indices[device::swizzling(k)]; + if (j < num_itopk_div2) { + atomicMin(work_buf + 2, j); + } else { + atomicMin(work_buf + 1, j - num_itopk_div2); + } + } + } + __syncthreads(); + + // Merge 1st and 2nd half of itopk (using whole threads) + for (unsigned j = threadIdx.x; j < num_itopk_div2; j += blockDim.x) { + const unsigned k = j + num_itopk_div2; + float key_0 = itopk_distances[device::swizzling(j)]; + float key_1 = itopk_distances[device::swizzling(k)]; + if (key_0 > key_1) { + itopk_distances[device::swizzling(j)] = key_1; + itopk_distances[device::swizzling(k)] = key_0; + IdxT val_0 = itopk_indices[device::swizzling(j)]; + IdxT val_1 = itopk_indices[device::swizzling(k)]; + itopk_indices[device::swizzling(j)] = val_1; + itopk_indices[device::swizzling(k)] = val_0; + atomicMin(work_buf + 0, j); + } + } + if (threadIdx.x == blockDim.x - 1) { + if (work_buf[2] < num_itopk_div2) { work_buf[1] = work_buf[2]; } + } + __syncthreads(); + // if ((blockIdx.x == 0) && (threadIdx.x == 0)) { + // RAFT_LOG_DEBUG( "work_buf: %u, %u, %u\n", work_buf[0], work_buf[1], work_buf[2] ); + // } + + // Warp-0 merges 1st half of itopk, warp-1 does 2nd half. 
+ if (warp_id < 2) { + // Load intermedidate itopk results + const uint32_t turning_point = work_buf[warp_id]; // turning_point <= num_itopk_div2 + for (unsigned i = 0; i < N; i++) { + unsigned k = num_itopk; + unsigned j = (N * lane_id) + i; + if (j < turning_point) { + k = j + (num_itopk_div2 * warp_id); + } else if (j >= (MAX_ITOPK / 2 - num_itopk_div2)) { + j -= (MAX_ITOPK / 2 - num_itopk_div2); + if ((turning_point <= j) && (j < num_itopk_div2)) { k = j + (num_itopk_div2 * warp_id); } + } + if (k < num_itopk) { + key[i] = itopk_distances[device::swizzling(k)]; + val[i] = itopk_indices[device::swizzling(k)]; + } else { + key[i] = utils::get_max_value(); + val[i] = utils::get_max_value(); + } + } + /* Warp Merge */ + bitonic::warp_merge(key, val, 32); + /* Store new itopk results */ + for (unsigned i = 0; i < N; i++) { + const unsigned j = (N * lane_id) + i; + if (j < num_itopk_div2) { + unsigned k = j + (num_itopk_div2 * warp_id); + itopk_distances[device::swizzling(k)] = key[i]; + itopk_indices[device::swizzling(k)] = val[i]; + } + } + } + } +} + +template +__device__ void topk_by_bitonic_sort(float* itopk_distances, // [num_itopk] + IdxT* itopk_indices, // [num_itopk] + const std::uint32_t num_itopk, + float* candidate_distances, // [num_candidates] + IdxT* candidate_indices, // [num_candidates] + const std::uint32_t num_candidates, + std::uint32_t* work_buf, + const bool first) +{ + // The results in candidate_distances/indices are sorted by bitonic sort. + topk_by_bitonic_sort_1st( + candidate_distances, candidate_indices, num_candidates, num_itopk); + + // The results sorted above are merged with the internal intermediate top-k + // results so far using bitonic merge. 
+ topk_by_bitonic_sort_2nd(itopk_distances, + itopk_indices, + num_itopk, + candidate_distances, + candidate_indices, + num_candidates, + work_buf, + first); +} + +template +__device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, + const size_t hashmap_bitlen, + const INDEX_T* itopk_indices, + uint32_t itopk_size) +{ + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + if (threadIdx.x < FIRST_TID || threadIdx.x >= LAST_TID) return; + for (unsigned i = threadIdx.x - FIRST_TID; i < itopk_size; i += LAST_TID - FIRST_TID) { + auto key = itopk_indices[i] & ~index_msb_1_mask; // clear most significant bit + hashmap::insert(hashmap_ptr, hashmap_bitlen, key); + } +} + +template +__device__ inline void set_value_device(T* const ptr, const T fill, const std::uint32_t count) +{ + for (std::uint32_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + ptr[i] = fill; + } +} + +// One query one thread block +template +__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ + void search_kernel(INDEX_T* const result_indices_ptr, // [num_queries, top_k] + DISTANCE_T* const result_distances_ptr, // [num_queries, top_k] + const std::uint32_t top_k, + const DATA_T* const dataset_ptr, // [dataset_size, dataset_dim] + const std::size_t dataset_dim, + const std::size_t dataset_size, + const std::size_t dataset_ld, // stride of dataset + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const std::uint32_t graph_degree, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const INDEX_T* seed_ptr, // [num_queries, num_seeds] + const uint32_t num_seeds, + INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, // [num_queries] + const std::uint32_t hash_bitlen, 
+ const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval) +{ + using LOAD_T = device::LOAD_128BIT_T; + const auto query_id = blockIdx.y; + +#ifdef _CLK_BREAKDOWN + std::uint64_t clk_init = 0; + std::uint64_t clk_compute_1st_distance = 0; + std::uint64_t clk_topk = 0; + std::uint64_t clk_reset_hash = 0; + std::uint64_t clk_pickup_parents = 0; + std::uint64_t clk_restore_hash = 0; + std::uint64_t clk_compute_distance = 0; + std::uint64_t clk_start; +#define _CLK_START() clk_start = clock64() +#define _CLK_REC(V) V += clock64() - clk_start; +#else +#define _CLK_START() +#define _CLK_REC(V) +#endif + _CLK_START(); + + extern __shared__ std::uint32_t smem[]; + + // Layout of result_buffer + // +----------------------+------------------------------+---------+ + // | internal_top_k | neighbors of internal_top_k | padding | + // | | | upto 32 | + // +----------------------+------------------------------+---------+ + // |<--- result_buffer_size --->| + std::uint32_t result_buffer_size = internal_topk + (search_width * graph_degree); + std::uint32_t result_buffer_size_32 = result_buffer_size; + if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); } + const auto small_hash_size = hashmap::get_size(small_hash_bitlen); + auto query_buffer = reinterpret_cast(smem); + auto result_indices_buffer = reinterpret_cast(query_buffer + MAX_DATASET_DIM); + auto result_distances_buffer = + reinterpret_cast(result_indices_buffer + result_buffer_size_32); + auto visited_hash_buffer = + reinterpret_cast(result_distances_buffer + result_buffer_size_32); + auto parent_list_buffer = reinterpret_cast(visited_hash_buffer + small_hash_size); + auto topk_ws = reinterpret_cast(parent_list_buffer + search_width); + auto terminate_flag = reinterpret_cast(topk_ws + 3); + auto smem_working_ptr = reinterpret_cast(terminate_flag + 1); + + const DATA_T* const query_ptr = queries_ptr + query_id * dataset_dim; + for (unsigned i = threadIdx.x; 
i < MAX_DATASET_DIM; i += BLOCK_SIZE) { + unsigned j = device::swizzling(i); + if (i < dataset_dim) { + query_buffer[j] = spatial::knn::detail::utils::mapping{}(query_ptr[i]); + } else { + query_buffer[j] = 0.0; + } + } + if (threadIdx.x == 0) { + terminate_flag[0] = 0; + topk_ws[0] = ~0u; + } + + // Init hashmap + INDEX_T* local_visited_hashmap_ptr; + if (small_hash_bitlen) { + local_visited_hashmap_ptr = visited_hash_buffer; + } else { + local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id); + } + hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + __syncthreads(); + _CLK_REC(clk_init); + + // compute distance to randomly selecting nodes + _CLK_START(); + const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr; + device::compute_distance_to_random_nodes( + result_indices_buffer, + result_distances_buffer, + query_buffer, + dataset_ptr, + dataset_dim, + dataset_size, + dataset_ld, + result_buffer_size, + num_distilation, + rand_xor_mask, + local_seed_ptr, + num_seeds, + local_visited_hashmap_ptr, + hash_bitlen); + __syncthreads(); + _CLK_REC(clk_compute_1st_distance); + + std::uint32_t iter = 0; + while (1) { + // sort + if (TOPK_BY_BITONIC_SORT) { + // [Notice] + // It is good to use multiple warps in topk_by_bitonic_sort() when + // batch size is small (short-latency), but it might not be always good + // when batch size is large (high-throughput). + // topk_by_bitonic_sort() consists of two operations: + // if MAX_CANDIDATES is greater than 128, the first operation uses two warps; + // if MAX_ITOPK is greater than 256, the second operation used two warps. + constexpr unsigned multi_warps_1 = ((BLOCK_SIZE >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; + constexpr unsigned multi_warps_2 = ((BLOCK_SIZE >= 64) && (MAX_ITOPK > 256)) ? 1 : 0; + + // reset small-hash table. 
+ if ((iter + 1) % small_hash_reset_interval == 0) { + // Depending on the block size and the number of warps used in + // topk_by_bitonic_sort(), determine which warps are used to reset + // the small hash and whether they are performed in overlap with + // topk_by_bitonic_sort(). + _CLK_START(); + if (BLOCK_SIZE == 32) { + hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + } else if (BLOCK_SIZE == 64) { + if (multi_warps_1 || multi_warps_2) { + hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + } else { + hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + } + } else { + if (multi_warps_1 || multi_warps_2) { + hashmap::init<64, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + } else { + hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + } + } + _CLK_REC(clk_reset_hash); + } + + // topk with bitonic sort + _CLK_START(); + topk_by_bitonic_sort( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0)); + _CLK_REC(clk_topk); + + } else { + _CLK_START(); + // topk with radix block sort + topk_by_radix_sort{}( + internal_topk, + gridDim.x, + result_buffer_size, + reinterpret_cast(result_distances_buffer), + result_indices_buffer, + reinterpret_cast(result_distances_buffer), + result_indices_buffer, + nullptr, + topk_ws, + true, + reinterpret_cast(smem_working_ptr)); + _CLK_REC(clk_topk); + + // reset small-hash table + if ((iter + 1) % small_hash_reset_interval == 0) { + _CLK_START(); + hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen); + _CLK_REC(clk_reset_hash); + } + } + __syncthreads(); + + if (iter + 1 == max_iteration) { break; } + + // pick up next parents + if (threadIdx.x < 32) { + _CLK_START(); + pickup_next_parents(terminate_flag, + parent_list_buffer, + result_indices_buffer, + internal_topk, + 
dataset_size, + search_width); + _CLK_REC(clk_pickup_parents); + } + + // restore small-hash table by putting internal-topk indices in it + if ((iter + 1) % small_hash_reset_interval == 0) { + constexpr unsigned first_tid = ((BLOCK_SIZE <= 32) ? 0 : 32); + _CLK_START(); + hashmap_restore( + local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk); + _CLK_REC(clk_restore_hash); + } + __syncthreads(); + + if (*terminate_flag && iter >= min_iteration) { break; } + + // compute the norms between child nodes and query node + _CLK_START(); + constexpr unsigned max_n_frags = 16; + device:: + compute_distance_to_child_nodes( + result_indices_buffer + internal_topk, + result_distances_buffer + internal_topk, + query_buffer, + dataset_ptr, + dataset_dim, + dataset_ld, + knn_graph, + graph_degree, + local_visited_hashmap_ptr, + hash_bitlen, + parent_list_buffer, + search_width); + __syncthreads(); + _CLK_REC(clk_compute_distance); + + iter++; + } + for (std::uint32_t i = threadIdx.x; i < top_k; i += BLOCK_SIZE) { + unsigned j = i + (top_k * query_id); + unsigned ii = i; + if (TOPK_BY_BITONIC_SORT) { ii = device::swizzling(i); } + if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[ii]; } + constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask::value; + + result_indices_ptr[j] = + result_indices_buffer[ii] & ~index_msb_1_mask; // clear most significant bit + } + if (threadIdx.x == 0 && num_executed_iterations != nullptr) { + num_executed_iterations[query_id] = iter + 1; + } +#ifdef _CLK_BREAKDOWN + if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && ((query_id * 3) % gridDim.y < 3)) { + RAFT_LOG_DEBUG( + "query, %d, thread, %d" + ", init, %d" + ", 1st_distance, %lu" + ", topk, %lu" + ", reset_hash, %lu" + ", pickup_parents, %lu" + ", restore_hash, %lu" + ", distance, %lu" + "\n", + query_id, + threadIdx.x, + clk_init, + clk_compute_1st_distance, + clk_topk, + clk_reset_hash, + clk_pickup_parents, + 
clk_restore_hash, + clk_compute_distance); + } +#endif +} + +template +struct search_kernel_config { + using kernel_t = decltype(&search_kernel); + + template + static auto choose_block_size(unsigned block_size) -> kernel_t + { + constexpr unsigned BS = USE_BITONIC_SORT; + if constexpr (BS) { + if (block_size == 64) { + return search_kernel; + } else if (block_size == 128) { + return search_kernel; + } else if (block_size == 256) { + return search_kernel; + } else if (block_size == 512) { + return search_kernel; + } else { + return search_kernel; + } + + } else { + if (block_size == 256) { + return search_kernel; + } else if (block_size == 512) { + return search_kernel; + } else { + return search_kernel; + } + } + } + + static auto choose_itopk_and_mx_candidates(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) -> kernel_t + { + if (num_itopk_candidates <= 64) { + // use bitonic sort based topk + constexpr unsigned max_candidates = 64; + if (itopk_size <= 64) { + return choose_block_size<64, max_candidates, 1>(block_size); + } else if (itopk_size <= 128) { + return choose_block_size<128, max_candidates, 1>(block_size); + } else if (itopk_size <= 256) { + return choose_block_size<256, max_candidates, 1>(block_size); + } else if (itopk_size <= 512) { + return choose_block_size<512, max_candidates, 1>(block_size); + } + } else if (num_itopk_candidates <= 128) { + constexpr unsigned max_candidates = 128; + if (itopk_size <= 64) { + return choose_block_size<64, max_candidates, 1>(block_size); + } else if (itopk_size <= 128) { + return choose_block_size<128, max_candidates, 1>(block_size); + } else if (itopk_size <= 256) { + return choose_block_size<256, max_candidates, 1>(block_size); + } else if (itopk_size <= 512) { + return choose_block_size<512, max_candidates, 1>(block_size); + } + } else if (num_itopk_candidates <= 256) { + constexpr unsigned max_candidates = 256; + if (itopk_size <= 64) { + return choose_block_size<64, max_candidates, 
1>(block_size); + } else if (itopk_size <= 128) { + return choose_block_size<128, max_candidates, 1>(block_size); + } else if (itopk_size <= 256) { + return choose_block_size<256, max_candidates, 1>(block_size); + } else if (itopk_size <= 512) { + return choose_block_size<512, max_candidates, 1>(block_size); + } + } else { + // Radix-based topk is used + constexpr unsigned max_candidates = 32; // to avoid build failure + if (itopk_size <= 256) { + return choose_block_size<256, max_candidates, 0>(block_size); + } else if (itopk_size <= 512) { + return choose_block_size<512, max_candidates, 0>(block_size); + } + } + THROW("No kernel for parametels itopk_size %u, num_itopk_candidates %u", + itopk_size, + num_itopk_candidates); + } +}; + +template +void select_and_run( // raft::resources const& res, + raft::device_matrix_view dataset, + raft::device_matrix_view graph, + INDEX_T* const topk_indices_ptr, // [num_queries, topk] + DISTANCE_T* const topk_distances_ptr, // [num_queries, topk] + const DATA_T* const queries_ptr, // [num_queries, dataset_dim] + const uint32_t num_queries, + const INDEX_T* dev_seed_ptr, // [num_queries, num_seeds] + uint32_t* const num_executed_iterations, // [num_queries,] + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, // + uint32_t smem_size, + int64_t hash_bitlen, + INDEX_T* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + uint32_t num_random_samplings, + uint64_t rand_xor_mask, + uint32_t num_seeds, + size_t itopk_size, + size_t search_width, + size_t min_iterations, + size_t max_iterations, + cudaStream_t stream) +{ + auto kernel = search_kernel_config:: + choose_itopk_and_mx_candidates(itopk_size, num_itopk_candidates, block_size); + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + dim3 thread_dims(block_size, 1, 1); + dim3 block_dims(1, num_queries, 1); + RAFT_LOG_DEBUG( + "Launching kernel with %u threads, %u block %lu smem", 
block_size, num_queries, smem_size); + kernel<<>>(topk_indices_ptr, + topk_distances_ptr, + topk, + dataset.data_handle(), + dataset.extent(1), + dataset.extent(0), + dataset.stride(0), + queries_ptr, + graph.data_handle(), + graph.extent(1), + num_random_samplings, + rand_xor_mask, + dev_seed_ptr, + num_seeds, + hashmap_ptr, + itopk_size, + search_width, + min_iterations, + max_iterations, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +} // namespace single_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel.cuh new file mode 100644 index 0000000000..1d8fd8e30a --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "search_single_cta_kernel-inl.cuh" +#endif + +#ifdef RAFT_COMPILED +#include "search_single_cta_kernel-ext.cuh" +#endif diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh new file mode 100644 index 0000000000..a1b7f930d3 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/cagra/topk_by_radix.cuh @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "topk_for_cagra/topk_core.cuh" + +namespace raft::neighbors::cagra::detail { +namespace single_cta_search { + +template +struct topk_by_radix_sort_base { + static constexpr std::uint32_t smem_size = MAX_INTERNAL_TOPK * 2 + 2048 + 8; + static constexpr std::uint32_t state_bit_lenght = 0; + static constexpr std::uint32_t vecLen = 2; // TODO +}; +template +struct topk_by_radix_sort : topk_by_radix_sort_base {}; + +template +struct topk_by_radix_sort> + : topk_by_radix_sort_base { + __device__ void operator()(uint32_t topk, + uint32_t batch_size, + uint32_t len_x, + const uint32_t* _x, + const IdxT* _in_vals, + uint32_t* _y, + IdxT* _out_vals, + uint32_t* work, + uint32_t* _hints, + bool sort, + uint32_t* _smem) + { + std::uint8_t* const state = reinterpret_cast(work); + topk_cta_11_core::state_bit_lenght, + topk_by_radix_sort_base::vecLen, + 64, + 32, + IdxT>(topk, len_x, _x, _in_vals, _y, _out_vals, state, _hints, sort, _smem); + } +}; + +#define TOP_FUNC_PARTIAL_SPECIALIZATION(V) \ + template \ + struct topk_by_radix_sort< \ + MAX_INTERNAL_TOPK, \ + BLOCK_SIZE, \ + IdxT, \ + std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>> \ + : topk_by_radix_sort_base { \ + __device__ void operator()(uint32_t topk, \ + uint32_t batch_size, \ + uint32_t len_x, \ + const uint32_t* _x, \ + const IdxT* _in_vals, \ + uint32_t* _y, \ + IdxT* _out_vals, \ + uint32_t* work, \ + uint32_t* _hints, \ + bool sort, \ + uint32_t* _smem) \ + { \ + assert(BLOCK_SIZE >= V / 4); \ + std::uint8_t* state = (std::uint8_t*)work; \ + topk_cta_11_core::state_bit_lenght, \ + topk_by_radix_sort_base::vecLen, \ + V, \ + V / 4, \ + IdxT>( \ + topk, len_x, _x, _in_vals, _y, _out_vals, state, _hints, sort, _smem); \ + } \ + }; +TOP_FUNC_PARTIAL_SPECIALIZATION(128); +TOP_FUNC_PARTIAL_SPECIALIZATION(256); +TOP_FUNC_PARTIAL_SPECIALIZATION(512); +TOP_FUNC_PARTIAL_SPECIALIZATION(1024); + +} // namespace single_cta_search +} // namespace 
raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h index 2896dba1f3..92b9474047 100644 --- a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h +++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h @@ -18,7 +18,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { // size_t _cuann_find_topk_bufferSize(uint32_t topK, @@ -55,4 +55,4 @@ CUDA_DEVICE_HOST_FUNC inline size_t _cuann_aligned(size_t size, size_t unit = 12 if (size % unit) { size += unit - (size % unit); } return size; } -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh index 5bc4b70791..dd73558f86 100644 --- a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh @@ -21,7 +21,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { using namespace cub; // @@ -871,38 +871,36 @@ inline void _cuann_find_topk(uint32_t topK, } while (0) // V: vecLen -#define SET_KERNEL_V(V, ValT) \ - do { \ - if (topK <= 32) { \ - SET_KERNEL_VKT(V, 32, 32, ValT); \ - } else if (topK <= 64) { \ - SET_KERNEL_VKT(V, 64, 32, ValT); \ - } else if (topK <= 96) { \ - SET_KERNEL_VKT(V, 96, 32, ValT); \ - } else if (topK <= 128) { \ - SET_KERNEL_VKT(V, 128, 32, ValT); \ - } else if (topK <= 192) { \ - SET_KERNEL_VKT(V, 192, 64, ValT); \ - } else if (topK <= 256) { \ - SET_KERNEL_VKT(V, 256, 64, ValT); \ - } else if (topK <= 384) { \ - SET_KERNEL_VKT(V, 384, 128, ValT); \ - } else if (topK <= 512) { \ - SET_KERNEL_VKT(V, 512, 128, ValT); \ - } else if (topK <= 768) 
{ \ - SET_KERNEL_VKT(V, 768, 256, ValT); \ - } else if (topK <= 1024) { \ - SET_KERNEL_VKT(V, 1024, 256, ValT); \ +#define SET_KERNEL_V(V, ValT) \ + do { \ + if (topK <= 32) { \ + SET_KERNEL_VKT(V, 32, 32, ValT); \ + } else if (topK <= 64) { \ + SET_KERNEL_VKT(V, 64, 32, ValT); \ + } else if (topK <= 96) { \ + SET_KERNEL_VKT(V, 96, 32, ValT); \ + } else if (topK <= 128) { \ + SET_KERNEL_VKT(V, 128, 32, ValT); \ + } else if (topK <= 192) { \ + SET_KERNEL_VKT(V, 192, 64, ValT); \ + } else if (topK <= 256) { \ + SET_KERNEL_VKT(V, 256, 64, ValT); \ + } else if (topK <= 384) { \ + SET_KERNEL_VKT(V, 384, 128, ValT); \ + } else if (topK <= 512) { \ + SET_KERNEL_VKT(V, 512, 128, ValT); \ + } else if (topK <= 768) { \ + SET_KERNEL_VKT(V, 768, 256, ValT); \ + } else if (topK <= 1024) { \ + SET_KERNEL_VKT(V, 1024, 256, ValT); \ } \ /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \ /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \ /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \ /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \ - else { \ - RAFT_LOG_DEBUG( \ - "[ERROR] (%s, %d) topk must be lower than or equla to 1024.\n", __func__, __LINE__); \ - exit(-1); \ - } \ + else { \ + RAFT_FAIL("topk must be lower than or equal to 1024"); \ + } \ } while (0) int _vecLen = _get_vecLen(ldIK, 2); @@ -929,4 +927,4 @@ inline void _cuann_find_topk(uint32_t topK, return; } -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 934e84d4d5..22c7a60647 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -22,7 +22,7 @@ #include #include -namespace raft::neighbors::experimental::cagra::detail { +namespace raft::neighbors::cagra::detail { namespace utils { template inline cudaDataType_t 
get_cuda_data_type(); @@ -150,4 +150,4 @@ struct gen_index_msb_1_mask { }; } // namespace utils -} // namespace raft::neighbors::experimental::cagra::detail +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 7c2fa05bfe..9cde1143e0 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -416,4 +417,77 @@ inline void fill_refinement_index(raft::resources const& handle, refinement_index->veclen()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + +template +__global__ void pack_interleaved_list_kernel( + const T* codes, + T* list_data, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t dst_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, dst_ix); } +} + +template +__global__ void unpack_interleaved_list_kernel( + const T* list_data, + T* codes, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? 
std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, src_ix); } +} + +template +void pack_list_data( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + std::variant offset_or_indices, + device_mdspan::list_extents, row_major> list_data) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + pack_interleaved_list_kernel<<>>( + codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unpack_list_data( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + std::variant offset_or_indices, + device_matrix_view codes) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + unpack_interleaved_list_kernel<<>>( + list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh index 46f72c4005..47f3e8888c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh @@ -16,24 +16,27 @@ #pragma once -#include // uintX_t -#include // 
raft::neighbors::ivf_flat::index -#include // RAFT_EXPLICIT -#include // rmm:cuda_stream_view +#include // uintX_t +#include // raft::neighbors::ivf_flat::index +#include // none_ivf_sample_filter +#include // RAFT_EXPLICIT +#include // rmm:cuda_stream_view #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY namespace raft::neighbors::ivf_flat::detail { -template +template void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index& index, const T* queries, const uint32_t* coarse_query_results, const uint32_t n_queries, + const uint32_t queries_offset, const raft::distance::DistanceType metric, const uint32_t n_probes, const uint32_t k, const bool select_min, + IvfSampleFilterT sample_filter, IdxT* neighbors, float* distances, uint32_t& grid_dim_x, @@ -43,23 +46,30 @@ void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index& i #endif // RAFT_EXPLICIT_INSTANTIATE_ONLY -#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT) \ - extern template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - const uint32_t* coarse_query_results, \ - const uint32_t n_queries, \ - const raft::distance::DistanceType metric, \ - const uint32_t n_probes, \ - const uint32_t k, \ - const bool select_min, \ - IdxT* neighbors, \ - float* distances, \ - uint32_t& grid_dim_x, \ +#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( \ + T, AccT, IdxT, IvfSampleFilterT) \ + extern template void \ + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const uint32_t queries_offset, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IvfSampleFilterT sample_filter, \ + IdxT* neighbors, \ + float* distances, \ + 
uint32_t& grid_dim_x, \ rmm::cuda_stream_view stream) -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(float, float, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(int8_t, int32_t, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(uint8_t, uint32_t, int64_t); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + float, float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + int8_t, int32_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + uint8_t, uint32_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); #undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 4eed2aa453..18f1906dc5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -646,6 +646,7 @@ struct loadAndComputeDist { * @param n_probes * @param k * @param dim + * @param sample_filter * @param[out] neighbors * @param[out] distances */ @@ -655,6 +656,7 @@ template __global__ void __launch_bounds__(kThreadsPerBlock) @@ -666,9 +668,11 @@ __global__ void __launch_bounds__(kThreadsPerBlock) const IdxT* const* list_indices_ptrs, const T* const* list_data_ptrs, const uint32_t* list_sizes, + const uint32_t queries_offset, const uint32_t n_probes, const uint32_t k, const uint32_t dim, + IvfSampleFilterT sample_filter, IdxT* neighbors, float* distances) { @@ -736,7 +740,7 @@ __global__ void __launch_bounds__(kThreadsPerBlock) const bool valid = vec_id < list_length; // Process first shm_assisted_dim dimensions (always using shared memory) - if (valid) { + if 
(valid && sample_filter(queries_offset + blockIdx.y, probe_id, vec_id)) { loadAndComputeDist lc(dist, compute_dist); for (int pos = 0; pos < shm_assisted_dim; @@ -803,6 +807,7 @@ template void launch_kernel(Lambda lambda, @@ -811,8 +816,10 @@ void launch_kernel(Lambda lambda, const T* queries, const uint32_t* coarse_index, const uint32_t num_queries, + const uint32_t queries_offset, const uint32_t n_probes, const uint32_t k, + IvfSampleFilterT sample_filter, IdxT* neighbors, float* distances, uint32_t& grid_dim_x, @@ -820,8 +827,15 @@ void launch_kernel(Lambda lambda, { RAFT_EXPECTS(Veclen == index.veclen(), "Configured Veclen does not match the index interleaving pattern."); - constexpr auto kKernel = - interleaved_scan_kernel; + constexpr auto kKernel = interleaved_scan_kernel; const int max_query_smem = 16384; int query_smem_elems = std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); @@ -860,9 +874,11 @@ void launch_kernel(Lambda lambda, index.inds_ptrs().data_handle(), index.data_ptrs().data_handle(), index.list_sizes().data_handle(), + queries_offset + query_offset, n_probes, k, index.dim(), + sample_filter, neighbors, distances); queries += grid_dim_y * index.dim(); @@ -931,6 +947,7 @@ template void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args) { @@ -943,6 +960,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg T, AccT, IdxT, + IvfSampleFilterT, euclidean_dist, raft::identity_op>({}, {}, std::forward(args)...); case raft::distance::DistanceType::L2SqrtExpanded: @@ -953,6 +971,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg T, AccT, IdxT, + IvfSampleFilterT, euclidean_dist, raft::sqrt_op>({}, {}, std::forward(args)...); case raft::distance::DistanceType::InnerProduct: @@ -962,6 +981,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... 
arg T, AccT, IdxT, + IvfSampleFilterT, inner_prod_dist, raft::identity_op>({}, {}, std::forward(args)...); // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. @@ -976,6 +996,7 @@ void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... arg template (1, 16 / sizeof(T))> struct select_interleaved_scan_kernel { @@ -990,13 +1011,20 @@ struct select_interleaved_scan_kernel { { if constexpr (Capacity > 1) { if (capacity * 2 <= Capacity) { - return select_interleaved_scan_kernel::run( - capacity, veclen, select_min, std::forward(args)...); + return select_interleaved_scan_kernel::run(capacity, + veclen, + select_min, + std::forward(args)...); } } if constexpr (Veclen > 1) { if (veclen % Veclen != 0) { - return select_interleaved_scan_kernel::run( + return select_interleaved_scan_kernel::run( capacity, 1, select_min, std::forward(args)...); } } @@ -1010,9 +1038,11 @@ struct select_interleaved_scan_kernel { veclen == Veclen, "Veclen must be power-of-two not bigger than the maximum allowed size for this data type."); if (select_min) { - launch_with_fixed_consts(std::forward(args)...); + launch_with_fixed_consts( + std::forward(args)...); } else { - launch_with_fixed_consts(std::forward(args)...); + launch_with_fixed_consts( + std::forward(args)...); } } }; @@ -1028,6 +1058,9 @@ struct select_interleaved_scan_kernel { * @param[in] queries device pointer to the query vectors [batch_size, dim] * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes] * @param n_queries batch size + * @param[in] queries_offset + * An offset of the current query batch. It is used for feeding sample_filter with the + * correct query index. * @param metric type of the measured distance * @param n_probes number of nearest clusters to query * @param k number of nearest neighbors. 
@@ -1041,36 +1074,43 @@ struct select_interleaved_scan_kernel { * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters; * (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes) * @param stream + * @param sample_filter + * A filter that selects samples for a given query. Use an instance of none_ivf_sample_filter to + * provide a green light for every sample. */ -template +template void ivfflat_interleaved_scan(const index& index, const T* queries, const uint32_t* coarse_query_results, const uint32_t n_queries, + const uint32_t queries_offset, const raft::distance::DistanceType metric, const uint32_t n_probes, const uint32_t k, const bool select_min, + IvfSampleFilterT sample_filter, IdxT* neighbors, float* distances, uint32_t& grid_dim_x, rmm::cuda_stream_view stream) { const int capacity = bound_by_power_of_two(k); - select_interleaved_scan_kernel::run(capacity, - index.veclen(), - select_min, - metric, - index, - queries, - coarse_query_results, - n_queries, - n_probes, - k, - neighbors, - distances, - grid_dim_x, - stream); + select_interleaved_scan_kernel::run(capacity, + index.veclen(), + select_min, + metric, + index, + queries, + coarse_query_results, + n_queries, + queries_offset, + n_probes, + k, + sample_filter, + neighbors, + distances, + grid_dim_x, + stream); } } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh index b97e64a259..976d15a61c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh @@ -16,15 +16,16 @@ #pragma once -#include // uintX_t -#include // raft::neighbors::ivf_flat::index -#include // RAFT_EXPLICIT +#include // uintX_t +#include // raft::neighbors::ivf_flat::index +#include // none_ivf_sample_filter +#include // RAFT_EXPLICIT #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY namespace 
raft::neighbors::ivf_flat::detail { -template +template void search(raft::resources const& handle, const search_params& params, const raft::neighbors::ivf_flat::index& index, @@ -33,26 +34,31 @@ void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + rmm::mr::device_memory_resource* mr = nullptr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_flat::detail #endif // RAFT_EXPLICIT_INSTANTIATE_ONLY -#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT) \ - extern template void raft::neighbors::ivf_flat::detail::search( \ - raft::resources const& handle, \ - const search_params& params, \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - uint32_t n_queries, \ - uint32_t k, \ - IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) - -instantiate_raft_neighbors_ivf_flat_detail_search(float, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_search(int8_t, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_search(uint8_t, int64_t); +#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT, IvfSampleFilterT) \ + extern template void raft::neighbors::ivf_flat::detail::search( \ + raft::resources const& handle, \ + const search_params& params, \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances, \ + rmm::mr::device_memory_resource* mr, \ + IvfSampleFilterT sample_filter) + +instantiate_raft_neighbors_ivf_flat_detail_search( + float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_search( + int8_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_search( + uint8_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); 
#undef instantiate_raft_neighbors_ivf_flat_detail_search diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh index 66ad9682d7..93eeb0dead 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh @@ -26,6 +26,7 @@ #include // matrix::detail::select_k #include // interleaved_scan #include // raft::neighbors::ivf_flat::index +#include // none_ivf_sample_filter #include // utils::mapping #include // rmm::device_memory_resource @@ -33,17 +34,19 @@ namespace raft::neighbors::ivf_flat::detail { using namespace raft::spatial::knn::detail; // NOLINT -template +template void search_impl(raft::resources const& handle, const raft::neighbors::ivf_flat::index& index, const T* queries, uint32_t n_queries, + uint32_t queries_offset, uint32_t k, uint32_t n_probes, bool select_min, IdxT* neighbors, AccT* distances, - rmm::mr::device_memory_resource* search_mr) + rmm::mr::device_memory_resource* search_mr, + IvfSampleFilterT sample_filter) { auto stream = resource::get_cuda_stream(handle); // The norm of query @@ -124,7 +127,8 @@ void search_impl(raft::resources const& handle, stream); RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); - matrix::detail::select_k(distance_buffer_dev.data(), + matrix::detail::select_k(handle, + distance_buffer_dev.data(), nullptr, n_queries, index.n_lists(), @@ -132,7 +136,6 @@ void search_impl(raft::resources const& handle, coarse_distances_dev.data(), coarse_indices_dev.data(), select_min, - stream, search_mr); RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes); RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes); @@ -143,18 +146,21 @@ void search_impl(raft::resources const& handle, uint32_t grid_dim_x = 0; if (n_probes > 1) { // query the gridDimX size to store probes topK output - ivfflat_interleaved_scan::value_t, IdxT>(index, - nullptr, - nullptr, - 
n_queries, - index.metric(), - n_probes, - k, - select_min, - nullptr, - nullptr, - grid_dim_x, - stream); + ivfflat_interleaved_scan::value_t, IdxT, IvfSampleFilterT>( + index, + nullptr, + nullptr, + n_queries, + queries_offset, + index.metric(), + n_probes, + k, + select_min, + sample_filter, + nullptr, + nullptr, + grid_dim_x, + stream); } else { grid_dim_x = 1; } @@ -164,25 +170,29 @@ void search_impl(raft::resources const& handle, indices_dev_ptr = neighbors; } - ivfflat_interleaved_scan::value_t, IdxT>(index, - queries, - coarse_indices_dev.data(), - n_queries, - index.metric(), - n_probes, - k, - select_min, - indices_dev_ptr, - distances_dev_ptr, - grid_dim_x, - stream); + ivfflat_interleaved_scan::value_t, IdxT, IvfSampleFilterT>( + index, + queries, + coarse_indices_dev.data(), + n_queries, + queries_offset, + index.metric(), + n_probes, + k, + select_min, + sample_filter, + indices_dev_ptr, + distances_dev_ptr, + grid_dim_x, + stream); RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k); RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k); // Merge topk values from different blocks if (grid_dim_x > 1) { - matrix::detail::select_k(refined_distances_dev.data(), + matrix::detail::select_k(handle, + refined_distances_dev.data(), refined_indices_dev.data(), n_queries, k * grid_dim_x, @@ -190,13 +200,14 @@ void search_impl(raft::resources const& handle, distances, neighbors, select_min, - stream, search_mr); } } /** See raft::neighbors::ivf_flat::search docs */ -template +template inline void search(raft::resources const& handle, const search_params& params, const index& index, @@ -205,7 +216,8 @@ inline void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::mr::device_memory_resource* mr = nullptr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) { common::nvtx::range fun_scope( "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); @@ -230,16 +242,18 
@@ inline void search(raft::resources const& handle, for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = min(max_queries, n_queries - offset_q); - search_impl(handle, - index, - queries + offset_q * index.dim(), - queries_batch, - k, - n_probes, - raft::distance::is_min_close(index.metric()), - neighbors + offset_q * k, - distances + offset_q * k, - mr); + search_impl(handle, + index, + queries + offset_q * index.dim(), + queries_batch, + offset_q, + k, + n_probes, + raft::distance::is_min_close(index.metric()), + neighbors + offset_q * k, + distances + offset_q * k, + mr, + sample_filter); } } diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh index b00d308827..61a6046273 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh @@ -45,7 +45,7 @@ struct check_index_layout { "paste in the new size and consider updating the serialization logic"); }; -template struct check_index_layout), 296>; +template struct check_index_layout), 328>; /** * Save the index to file. 
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 4a54d33a02..199cb74fbe 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -346,10 +347,10 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); @@ -392,10 +393,6 @@ void train_per_subset(raft::resources const& handle, index.pq_len(), stream); - // clone the handle and attached the device memory resource to it - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // train PQ codebook for this subspace auto sub_trainset_view = raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); @@ -409,7 +406,7 @@ void train_per_subset(raft::resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, sub_trainset_view, centers_tmp_view, @@ -427,10 +424,10 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // 
[n_rows] uint32_t kmeans_n_iters, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) + rmm::mr::device_memory_resource* managed_memory) { - auto stream = resource::get_cuda_stream(handle); + auto stream = resource::get_cuda_stream(handle); + auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); rmm::device_uvector cluster_sizes(index.n_lists(), stream, managed_memory); @@ -474,10 +471,6 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // clone the handle and attached the device memory resource to it - const resources new_handle(handle); - resource::set_workspace_resource(new_handle, device_memory); - // limit the cluster size to bound the training time. // [sic] we interpret the data as pq_len-dimensional size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); @@ -498,7 +491,7 @@ void train_per_cluster(raft::resources const& handle, raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = kmeans_n_iters; kmeans_params.metric = raft::distance::DistanceType::L2Expanded; - raft::cluster::kmeans_balanced::helpers::build_clusters(new_handle, + raft::cluster::kmeans_balanced::helpers::build_clusters(handle, kmeans_params, rot_vectors_view, centers_tmp_view, @@ -1325,6 +1318,8 @@ void extend(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim()); + + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend"); auto stream = resource::get_cuda_stream(handle); const auto n_clusters = index->n_lists(); @@ -1523,6 +1518,7 @@ auto build(raft::resources const& handle, { common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build"); static_assert(std::is_same_v || 
std::is_same_v || std::is_same_v, "Unsupported data type"); @@ -1543,24 +1539,18 @@ auto build(raft::resources const& handle, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - rmm::mr::device_memory_resource* device_memory = nullptr; - auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024); - if (pool_guard) { RAFT_LOG_DEBUG("ivf_pq::build: using pool memory resource"); } - + auto* device_memory = resource::get_workspace_resource(handle); rmm::mr::managed_memory_resource managed_memory_upstream; rmm::mr::pool_memory_resource managed_memory( &managed_memory_upstream, 1024 * 1024); // If the trainset is small enough to comfortably fit into device memory, put it there. // Otherwise, use the managed memory. + constexpr size_t kTolerableRatio = 4; rmm::mr::device_memory_resource* big_memory_resource = &managed_memory; - { - size_t free_mem, total_mem; - constexpr size_t kTolerableRatio = 4; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); - if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < free_mem) { - big_memory_resource = device_memory; - } + if (sizeof(float) * n_rows_train * index.dim() * kTolerableRatio < + resource::get_workspace_free_bytes(handle)) { + big_memory_resource = device_memory; } // Besides just sampling, we transform the input dataset into floats to make it easier @@ -1709,8 +1699,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, @@ -1719,8 +1708,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, - &managed_memory, - device_memory); + &managed_memory); break; default: RAFT_FAIL("Unreachable code"); } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh 
b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh index 62e46e3ae1..1a9788ce4c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh @@ -20,8 +20,8 @@ #include // RAFT_WEAK_FUNCTION #include // raft::distance::DistanceType #include // raft::neighbors::ivf_pq::detail::fp_8bit -#include // NoneSampleFilter #include // raft::neighbors::ivf_pq::codebook_gen +#include // none_ivf_sample_filter #include // RAFT_EXPLICIT #include // rmm::cuda_stream_view @@ -37,13 +37,12 @@ auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, ui template -__global__ void compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, +__global__ void compute_similarity_kernel(uint32_t dim, uint32_t n_probes, uint32_t pq_dim, uint32_t n_queries, @@ -60,29 +59,28 @@ __global__ void compute_similarity_kernel(uint32_t n_rows, const float* queries, const uint32_t* index_list, float* query_kths, - SampleFilterT sample_filter, + IvfSampleFilterT sample_filter, LutT* lut_scores, OutT* _out_scores, uint32_t* _out_indices) RAFT_EXPLICIT; // The signature of the kernel defined by a minimal set of template parameters -template +template using compute_similarity_kernel_t = - decltype(&compute_similarity_kernel); + decltype(&compute_similarity_kernel); -template +template struct selected { - compute_similarity_kernel_t kernel; + compute_similarity_kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; size_t device_lut_size; }; -template -void compute_similarity_run(selected s, +template +void compute_similarity_run(selected s, rmm::cuda_stream_view stream, - uint32_t n_rows, uint32_t dim, uint32_t n_probes, uint32_t pq_dim, @@ -100,7 +98,7 @@ void compute_similarity_run(selected s, const float* queries, const uint32_t* index_list, float* query_kths, - SampleFilterT sample_filter, + IvfSampleFilterT sample_filter, LutT* lut_scores, OutT* _out_scores, 
uint32_t* _out_indices) RAFT_EXPLICIT; @@ -119,7 +117,7 @@ void compute_similarity_run(selected s, * beyond this limit do not consider increasing the number of active blocks per SM * would improve locality anymore. */ -template +template auto compute_similarity_select(const cudaDeviceProp& dev_props, bool manage_local_topk, int locality_hint, @@ -129,78 +127,78 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t precomp_data_count, uint32_t n_queries, uint32_t n_probes, - uint32_t topk) -> selected RAFT_EXPLICIT; + uint32_t topk) + -> selected RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_pq::detail #endif // RAFT_EXPLICIT_INSTANTIATE_ONLY -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - extern template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - extern template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define 
instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + extern template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + extern template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - half, half, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + half, half, raft::neighbors::filtering::none_ivf_sample_filter); 
instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - float, half, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + float, half, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - float, float, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + float, float, raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh index 37174f54e1..90d993abd5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh @@ -19,8 +19,8 @@ #include // raft::distance::DistanceType #include // matrix::detail::select::warpsort::warp_sort_distributed #include // dummy_block_sort_t -#include // NoneSampleFilter #include // codebook_gen +#include // none_ivf_sample_filter #include // RAFT_CUDA_TRY #include // raft::atomicMin #include // raft::Pow2 @@ -195,7 +195,6 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, * Setting this to `false` allows to reduce the shared memory usage (and maximum data dim) * at the cost of reducing global memory reading throughput. * - * @param n_rows the number of records in the dataset * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`). 
* @param n_probes the number of clusters to search for each query * @param pq_dim @@ -229,7 +228,7 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, * query_kths keep the current state of the filtering - atomically updated distances to the * k-th closest neighbors for each query [n_queries]. * @param sample_filter - * A filter that selects samples for a given query. Use an instance of NoneSampleFilter to + * A filter that selects samples for a given query. Use an instance of none_ivf_sample_filter to * provide a green light for every sample. * @param lut_scores * The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits]. @@ -246,13 +245,12 @@ __device__ auto ivfpq_compute_score(uint32_t pq_dim, */ template -__global__ void compute_similarity_kernel(uint32_t n_rows, - uint32_t dim, +__global__ void compute_similarity_kernel(uint32_t dim, uint32_t n_probes, uint32_t pq_dim, uint32_t n_queries, @@ -269,7 +267,7 @@ __global__ void compute_similarity_kernel(uint32_t n_rows, const float* queries, const uint32_t* index_list, float* query_kths, - SampleFilterT sample_filter, + IvfSampleFilterT sample_filter, LutT* lut_scores, OutT* _out_scores, uint32_t* _out_indices) @@ -327,14 +325,15 @@ __global__ void compute_similarity_kernel(uint32_t n_rows, uint32_t* out_indices = nullptr; if constexpr (kManageLocalTopK) { // Store topk calculated distances to out_scores (and its indices to out_indices) - out_scores = _out_scores + topk * (probe_ix + (n_probes * query_ix)); - out_indices = _out_indices + topk * (probe_ix + (n_probes * query_ix)); + const uint64_t out_offset = probe_ix + n_probes * query_ix; + out_scores = _out_scores + out_offset * topk; + out_indices = _out_indices + out_offset * topk; } else { // Store all calculated distances to out_scores - out_scores = _out_scores + max_samples * query_ix; + out_scores = _out_scores + uint64_t(max_samples) * query_ix; } uint32_t label = cluster_labels[n_probes * query_ix + probe_ix]; - const 
float* cluster_center = cluster_centers + (dim * label); + const float* cluster_center = cluster_centers + dim * label; const float* pq_center; if (codebook_kind == codebook_gen::PER_SUBSPACE) { pq_center = pq_centers; @@ -493,27 +492,29 @@ __global__ void compute_similarity_kernel(uint32_t n_rows, } // The signature of the kernel defined by a minimal set of template parameters -template +template using compute_similarity_kernel_t = - decltype(&compute_similarity_kernel); + decltype(&compute_similarity_kernel); // The config struct lifts the runtime parameters to the template parameters template + typename IvfSampleFilterT = raft::neighbors::filtering::none_ivf_sample_filter> struct compute_similarity_kernel_config { public: static auto get(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { return kernel_choose_bits(pq_bits, k_max); } private: static auto kernel_choose_bits(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { switch (pq_bits) { case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max); @@ -527,7 +528,7 @@ struct compute_similarity_kernel_config { template static auto kernel_try_capacity(uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { if constexpr (Capacity > 0) { if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity(k_max); } @@ -537,7 +538,7 @@ struct compute_similarity_kernel_config { } return compute_similarity_kernel + typename IvfSampleFilterT = raft::neighbors::filtering::none_ivf_sample_filter> auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max) - -> compute_similarity_kernel_t + -> compute_similarity_kernel_t { return compute_similarity_kernel_config::get(pq_bits, k_max); + IvfSampleFilterT>::get(pq_bits, k_max); } /** Estimate the occupancy for the given kernel on the given device. 
*/ -template +template struct occupancy_t { using shmem_unit = Pow2<128>; @@ -575,7 +576,7 @@ struct occupancy_t { inline occupancy_t() = default; inline occupancy_t(size_t smem, uint32_t n_threads, - compute_similarity_kernel_t kernel, + compute_similarity_kernel_t kernel, const cudaDeviceProp& dev_props) { RAFT_CUDA_TRY( @@ -586,19 +587,20 @@ struct occupancy_t { } }; -template +template struct selected { - compute_similarity_kernel_t kernel; + compute_similarity_kernel_t kernel; dim3 grid_dim; dim3 block_dim; size_t smem_size; size_t device_lut_size; }; -template -void compute_similarity_run(selected s, +template +void compute_similarity_run(selected s, rmm::cuda_stream_view stream, - uint32_t n_rows, uint32_t dim, uint32_t n_probes, uint32_t pq_dim, @@ -616,13 +618,12 @@ void compute_similarity_run(selected s, const float* queries, const uint32_t* index_list, float* query_kths, - SampleFilterT sample_filter, + IvfSampleFilterT sample_filter, LutT* lut_scores, OutT* _out_scores, uint32_t* _out_indices) { - s.kernel<<>>(n_rows, - dim, + s.kernel<<>>(dim, n_probes, pq_dim, n_queries, @@ -660,7 +661,9 @@ void compute_similarity_run(selected s, * beyond this limit do not consider increasing the number of active blocks per SM * would improve locality anymore. */ -template +template auto compute_similarity_select(const cudaDeviceProp& dev_props, bool manage_local_topk, int locality_hint, @@ -670,7 +673,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, uint32_t precomp_data_count, uint32_t n_queries, uint32_t n_probes, - uint32_t topk) -> selected + uint32_t topk) -> selected { // Shared memory for storing the lookup table size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits); @@ -742,9 +745,9 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further optimize occupancy and data locality for the L1 cache. 
*/ - auto conf_fast = get_compute_similarity_kernel; - auto conf_no_basediff = get_compute_similarity_kernel; - auto conf_no_smem_lut = get_compute_similarity_kernel; + auto conf_fast = get_compute_similarity_kernel; + auto conf_no_basediff = get_compute_similarity_kernel; + auto conf_no_smem_lut = get_compute_similarity_kernel; auto topk_or_zero = manage_local_topk ? topk : 0u; std::array candidates{std::make_tuple(conf_fast(pq_bits, topk_or_zero), lut_mem + bdf_mem, true), std::make_tuple(conf_no_basediff(pq_bits, topk_or_zero), lut_mem, true), @@ -753,8 +756,8 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // we may allow slightly lower than 100% occupancy; constexpr double kTargetOccupancy = 0.75; // This struct is used to select the better candidate - occupancy_t selected_perf{}; - selected selected_config; + occupancy_t selected_perf{}; + selected selected_config; for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) { if (smem_size_const > dev_props.sharedMemPerBlockOptin) { // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate. @@ -790,7 +793,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, continue; } - occupancy_t cur(smem_size, n_threads, kernel, dev_props); + occupancy_t cur(smem_size, n_threads, kernel, dev_props); if (cur.blocks_per_sm <= 0) { // For some reason, we still cannot make this kernel run. Skip the candidate. 
continue; @@ -805,7 +808,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, if (n_threads_tmp < n_threads) { while (n_threads_tmp >= n_threads_min) { auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp)); - occupancy_t tmp( + occupancy_t tmp( smem_size_tmp, n_threads_tmp, kernel, dev_props); bool select_it = false; if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) { diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh index 8a4d3277da..68c8a513f6 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh @@ -71,7 +71,7 @@ struct fp_8bit { return *this; } HDI explicit operator float() const { return fp_8bit2float(*this); } - HDI explicit operator half() const { return half(fp_8bit2float(*this)); } + HDI explicit operator half() const { return fp_8bit2half(*this); } private: static constexpr float kMin = 1.0f / float(1u << ExpMask); @@ -101,8 +101,23 @@ struct fp_8bit { u &= ~1; // zero the sign bit } float r; - *reinterpret_cast(&r) = - ((u << (15u + ExpBits)) + (0x3f800000u | (0x00400000u >> ValBits)) - (ExpMask << 23)); + constexpr uint32_t kBase32 = (0x3f800000u | (0x00400000u >> ValBits)) - (ExpMask << 23); + *reinterpret_cast(&r) = kBase32 + (u << (15u + ExpBits)); + if constexpr (Signed) { // recover the sign bit + if (v.bitstring & 1) { r = -r; } + } + return r; + } + + static HDI auto fp_8bit2half(const fp_8bit& v) -> half + { + uint16_t u = v.bitstring; + if constexpr (Signed) { + u &= ~1; // zero the sign bit + } + half r; + constexpr uint16_t kBase16 = (0x3c00u | (0x0200u >> ValBits)) - (ExpMask << 10); + *reinterpret_cast(&r) = kBase16 + (u << (2u + ExpBits)); if constexpr (Signed) { // recover the sign bit if (v.bitstring & 1) { r = -r; } } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 
d402a2436b..b9e911ffe2 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -23,14 +23,16 @@ #include #include #include -#include #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -151,7 +153,8 @@ void select_clusters(raft::resources const& handle, // Select neighbor clusters for each query. rmm::device_uvector cluster_dists(n_queries * n_probes, stream, mr); - matrix::detail::select_k(qc_distances.data(), + matrix::detail::select_k(handle, + qc_distances.data(), nullptr, n_queries, n_lists, @@ -159,7 +162,6 @@ void select_clusters(raft::resources const& handle, cluster_dists.data(), clusters_to_probe, true, - stream, mr); } @@ -415,7 +417,7 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters, * 3. split the query batch into smaller chunks, so that the device workspace * is guaranteed to fit into GPU memory. */ -template +template void ivfpq_search_worker(raft::resources const& handle, const index& index, uint32_t max_samples, @@ -429,13 +431,15 @@ void ivfpq_search_worker(raft::resources const& handle, float* distances, // [n_queries, topK] float scaling_factor, double preferred_shmem_carveout, - SampleFilterT sample_filter, - rmm::mr::device_memory_resource* mr) + IvfSampleFilterT sample_filter) { auto stream = resource::get_cuda_stream(handle); + auto mr = resource::get_workspace_resource(handle); - bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); - auto topk_len = manage_local_topk ? n_probes * topK : max_samples; + bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); + auto topk_len = manage_local_topk ? 
n_probes * topK : max_samples; + std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes); + std::size_t n_queries_topk_len = std::size_t(n_queries) * std::size_t(topk_len); if (manage_local_topk) { RAFT_LOG_DEBUG("Fused version of the search kernel is selected (manage_local_topk == true)"); } else { @@ -446,13 +450,13 @@ void ivfpq_search_worker(raft::resources const& handle, rmm::device_uvector index_list_sorted_buf(0, stream, mr); uint32_t* index_list_sorted = nullptr; rmm::device_uvector num_samples(n_queries, stream, mr); - rmm::device_uvector chunk_index(n_queries * n_probes, stream, mr); + rmm::device_uvector chunk_index(n_queries_probes, stream, mr); // [maxBatchSize, max_samples] or [maxBatchSize, n_probes, topk] - rmm::device_uvector distances_buf(n_queries * topk_len, stream, mr); + rmm::device_uvector distances_buf(n_queries_topk_len, stream, mr); rmm::device_uvector neighbors_buf(0, stream, mr); uint32_t* neighbors_ptr = nullptr; if (manage_local_topk) { - neighbors_buf.resize(n_queries * topk_len, stream); + neighbors_buf.resize(n_queries_topk_len, stream); neighbors_ptr = neighbors_buf.data(); } rmm::device_uvector neighbors_uint32_buf(0, stream, mr); @@ -477,10 +481,10 @@ void ivfpq_search_worker(raft::resources const& handle, // The goal is to incrase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as // possible. 
- index_list_sorted_buf.resize(n_queries * n_probes, stream); + index_list_sorted_buf.resize(n_queries_probes, stream); auto index_list_buf = - make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); - rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); + make_device_mdarray(handle, mr, make_extents(n_queries_probes)); + rmm::device_uvector cluster_labels_out(n_queries_probes, stream, mr); auto index_list = index_list_buf.data_handle(); index_list_sorted = index_list_sorted_buf.data(); @@ -495,7 +499,7 @@ void ivfpq_search_worker(raft::resources const& handle, cluster_labels_out.data(), index_list, index_list_sorted, - n_queries * n_probes, + n_queries_probes, begin_bit, end_bit, stream); @@ -506,7 +510,7 @@ void ivfpq_search_worker(raft::resources const& handle, cluster_labels_out.data(), index_list, index_list_sorted, - n_queries * n_probes, + n_queries_probes, begin_bit, end_bit, stream); @@ -531,17 +535,17 @@ void ivfpq_search_worker(raft::resources const& handle, } break; } - auto search_instance = - compute_similarity_select(resource::get_device_properties(handle), - manage_local_topk, - coresidency, - preferred_shmem_carveout, - index.pq_bits(), - index.pq_dim(), - precomp_data_count, - n_queries, - n_probes, - topK); + auto search_instance = compute_similarity_select( + resource::get_device_properties(handle), + manage_local_topk, + coresidency, + preferred_shmem_carveout, + index.pq_bits(), + index.pq_dim(), + precomp_data_count, + n_queries, + n_probes, + topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); std::optional> query_kths_buf{std::nullopt}; @@ -556,7 +560,6 @@ void ivfpq_search_worker(raft::resources const& handle, } compute_similarity_run(search_instance, stream, - index.size(), index.rot_dim(), n_probes, index.pq_dim(), @@ -581,7 +584,8 @@ void ivfpq_search_worker(raft::resources const& handle, // Select topk vectors for each query rmm::device_uvector topk_dists(n_queries * 
topK, stream, mr); - matrix::detail::select_k(distances_buf.data(), + matrix::detail::select_k(handle, + distances_buf.data(), neighbors_ptr, n_queries, topk_len, @@ -589,7 +593,6 @@ void ivfpq_search_worker(raft::resources const& handle, topk_dists.data(), neighbors_uint32, true, - stream, mr); // Postprocessing @@ -610,10 +613,10 @@ void ivfpq_search_worker(raft::resources const& handle, * This structure helps selecting a proper instance of the worker search function, * which contains a few template parameters. */ -template +template struct ivfpq_search { public: - using fun_t = decltype(&ivfpq_search_worker); + using fun_t = decltype(&ivfpq_search_worker); /** * Select an instance of the ivf-pq search function based on search tuning parameters, @@ -629,7 +632,7 @@ struct ivfpq_search { static auto filter_reasonable_instances(const search_params& params) -> fun_t { if constexpr (sizeof(ScoreT) >= sizeof(LutT)) { - return ivfpq_search_worker; + return ivfpq_search_worker; } else { RAFT_FAIL( "Unexpected lut_dtype / internal_distance_dtype combination (%d, %d). " @@ -677,6 +680,7 @@ struct ivfpq_search { * A heuristic for bounding the number of queries per batch, to improve GPU utilization. * (based on the number of SMs and the work size). * + * @param res is used to query the workspace size * @param k top-k * @param n_probes number of selected clusters per query * @param n_queries number of queries hoped to be processed at once. @@ -685,7 +689,8 @@ struct ivfpq_search { * * @return maximum recommended batch size. */ -inline auto get_max_batch_size(uint32_t k, +inline auto get_max_batch_size(raft::resources const& res, + uint32_t k, uint32_t n_probes, uint32_t n_queries, uint32_t max_samples) -> uint32_t @@ -702,13 +707,17 @@ inline auto get_max_batch_size(uint32_t k, } // Check in the tmp distance buffer is not too big auto ws_size = [k, n_probes, max_samples](uint32_t bs) -> uint64_t { - return uint64_t(is_local_topk_feasible(k, n_probes, bs) ? 
k * n_probes : max_samples) * bs; + const uint64_t buffers_fused = 12ull * k * n_probes; + const uint64_t buffers_non_fused = 4ull * max_samples; + const uint64_t other = 32ull * n_probes; + return static_cast(bs) * + (other + (is_local_topk_feasible(k, n_probes, bs) ? buffers_fused : buffers_non_fused)); }; - constexpr uint64_t kMaxWsSize = 1024 * 1024 * 1024; - if (ws_size(max_batch_size) > kMaxWsSize) { + auto max_ws_size = resource::get_workspace_free_bytes(res); + if (ws_size(max_batch_size) > max_ws_size) { uint32_t smaller_batch_size = bound_by_power_of_two(max_batch_size); // gradually reduce the batch size until we fit into the max size limit. - while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > kMaxWsSize) { + while (smaller_batch_size > 1 && ws_size(smaller_batch_size) > max_ws_size) { smaller_batch_size >>= 1; } return smaller_batch_size; @@ -717,7 +726,9 @@ inline auto get_max_batch_size(uint32_t k, } /** See raft::spatial::knn::ivf_pq::search docs */ -template +template inline void search(raft::resources const& handle, const search_params& params, const index& index, @@ -726,8 +737,7 @@ inline void search(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - SampleFilterT sample_filter = SampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT()) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); @@ -737,6 +747,7 @@ inline void search(raft::resources const& handle, params.n_probes, k, index.dim()); + resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::search"); RAFT_EXPECTS( params.internal_distance_dtype == CUDA_R_16F || params.internal_distance_dtype == CUDA_R_32F, @@ -773,21 +784,17 @@ inline void search(raft::resources const& handle, max_samples = ms; } - auto pool_guard = raft::get_pool_memory_resource(mr, n_queries * n_probes * k * 16); - if (pool_guard) { - RAFT_LOG_DEBUG("ivf_pq::search: 
using pool memory resource with initial size %zu bytes", - n_queries * n_probes * k * 16ull); - } + auto mr = resource::get_workspace_resource(handle); // Maximum number of query vectors to search at the same time. const auto max_queries = std::min(std::max(n_queries, 1), 4096); - auto max_batch_size = get_max_batch_size(k, n_probes, max_queries, max_samples); + auto max_batch_size = get_max_batch_size(handle, k, n_probes, max_queries, max_samples); rmm::device_uvector float_queries(max_queries * dim_ext, stream, mr); rmm::device_uvector rot_queries(max_queries * index.rot_dim(), stream, mr); rmm::device_uvector clusters_to_probe(max_queries * n_probes, stream, mr); - auto search_instance = ivfpq_search::fun(params, index.metric()); + auto search_instance = ivfpq_search::fun(params, index.metric()); for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = min(max_queries, n_queries - offset_q); @@ -843,8 +850,7 @@ inline void search(raft::resources const& handle, distances + uint64_t(k) * (offset_q + offset_b), utils::config::kDivisor / utils::config::kDivisor, params.preferred_shmem_carveout, - sample_filter, - mr); + sample_filter); } } } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh index ff5bd8ef89..f01035cad3 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_serialize.cuh @@ -48,7 +48,7 @@ struct check_index_layout { }; // TODO: Recompute this and come back to it. 
-template struct check_index_layout), 448>; +template struct check_index_layout), 480>; /** * Write the index to an output stream diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh index 5cb9f6d0ab..123a902ef9 100644 --- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh +++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh @@ -238,7 +238,8 @@ void tiled_brute_force_knn(const raft::resources& handle, distances + i * k, current_query_size, current_k), raft::make_device_matrix_view( indices + i * k, current_query_size, current_k), - select_min); + select_min, + true); // if we're tiling over columns, we need to do a couple things to fix up // the output of select_k @@ -280,7 +281,8 @@ void tiled_brute_force_knn(const raft::resources& handle, distances + i * k, current_query_size, k), raft::make_device_matrix_view( indices + i * k, current_query_size, k), - select_min); + select_min, + true); } } } diff --git a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh index e2b5c41fb0..0a33832b79 100644 --- a/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh +++ b/cpp/include/raft/neighbors/detail/knn_merge_parts.cuh @@ -30,8 +30,8 @@ template -__global__ void knn_merge_parts_kernel(value_t* inK, - value_idx* inV, +__global__ void knn_merge_parts_kernel(const value_t* inK, + const value_idx* inV, value_t* outK, value_idx* outV, size_t n_samples, @@ -65,8 +65,8 @@ __global__ void knn_merge_parts_kernel(value_t* inK, int col = i % k; - value_t* inKStart = inK + (row_idx + col); - value_idx* inVStart = inV + (row_idx + col); + const value_t* inKStart = inK + (row_idx + col); + const value_idx* inVStart = inV + (row_idx + col); int limit = Pow2::roundDown(total_k); value_idx translation = 0; @@ -99,8 +99,8 @@ __global__ void knn_merge_parts_kernel(value_t* inK, } template -inline void knn_merge_parts_impl(value_t* inK, - value_idx* 
inV, +inline void knn_merge_parts_impl(const value_t* inK, + const value_idx* inV, value_t* outK, value_idx* outV, size_t n_samples, @@ -137,8 +137,8 @@ inline void knn_merge_parts_impl(value_t* inK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t* inK, - value_idx* inV, +inline void knn_merge_parts(const value_t* inK, + const value_idx* inV, value_t* outK, value_idx* outV, size_t n_samples, diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh index 64f9511ff9..170f973984 100644 --- a/cpp/include/raft/neighbors/detail/refine.cuh +++ b/cpp/include/raft/neighbors/detail/refine.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,228 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace raft::neighbors::detail { - -/** Checks whether the input data extents are compatible. 
*/ -template -void check_input(extents_t dataset, - extents_t queries, - extents_t candidates, - extents_t indices, - extents_t distances, - distance::DistanceType metric) -{ - auto n_queries = queries.extent(0); - auto k = distances.extent(1); - - RAFT_EXPECTS(k <= raft::matrix::detail::select::warpsort::kMaxCapacity, - "k must be lest than topk::kMaxCapacity (%d).", - raft::matrix::detail::select::warpsort::kMaxCapacity); - - RAFT_EXPECTS(indices.extent(0) == n_queries && distances.extent(0) == n_queries && - candidates.extent(0) == n_queries, - "Number of rows in output indices, distances and candidates matrices must be equal" - " with the number of rows in search matrix. Expected %d, got %d, %d, and %d", - static_cast(n_queries), - static_cast(indices.extent(0)), - static_cast(distances.extent(0)), - static_cast(candidates.extent(0))); - - RAFT_EXPECTS(indices.extent(1) == k, - "Number of columns in output indices and distances matrices must be equal to k"); - - RAFT_EXPECTS(queries.extent(1) == dataset.extent(1), - "Number of columns must be equal for dataset and queries"); - - RAFT_EXPECTS(candidates.extent(1) >= k, - "Number of neighbor candidates must not be smaller than k (%d vs %d)", - static_cast(candidates.extent(1)), - static_cast(k)); -} - -/** - * See raft::neighbors::refine for docs. 
- */ -template -void refine_device(raft::resources const& handle, - raft::device_matrix_view dataset, - raft::device_matrix_view queries, - raft::device_matrix_view neighbor_candidates, - raft::device_matrix_view indices, - raft::device_matrix_view distances, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded) -{ - matrix_idx n_candidates = neighbor_candidates.extent(1); - matrix_idx n_queries = queries.extent(0); - matrix_idx dim = dataset.extent(1); - uint32_t k = static_cast(indices.extent(1)); - - common::nvtx::range fun_scope( - "neighbors::refine(%zu, %u)", size_t(n_queries), uint32_t(n_candidates)); - - check_input(dataset.extents(), - queries.extents(), - neighbor_candidates.extents(), - indices.extents(), - distances.extents(), - metric); - - // The refinement search can be mapped to an IVF flat search: - // - We consider that the candidate vectors form a cluster, separately for each query. - // - In other words, the n_queries * n_candidates vectors form n_queries clusters, each with - // n_candidates elements. - // - We consider that the coarse level search is already performed and assigned a single cluster - // to search for each query (the cluster formed from the corresponding candidates). - // - We run IVF flat search with n_probes=1 to select the best k elements of the candidates. 
- rmm::device_uvector fake_coarse_idx(n_queries, resource::get_cuda_stream(handle)); - - thrust::sequence(resource::get_thrust_policy(handle), - fake_coarse_idx.data(), - fake_coarse_idx.data() + n_queries); - - raft::neighbors::ivf_flat::index refinement_index( - handle, metric, n_queries, false, true, dim); - - raft::neighbors::ivf_flat::detail::fill_refinement_index(handle, - &refinement_index, - dataset.data_handle(), - neighbor_candidates.data_handle(), - n_queries, - n_candidates); - uint32_t grid_dim_x = 1; - raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< - data_t, - typename raft::spatial::knn::detail::utils::config::value_t, - idx_t>(refinement_index, - queries.data_handle(), - fake_coarse_idx.data(), - static_cast(n_queries), - refinement_index.metric(), - 1, - k, - raft::distance::is_min_close(metric), - indices.data_handle(), - distances.data_handle(), - grid_dim_x, - resource::get_cuda_stream(handle)); -} - -/** Helper structure for naive CPU implementation of refine. */ -typedef struct { - uint64_t id; - float distance; -} struct_for_refinement; - -inline int _postprocessing_qsort_compare(const void* v1, const void* v2) -{ - // sort in ascending order - if (((struct_for_refinement*)v1)->distance > ((struct_for_refinement*)v2)->distance) { - return 1; - } else if (((struct_for_refinement*)v1)->distance < ((struct_for_refinement*)v2)->distance) { - return -1; - } else { - return 0; - } -} - -/** - * Naive CPU implementation of refine operation - * - * All pointers are expected to be accessible on the host. 
- */ -template -void refine_host(raft::host_matrix_view dataset, - raft::host_matrix_view queries, - raft::host_matrix_view neighbor_candidates, - raft::host_matrix_view indices, - raft::host_matrix_view distances, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded) -{ - check_input(dataset.extents(), - queries.extents(), - neighbor_candidates.extents(), - indices.extents(), - distances.extents(), - metric); - - switch (metric) { - case raft::distance::DistanceType::L2Expanded: break; - case raft::distance::DistanceType::InnerProduct: break; - default: throw raft::logic_error("Unsopported metric"); - } - - size_t numDataset = dataset.extent(0); - size_t numQueries = queries.extent(0); - size_t dimDataset = dataset.extent(1); - const data_t* dataset_ptr = dataset.data_handle(); - const data_t* queries_ptr = queries.data_handle(); - const idx_t* neighbors = neighbor_candidates.data_handle(); - idx_t topK = neighbor_candidates.extent(1); - idx_t refinedTopK = indices.extent(1); - idx_t* refinedNeighbors = indices.data_handle(); - distance_t* refinedDistances = distances.data_handle(); - - common::nvtx::range fun_scope( - "neighbors::refine_host(%zu, %u)", size_t(numQueries), uint32_t(topK)); - -#pragma omp parallel - { - struct_for_refinement* sfr = - (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK); - for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) { - // compute distance with original dataset vectors - const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i); - for (size_t j = 0; j < (size_t)topK; j++) { - idx_t id = neighbors[j + (topK * i)]; - const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id); - float distance = 0.0; - for (size_t k = 0; k < (size_t)dimDataset; k++) { - float val_q = (float)(cur_query[k]); - float val_d = (float)(cur_dataset[k]); - if (metric == raft::distance::DistanceType::InnerProduct) { - distance += -val_q * val_d; // Negate because we sort 
in ascending order. - } else { - distance += (val_q - val_d) * (val_q - val_d); - } - } - sfr[j].id = id; - sfr[j].distance = distance; - } - - qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare); - - for (size_t j = 0; j < (size_t)refinedTopK; j++) { - refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id; - if (refinedDistances == NULL) continue; - if (metric == raft::distance::DistanceType::InnerProduct) { - refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance; - } else { - refinedDistances[j + (refinedTopK * i)] = sfr[j].distance; - } - } - } - free(sfr); - } -} - -} // namespace raft::neighbors::detail +#include "refine_device.cuh" +#include "refine_host.hpp" diff --git a/cpp/include/raft/neighbors/detail/refine_common.hpp b/cpp/include/raft/neighbors/detail/refine_common.hpp new file mode 100644 index 0000000000..bfd3341ee9 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/refine_common.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft::neighbors::detail { + +/** Checks whether the input data extents are compatible. 
*/ +template +void refine_check_input(ExtentsT dataset, + ExtentsT queries, + ExtentsT candidates, + ExtentsT indices, + ExtentsT distances, + distance::DistanceType metric) +{ + auto n_queries = queries.extent(0); + auto k = distances.extent(1); + + RAFT_EXPECTS(indices.extent(0) == n_queries && distances.extent(0) == n_queries && + candidates.extent(0) == n_queries, + "Number of rows in output indices, distances and candidates matrices must be equal" + " with the number of rows in search matrix. Expected %d, got %d, %d, and %d", + static_cast(n_queries), + static_cast(indices.extent(0)), + static_cast(distances.extent(0)), + static_cast(candidates.extent(0))); + + RAFT_EXPECTS(indices.extent(1) == k, + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(queries.extent(1) == dataset.extent(1), + "Number of columns must be equal for dataset and queries"); + + RAFT_EXPECTS(candidates.extent(1) >= k, + "Number of neighbor candidates must not be smaller than k (%d vs %d)", + static_cast(candidates.extent(1)), + static_cast(k)); +} + +} // namespace raft::neighbors::detail diff --git a/cpp/include/raft/neighbors/detail/refine_device.cuh b/cpp/include/raft/neighbors/detail/refine_device.cuh new file mode 100644 index 0000000000..6ee96957fa --- /dev/null +++ b/cpp/include/raft/neighbors/detail/refine_device.cuh @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::neighbors::detail { + +/** + * See raft::neighbors::refine for docs. + */ +template +void refine_device(raft::resources const& handle, + raft::device_matrix_view dataset, + raft::device_matrix_view queries, + raft::device_matrix_view neighbor_candidates, + raft::device_matrix_view indices, + raft::device_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + matrix_idx n_candidates = neighbor_candidates.extent(1); + matrix_idx n_queries = queries.extent(0); + matrix_idx dim = dataset.extent(1); + uint32_t k = static_cast(indices.extent(1)); + + RAFT_EXPECTS(k <= raft::matrix::detail::select::warpsort::kMaxCapacity, + "k must be lest than topk::kMaxCapacity (%d).", + raft::matrix::detail::select::warpsort::kMaxCapacity); + + common::nvtx::range fun_scope( + "neighbors::refine(%zu, %u)", size_t(n_queries), uint32_t(n_candidates)); + + refine_check_input(dataset.extents(), + queries.extents(), + neighbor_candidates.extents(), + indices.extents(), + distances.extents(), + metric); + + // The refinement search can be mapped to an IVF flat search: + // - We consider that the candidate vectors form a cluster, separately for each query. + // - In other words, the n_queries * n_candidates vectors form n_queries clusters, each with + // n_candidates elements. + // - We consider that the coarse level search is already performed and assigned a single cluster + // to search for each query (the cluster formed from the corresponding candidates). + // - We run IVF flat search with n_probes=1 to select the best k elements of the candidates. 
+ rmm::device_uvector fake_coarse_idx(n_queries, resource::get_cuda_stream(handle)); + + thrust::sequence(resource::get_thrust_policy(handle), + fake_coarse_idx.data(), + fake_coarse_idx.data() + n_queries); + + raft::neighbors::ivf_flat::index refinement_index( + handle, metric, n_queries, false, true, dim); + + raft::neighbors::ivf_flat::detail::fill_refinement_index(handle, + &refinement_index, + dataset.data_handle(), + neighbor_candidates.data_handle(), + n_queries, + n_candidates); + uint32_t grid_dim_x = 1; + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< + data_t, + typename raft::spatial::knn::detail::utils::config::value_t, + idx_t>(refinement_index, + queries.data_handle(), + fake_coarse_idx.data(), + static_cast(n_queries), + 0, + refinement_index.metric(), + 1, + k, + raft::distance::is_min_close(metric), + raft::neighbors::filtering::none_ivf_sample_filter(), + indices.data_handle(), + distances.data_handle(), + grid_dim_x, + resource::get_cuda_stream(handle)); +} + +} // namespace raft::neighbors::detail diff --git a/cpp/include/raft/neighbors/detail/refine_host-ext.hpp b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp new file mode 100644 index 0000000000..3ce2dc3eb5 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/refine_host-ext.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include // int64_t + +#include // raft::host_matrix_view +#include // raft::distance::DistanceType +#include // RAFT_EXPLICIT + +#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY + +namespace raft::neighbors::detail { + +template +[[gnu::optimize(3), gnu::optimize("tree-vectorize")]] void refine_host( + raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT; + +} + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_raft_neighbors_refine(IdxT, DataT, DistanceT, ExtentsT) \ + extern template void raft::neighbors::detail::refine_host( \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + distance::DistanceType metric); + +instantiate_raft_neighbors_refine(int64_t, float, float, int64_t); +instantiate_raft_neighbors_refine(int64_t, int8_t, float, int64_t); +instantiate_raft_neighbors_refine(int64_t, uint8_t, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/include/raft/neighbors/detail/refine_host-inl.hpp b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp new file mode 100644 index 0000000000..cfedaa38d3 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/refine_host-inl.hpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::neighbors::detail { + +template +[[gnu::optimize(3), gnu::optimize("tree-vectorize")]] void refine_host_impl( + raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances) +{ + size_t n_queries = queries.extent(0); + size_t dim = dataset.extent(1); + size_t orig_k = neighbor_candidates.extent(1); + size_t refined_k = indices.extent(1); + + common::nvtx::range fun_scope( + "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k); + + auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads())); + if (size_t(suggested_n_threads) > n_queries) { suggested_n_threads = n_queries; } + +#pragma omp parallel num_threads(suggested_n_threads) + { + std::vector> refined_pairs(orig_k); + for (size_t i = omp_get_thread_num(); i < n_queries; i += omp_get_num_threads()) { + // Compute the refined distance using original dataset vectors + const DataT* query = queries.data_handle() + dim * i; + for (size_t j = 0; j < orig_k; j++) { + IdxT id = neighbor_candidates(i, j); + const DataT* row = dataset.data_handle() + dim * id; + DistanceT distance = 0.0; + for (size_t k = 0; k < dim; k++) { + distance += DC::template eval(query[k], row[k]); + } + refined_pairs[j] = std::make_tuple(distance, id); + } + // Sort the query neighbors by their refined distances + std::sort(refined_pairs.begin(), 
refined_pairs.end()); + // Store first refined_k neighbors + for (size_t j = 0; j < refined_k; j++) { + indices(i, j) = std::get<1>(refined_pairs[j]); + if (distances.data_handle() != nullptr) { + distances(i, j) = DC::template postprocess(std::get<0>(refined_pairs[j])); + } + } + } + } +} + +struct distance_comp_l2 { + template + static inline auto eval(const DistanceT& a, const DistanceT& b) -> DistanceT + { + auto d = a - b; + return d * d; + } + template + static inline auto postprocess(const DistanceT& a) -> DistanceT + { + return a; + } +}; + +struct distance_comp_inner { + template + static inline auto eval(const DistanceT& a, const DistanceT& b) -> DistanceT + { + return -a * b; + } + template + static inline auto postprocess(const DistanceT& a) -> DistanceT + { + return -a; + } +}; + +/** + * Naive CPU implementation of refine operation + * + * All pointers are expected to be accessible on the host. + */ +template +[[gnu::optimize(3), gnu::optimize("tree-vectorize")]] void refine_host( + raft::host_matrix_view dataset, + raft::host_matrix_view queries, + raft::host_matrix_view neighbor_candidates, + raft::host_matrix_view indices, + raft::host_matrix_view distances, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded) +{ + refine_check_input(dataset.extents(), + queries.extents(), + neighbor_candidates.extents(), + indices.extents(), + distances.extents(), + metric); + + switch (metric) { + case raft::distance::DistanceType::L2Expanded: + return refine_host_impl( + dataset, queries, neighbor_candidates, indices, distances); + case raft::distance::DistanceType::InnerProduct: + return refine_host_impl( + dataset, queries, neighbor_candidates, indices, distances); + default: throw raft::logic_error("Unsupported metric"); + } +} + +} // namespace raft::neighbors::detail diff --git a/cpp/include/raft/neighbors/detail/refine_host.hpp b/cpp/include/raft/neighbors/detail/refine_host.hpp new file mode 100644 index 0000000000..ff0de75660 --- 
/dev/null +++ b/cpp/include/raft/neighbors/detail/refine_host.hpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "refine_host-inl.hpp" +#endif + +#ifdef RAFT_COMPILED +#include "refine_host-ext.hpp" +#endif diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh index dff7b6b2ab..848703c9b5 100644 --- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh @@ -74,6 +74,18 @@ void extend(raft::resources const& handle, std::optional> new_indices, index* index) RAFT_EXPLICIT; +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr = nullptr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + template void search(raft::resources const& handle, const search_params& params, @@ -85,6 +97,15 @@ void search(raft::resources const& handle, float* distances, rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + 
raft::device_matrix_view distances, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT; + template void search(raft::resources const& handle, const search_params& params, diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh index 739e012e08..a18ee065bf 100644 --- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh @@ -357,6 +357,69 @@ void extend(raft::resources const& handle, * rmm::mr::get_current_device_resource(), 1024 * 1024); * // use default search parameters * ivf_flat::search_params search_params; + * filtering::none_ivf_sample_filter filter; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr, filter); + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr, filter); + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr, filter); + * ... + * @endcode + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @tparam T data element type + * @tparam IdxT type of the indices + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] index ivf-flat constructed index + * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()] + * @param[in] n_queries the batch size + * @param[in] k the number of neighbors to find for each query. 
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + * @param[in] mr an optional memory resource to use across the searches (you can provide a large + * enough memory pool here to avoid memory allocations within search). + * @param[in] sample_filter a filter that greenlights samples for a given query + */ +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr = nullptr, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + raft::neighbors::ivf_flat::detail::search( + handle, params, index, queries, n_queries, k, neighbors, distances, mr, sample_filter); +} + +/** + * @brief Search ANN using the constructed index using the given filter. + * + * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`: + * @code{.cpp} + * ... + * // Create a pooling memory resource with a pre-defined initial size.
+ * rmm::mr::pool_memory_resource mr( + * rmm::mr::get_current_device_resource(), 1024 * 1024); + * // use default search parameters + * ivf_flat::search_params search_params; * // Use the same allocator across multiple searches to reduce the number of * // cuda memory allocations * ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr); @@ -394,8 +457,16 @@ void search(raft::resources const& handle, float* distances, rmm::mr::device_memory_resource* mr = nullptr) { - return raft::neighbors::ivf_flat::detail::search( - handle, params, index, queries, n_queries, k, neighbors, distances, mr); + raft::neighbors::ivf_flat::detail::search(handle, + params, + index, + queries, + n_queries, + k, + neighbors, + distances, + mr, + raft::neighbors::filtering::none_ivf_sample_filter()); } /** @@ -403,6 +474,74 @@ void search(raft::resources const& handle, * @{ */ +/** + * @brief Search ANN using the constructed index using the given filter. + * + * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`: + * @code{.cpp} + * ... 
+ * // use default search parameters + * ivf_flat::search_params search_params; + * filtering::none_ivf_sample_filter filter; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries1, out_inds1, out_dists1, filter); + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries2, out_inds2, out_dists2, filter); + * ivf_flat::search_with_filtering( + * handle, search_params, index, queries3, out_inds3, out_dists3, filter); + * ... + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] index ivf-flat constructed index + * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + * @param[in] sample_filter a filter that greenlights samples for a given query + */ +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + RAFT_EXPECTS( + queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), + "Number of rows in output neighbors and distances matrices must equal the number of queries."); + + RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), + "Number of columns in output neighbors and distances matrices must be equal"); + + RAFT_EXPECTS(queries.extent(1) == index.dim(), + "Number of query dimensions should equal number of dimensions in the index."); + + search_with_filtering(handle, + params, + index, +
queries.data_handle(), + static_cast(queries.extent(0)), + static_cast(neighbors.extent(1)), + neighbors.data_handle(), + distances.data_handle(), + resource::get_workspace_resource(handle), + sample_filter); +} + /** * @brief Search ANN using the constructed index. * @@ -443,25 +582,13 @@ void search(raft::resources const& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - RAFT_EXPECTS( - queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), - "Number of rows in output neighbors and distances matrices must equal the number of queries."); - - RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), - "Number of columns in output neighbors and distances matrices must be equal"); - - RAFT_EXPECTS(queries.extent(1) == index.dim(), - "Number of query dimensions should equal number of dimensions in the index."); - - return search(handle, - params, - index, - queries.data_handle(), - static_cast(queries.extent(0)), - static_cast(neighbors.extent(1)), - neighbors.data_handle(), - distances.data_handle(), - nullptr); + search_with_filtering(handle, + params, + index, + queries, + neighbors, + distances, + raft::neighbors::filtering::none_ivf_sample_filter()); } /** @} */ diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp new file mode 100644 index 0000000000..4594332fdf --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#ifdef _RAFT_HAS_CUDA +#include +#else +#include +#endif + +namespace raft::neighbors::ivf_flat::codepacker { + +template +_RAFT_HOST_DEVICE inline auto roundDown(T x) +{ +#if defined(_RAFT_HAS_CUDA) + return Pow2::roundDown(x); +#else + return raft::round_down_safe(x, kIndexGroupSize); +#endif +} + +template +_RAFT_HOST_DEVICE inline auto mod(T x) +{ +#if defined(_RAFT_HAS_CUDA) + return Pow2::mod(x); +#else + return x % kIndexGroupSize; +#endif +} + +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @tparam T + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +template +_RAFT_HOST_DEVICE void pack_1( + const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + // using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. 
+ // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = roundDown(offset); + auto ingroup_id = mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; + } + } +} + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @tparam T + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +template +_RAFT_HOST_DEVICE void unpack_1( + const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + // using interleaved_group = Pow2; + + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = roundDown(offset); + auto ingroup_id = mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; + } + } +} +} // namespace raft::neighbors::ivf_flat::codepacker \ No newline at end of file diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh new file mode 100644 index 0000000000..096e8051c3 --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace raft::neighbors::ivf_flat::helpers { +/** + * @defgroup ivf_flat_helpers Helper functions for manipulating IVF Flat Index + * @{ + */ + +namespace codepacker { + +/** + * Write flat codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_flat::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data); + * @endcode + * + * @tparam T + * @tparam IdxT + * + * @param[in] res + * @param[in] codes flat codes [n_vec, dim] + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + * @param[inout] list_data block to write into + */ +template +void pack( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + device_mdspan::list_extents, row_major> list_data) +{ + raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, offset, list_data); +} + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`.
+ * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_flat::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view()); + * @endcode + * + * @tparam T + * @tparam IdxT + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] veclen size of interleaved data chunks + * @param[in] offset + * How many records in the list to skip. + * @param[inout] codes + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to unpack, + * it must be <= the list size. + */ +template +void unpack( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + uint32_t offset, + device_matrix_view codes) +{ + raft::neighbors::ivf_flat::detail::unpack_list_data( + res, list_data, veclen, offset, codes); +} +} // namespace codepacker +/** @} */ +} // namespace raft::neighbors::ivf_flat::helpers diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh index 5b7391569b..fcfe837e2d 100644 --- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh @@ -45,14 +45,14 @@ void extend(raft::resources const& handle, std::optional> new_indices, index* idx) RAFT_EXPLICIT; -template +template void search_with_filtering(raft::resources const& handle, const search_params& params, const index& idx, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, - SampleFilterT sample_filter) RAFT_EXPLICIT; + IvfSampleFilterT sample_filter) RAFT_EXPLICIT; template void search(raft::resources const& handle, @@ -83,7 +83,7 @@ void extend(raft::resources const& handle, const IdxT* new_indices, IdxT
n_rows) RAFT_EXPLICIT; -template +template void search_with_filtering(raft::resources const& handle, const raft::neighbors::ivf_pq::search_params& params, const index& idx, @@ -92,8 +92,7 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - SampleFilterT sample_filter = SampleFilterT()) RAFT_EXPLICIT; + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; template void search(raft::resources const& handle, @@ -103,8 +102,34 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT; + float* distances) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT; + +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const raft::neighbors::ivf_pq::search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT; } // namespace raft::neighbors::ivf_pq @@ -182,7 +207,17 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t); uint32_t k, \ IdxT* neighbors, \ float* distances, \ - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr); \ + \ + extern template void raft::neighbors::ivf_pq::search( \ + raft::resources const& handle, \ + const raft::neighbors::ivf_pq::search_params& 
params, \ + const raft::neighbors::ivf_pq::index& idx, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances) instantiate_raft_neighbors_ivf_pq_search(float, int64_t); instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t); diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh index fbe2fcb30d..ccf8717486 100644 --- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh @@ -16,17 +16,18 @@ #pragma once -#include #include #include #include #include #include +#include #include -#include -#include +#include + +#include // shared_ptr namespace raft::neighbors::ivf_pq { @@ -158,14 +159,14 @@ void extend(raft::resources const& handle, * k] * @param[in] sample_filter a filter the greenlights samples for a given query. */ -template +template void search_with_filtering(raft::resources const& handle, const search_params& params, const index& idx, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, - SampleFilterT sample_filter = SampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { RAFT_EXPECTS( queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), @@ -186,7 +187,6 @@ void search_with_filtering(raft::resources const& handle, k, neighbors.data_handle(), distances.data_handle(), - resource::get_workspace_resource(handle), sample_filter); } @@ -223,8 +223,13 @@ void search(raft::resources const& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - search_with_filtering( - handle, params, idx, queries, neighbors, distances, detail::NoneSampleFilter()); + search_with_filtering(handle, + params, + idx, + queries, + neighbors, + distances, + raft::neighbors::filtering::none_ivf_sample_filter{}); } /** @} */ // end group ivf_pq @@ -337,7 +342,49 @@ void extend(raft::resources const& handle, detail::extend(handle, 
idx, new_vectors, new_indices, n_rows); } -template +/** + * @brief Search ANN using the constructed index using the given filter. + * + * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`: + * @code{.cpp} + * ... + * // use default search parameters + * ivf_pq::search_params search_params; + * filtering::none_ivf_sample_filter filter; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_pq::search_with_filtering( + * handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, filter); + * ivf_pq::search_with_filtering( + * handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, filter); + * ivf_pq::search_with_filtering( + * handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, filter); + * ... + * @endcode + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @tparam T data element type + * @tparam IdxT type of the indices + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] idx ivf-pq constructed index + * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()] + * @param[in] n_queries the batch size + * @param[in] k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + * @param[in] sample_filter a filter that greenlights samples for a given query + */ +template +void search_with_filtering(raft::resources const& handle, const search_params& params, const index& idx, @@ -346,11 +393,41 @@ void search_with_filtering(raft::resources const& handle, uint32_t k, IdxT* neighbors, float* distances, - rmm::mr::device_memory_resource* mr = nullptr, - SampleFilterT sample_filter = SampleFilterT()) + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) { - detail::search( - handle, params, idx, queries, n_queries, k, neighbors, distances, mr, sample_filter); + detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); +} + +/** + * This function is deprecated and will be removed in a future release. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. 
+ */ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr, + IvfSampleFilterT sample_filter = IvfSampleFilterT{}) +{ + if (mr != nullptr) { + // Shallow copy of the resource with the automatic lifespan: + // change the workspace resource temporarily + raft::resources res_local(handle); + resource::set_workspace_resource( + res_local, std::shared_ptr{mr, void_op{}}); + return search_with_filtering( + res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } else { + return search_with_filtering( + handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter); + } } /** @@ -392,8 +469,6 @@ void search_with_filtering(raft::resources const& handle, * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset * [n_queries, k] * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] - * @param[in] mr an optional memory resource to use across the searches (you can provide a large - * enough memory pool here to avoid memory allocations within search). */ template void search(raft::resources const& handle, @@ -403,10 +478,46 @@ void search(raft::resources const& handle, uint32_t n_queries, uint32_t k, IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) + float* distances) +{ + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + raft::neighbors::filtering::none_ivf_sample_filter{}); +} + +/** + * This function is deprecated and will be removed in a future release. + * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead. 
+ */ +template +[[deprecated( + "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void +search(raft::resources const& handle, + const search_params& params, + const index& idx, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr) { - detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, mr); + return search_with_filtering(handle, + params, + idx, + queries, + n_queries, + k, + neighbors, + distances, + mr, + raft::neighbors::filtering::none_ivf_sample_filter{}); } } // namespace raft::neighbors::ivf_pq diff --git a/cpp/include/raft/neighbors/detail/sample_filter.cuh b/cpp/include/raft/neighbors/sample_filter_types.hpp similarity index 70% rename from cpp/include/raft/neighbors/detail/sample_filter.cuh rename to cpp/include/raft/neighbors/sample_filter_types.hpp index f5c3d91afe..5a301e9d2f 100644 --- a/cpp/include/raft/neighbors/detail/sample_filter.cuh +++ b/cpp/include/raft/neighbors/sample_filter_types.hpp @@ -19,11 +19,13 @@ #include #include -namespace raft::neighbors::ivf_pq::detail { +#include + +namespace raft::neighbors::filtering { /* A filter that filters nothing. This is the default behavior. 
*/ -struct NoneSampleFilter { - inline __device__ __host__ bool operator()( +struct none_ivf_sample_filter { + inline _RAFT_HOST_DEVICE bool operator()( // query index const uint32_t query_ix, // the current inverted list index @@ -40,20 +42,20 @@ struct NoneSampleFilter { * filter template can be used: * * template - * struct IndexSampleFilter { + * struct index_ivf_sample_filter { * using index_type = IdxT; * * const index_type* const* inds_ptr = nullptr; * - * IndexSampleFilter() {} - * IndexSampleFilter(const index_type* const* _inds_ptr) + * index_ivf_sample_filter() {} + * index_ivf_sample_filter(const index_type* const* _inds_ptr) * : inds_ptr{_inds_ptr} {} - * IndexSampleFilter(const IndexSampleFilter&) = default; - * IndexSampleFilter(IndexSampleFilter&&) = default; - * IndexSampleFilter& operator=(const IndexSampleFilter&) = default; - * IndexSampleFilter& operator=(IndexSampleFilter&&) = default; + * index_ivf_sample_filter(const index_ivf_sample_filter&) = default; + * index_ivf_sample_filter(index_ivf_sample_filter&&) = default; + * index_ivf_sample_filter& operator=(const index_ivf_sample_filter&) = default; + * index_ivf_sample_filter& operator=(index_ivf_sample_filter&&) = default; * - * inline __device__ __host__ bool operator()( + * inline _RAFT_HOST_DEVICE bool operator()( * const uint32_t query_ix, * const uint32_t cluster_ix, * const uint32_t sample_ix) const { @@ -65,7 +67,7 @@ struct NoneSampleFilter { * }; * * Initialize it as: - * using filter_type = IndexSampleFilter; + * using filter_type = index_ivf_sample_filter; * filter_type filter(raft_ivfpq_index.inds_ptrs().data_handle()); * * Use it as: @@ -78,27 +80,27 @@ struct NoneSampleFilter { * to a contiguous bit mask vector. 
* * template - * struct BitMaskSampleFilter { + * struct bitmask_ivf_sample_filter { * using index_type = IdxT; * * const index_type* const* inds_ptr = nullptr; * const uint64_t* const bit_mask_ptr = nullptr; * const int64_t bit_mask_stride_64 = 0; * - * BitMaskSampleFilter() {} - * BitMaskSampleFilter( + * bitmask_ivf_sample_filter() {} + * bitmask_ivf_sample_filter( * const index_type* const* _inds_ptr, * const uint64_t* const _bit_mask_ptr, * const int64_t _bit_mask_stride_64) * : inds_ptr{_inds_ptr}, * bit_mask_ptr{_bit_mask_ptr}, * bit_mask_stride_64{_bit_mask_stride_64} {} - * BitMaskSampleFilter(const BitMaskSampleFilter&) = default; - * BitMaskSampleFilter(BitMaskSampleFilter&&) = default; - * BitMaskSampleFilter& operator=(const BitMaskSampleFilter&) = default; - * BitMaskSampleFilter& operator=(BitMaskSampleFilter&&) = default; + * bitmask_ivf_sample_filter(const bitmask_ivf_sample_filter&) = default; + * bitmask_ivf_sample_filter(bitmask_ivf_sample_filter&&) = default; + * bitmask_ivf_sample_filter& operator=(const bitmask_ivf_sample_filter&) = default; + * bitmask_ivf_sample_filter& operator=(bitmask_ivf_sample_filter&&) = default; * - * inline __device__ __host__ bool operator()( + * inline _RAFT_HOST_DEVICE bool operator()( * const uint32_t query_ix, * const uint32_t cluster_ix, * const uint32_t sample_ix) const { @@ -113,4 +115,4 @@ struct NoneSampleFilter { * } * }; */ -} // namespace raft::neighbors::ivf_pq::detail +} // namespace raft::neighbors::filtering diff --git a/cpp/include/raft/sparse/detail/utils.h b/cpp/include/raft/sparse/detail/utils.h index 56e8832e0a..b5017451e6 100644 --- a/cpp/include/raft/sparse/detail/utils.h +++ b/cpp/include/raft/sparse/detail/utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -90,7 +90,8 @@ __global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) int tid = threadIdx.x; for (int i = tid; i < ncols; i += blockDim.x) { - indices[row * ncols + i] = i; + uint64_t idx = (uint64_t)row * (uint64_t)ncols; + indices[idx + i] = i; } } diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 630457158b..e87ef99469 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -19,9 +19,9 @@ #include #include +#include "common.hpp" #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/detail/common.hpp similarity index 95% rename from cpp/include/raft/sparse/distance/common.h rename to cpp/include/raft/sparse/distance/detail/common.hpp index 0b866bdc55..0f463dac80 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/detail/common.hpp @@ -21,6 +21,7 @@ namespace raft { namespace sparse { namespace distance { +namespace detail { template struct distances_config_t { @@ -52,6 +53,7 @@ class distances_t { virtual ~distances_t() = default; }; +}; // namespace detail }; // namespace distance -} // namespace sparse +}; // namespace sparse }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index 3a8cf53b6e..c0d5fbc365 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -26,7 +26,7 @@ #include "../../csr.hpp" #include "../../detail/utils.h" -#include "../common.h" +#include "common.hpp" #include @@ -56,10 +56,8 @@ inline void balanced_coo_pairwise_generalized_spmv( strategy_t strategy, int chunk_size = 500000) { - RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, - 0, - 
sizeof(value_t) * config_.a_nrows * config_.b_nrows, - resource::get_cuda_stream(config_.handle))); + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, resource::get_cuda_stream(config_.handle))); strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; @@ -112,10 +110,8 @@ inline void balanced_coo_pairwise_generalized_spmv( write_f write_func, int chunk_size = 500000) { - RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, - 0, - sizeof(value_t) * config_.a_nrows * config_.b_nrows, - resource::get_cuda_stream(config_.handle))); + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, resource::get_cuda_stream(config_.handle))); int max_cols = max_cols_per_block(); diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh index 138471c6cf..1c2f83c69b 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh @@ -16,7 +16,7 @@ #pragma once -#include "../../common.h" +#include "../common.hpp" #include "../coo_spmv_kernel.cuh" #include "../utils.cuh" #include "coo_mask_row_iterators.cuh" diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh index 1fbce51caf..4c061336b3 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include "../../common.h" +#include "../common.hpp" #include "../utils.cuh" #include diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index ef5bae8aa0..39e67acdea 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -23,11 +23,11 @@ #include #include +#include "common.hpp" #include #include #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index 5293b36a26..acae3dc445 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -19,12 +19,12 @@ #include #include +#include "common.hpp" #include #include #include #include #include -#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index ac78068247..ff9534a157 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -29,8 +29,8 @@ #include #include +#include "common.hpp" #include -#include #include @@ -126,11 +126,13 @@ class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t::compute(out_dists); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; // Sqrt Post-processing raft::linalg::unaryOp( out_dists, out_dists, - this->config_->a_nrows * this->config_->b_nrows, + n, [] __device__(value_t input) { int neg = input < 0 ? 
-1 : 1; return raft::sqrt(abs(input) * neg); @@ -203,10 +205,11 @@ class lp_unexpanded_distances_t : public distances_t { raft::add_op(), raft::atomic_add_op()); + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; value_t one_over_p = value_t{1} / p; raft::linalg::unaryOp(out_dists, out_dists, - config_->a_nrows * config_->b_nrows, + n, raft::pow_const_op(one_over_p), resource::get_cuda_stream(config_->handle)); } @@ -229,10 +232,11 @@ class hamming_unexpanded_distances_t : public distances_t { unexpanded_lp_distances( out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op()); + uint64_t n = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows; value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp(out_dists, out_dists, - config_->a_nrows * config_->b_nrows, + n, raft::mul_const_op(n_cols), resource::get_cuda_stream(config_->handle)); } @@ -271,10 +275,11 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { raft::add_op(), raft::atomic_add_op()); + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; raft::linalg::unaryOp( out_dists, out_dists, - config_->a_nrows * config_->b_nrows, + n, [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, resource::get_cuda_stream(config_->handle)); } @@ -311,9 +316,10 @@ class kl_divergence_unexpanded_distances_t : public distances_t { raft::add_op(), raft::atomic_add_op()); + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; raft::linalg::unaryOp(out_dists, out_dists, - config_->a_nrows * config_->b_nrows, + n, raft::mul_const_op(0.5), resource::get_cuda_stream(config_->handle)); } diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index 510e02822e..b60940341a 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, 
NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,11 @@ #pragma once -#include +#include "detail/common.hpp" #include +#include + #include #include @@ -66,7 +68,7 @@ static const std::unordered_set supportedDistance{ */ template void pairwiseDistance(value_t* out, - distances_config_t input_config, + detail::distances_config_t input_config, raft::distance::DistanceType metric, float metric_arg) { @@ -130,8 +132,94 @@ void pairwiseDistance(value_t* out, } } -}; // namespace distance -}; // namespace sparse -}; // namespace raft +/** + * @defgroup sparse_distance Sparse Pairwise Distance + * @{ + */ + +/** + * @brief Compute pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... 
+ * + * auto out = raft::make_device_matrix(handle, x_n_rows, y_n_rows); + * auto metric = raft::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @tparam DeviceCSRMatrix raft::device_csr_matrix or raft::device_csr_matrix_view + * @tparam ElementType data-type of inputs and output + * @tparam IndexType data-type for indexing + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template >> +void pairwise_distance(raft::resources const& handle, + DeviceCSRMatrix x, + DeviceCSRMatrix y, + raft::device_matrix_view dist, + raft::distance::DistanceType metric, + float metric_arg = 2.0f) +{ + auto x_structure = x.structure_view(); + auto y_structure = y.structure_view(); + + RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(), + "Number of columns must be equal"); + + RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(), + "Number of columns in output must be equal to " + "number of rows in Y"); + + detail::distances_config_t input_config(handle); + input_config.a_nrows = x_structure.get_n_rows(); + input_config.a_ncols = x_structure.get_n_cols(); + input_config.a_nnz = x_structure.get_nnz(); + input_config.a_indptr = const_cast(x_structure.get_indptr().data()); + input_config.a_indices = const_cast(x_structure.get_indices().data()); + input_config.a_data = const_cast(x.get_elements().data()); + + input_config.b_nrows = y_structure.get_n_rows(); + input_config.b_ncols = y_structure.get_n_cols(); + input_config.b_nnz = y_structure.get_nnz(); + input_config.b_indptr = 
const_cast(y_structure.get_indptr().data()); + input_config.b_indices = const_cast(y_structure.get_indices().data()); + input_config.b_data = const_cast(y.get_elements().data()); + + pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg); +} + +/** @} */ // end of sparse_distance + +}; // namespace distance +}; // namespace sparse +}; // namespace raft #endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/neighbors/connect_components.cuh b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh similarity index 65% rename from cpp/include/raft/sparse/neighbors/connect_components.cuh rename to cpp/include/raft/sparse/neighbors/cross_component_nn.cuh index fcc6ba349b..c94c6254c3 100644 --- a/cpp/include/raft/sparse/neighbors/connect_components.cuh +++ b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh @@ -19,7 +19,7 @@ #include #include #include -#include +#include namespace raft::sparse::neighbors { @@ -59,11 +59,20 @@ value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream * @param[in] orig_colors array containing component number for each row of X * @param[in] n_rows number of rows in X * @param[in] n_cols number of cols in X - * @param[in] reduction_op - * @param[in] metric + * @param[in] reduction_op reduction operation for computing nearest neighbors. The reduction + * operation must have `gather` and `scatter` functions defined + * @param[in] row_batch_size the batch size for computing nearest neighbors. This parameter controls + * the number of samples for which the nearest neighbors are computed at once. Therefore, it affects + * the memory consumption mainly by reducing the size of the adjacency matrix for masked nearest + * neighbors computation + * @param[in] col_batch_size the input data is sorted and 'unsorted' based on color. An additional + * scratch space buffer of shape (n_rows, col_batch_size) is created for this. 
Usually, this + * parameter affects the memory consumption more drastically than the row_batch_size with a marginal + * increase in compute time as the col_batch_size is reduced + * @param[in] metric distance metric */ template -void connect_components( +void cross_component_nn( raft::resources const& handle, raft::sparse::COO& out, const value_t* X, @@ -71,9 +80,20 @@ void connect_components( size_t n_rows, size_t n_cols, red_op reduction_op, + size_t row_batch_size = 0, + size_t col_batch_size = 0, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric); + detail::cross_component_nn(handle, + out, + X, + orig_colors, + n_rows, + n_cols, + reduction_op, + row_batch_size, + col_batch_size, + metric); } }; // end namespace raft::sparse::neighbors \ No newline at end of file diff --git a/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh similarity index 56% rename from cpp/include/raft/sparse/neighbors/detail/connect_components.cuh rename to cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh index f089cbea83..3570be2b5c 100644 --- a/cpp/include/raft/sparse/neighbors/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh @@ -15,25 +15,29 @@ */ #pragma once +#include #include +#include +#include +#include #include #include - #include -#include +#include #include +#include #include +#include +#include #include #include #include #include #include - +#include #include -#include -#include #include #include #include @@ -43,6 +47,9 @@ #include #include +#include +#include + #include #include @@ -50,26 +57,24 @@ namespace raft::sparse::neighbors::detail { /** - * Functor with reduction ops for performing fused 1-nn - * computation and guaranteeing only cross-component - * neighbors are considered. 
+ * Base functor with reduction ops for performing masked 1-nn + * computation. * @tparam value_idx * @tparam value_t */ template struct FixConnectivitiesRedOp { - value_idx* colors; value_idx m; // default constructor for cutlass - DI FixConnectivitiesRedOp() : colors(0), m(0) {} + DI FixConnectivitiesRedOp() : m(0) {} - FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx m_) : m(m_){}; typedef typename raft::KeyValuePair KVP; DI void operator()(value_idx rit, KVP* out, const KVP& other) const { - if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { + if (rit < m && other.value < out->value) { out->key = other.key; out->value = other.value; } @@ -77,7 +82,7 @@ struct FixConnectivitiesRedOp { DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) const { - if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { + if (rit < m && a.value < b.value) { return a; } else return b; @@ -96,6 +101,13 @@ struct FixConnectivitiesRedOp { DI value_t get_value(KVP& out) const { return out.value; } DI value_t get_value(value_t& out) const { return out; } + + /** The gather and scatter ensure that operator() is still consistent after rearranging the data. + * TODO (tarang-jain): refactor cross_component_nn API to separate out the gather and scatter + * functions from the reduction op. 
Reference: https://github.com/rapidsai/raft/issues/1614 */ + void gather(const raft::resources& handle, value_idx* map) {} + + void scatter(const raft::resources& handle, value_idx* map) {} }; /** @@ -182,6 +194,7 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t + * @param[in] handle raft handle * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given * array of components * @param[out] nn_colors components of nearest neighbors for each vertex @@ -189,41 +202,141 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] stream cuda stream for which to order cuda operations + * @param[in] row_batch_size row batch size for computing nearest neighbors + * @param[in] col_batch_size column batch size for sorting and 'unsorting' + * @param[in] reduction_op reduction operation for computing nearest neighbors */ template -void perform_1nn(raft::KeyValuePair* kvp, +void perform_1nn(raft::resources const& handle, + raft::KeyValuePair* kvp, value_idx* nn_colors, value_idx* colors, const value_t* X, size_t n_rows, size_t n_cols, - cudaStream_t stream, + size_t row_batch_size, + size_t col_batch_size, red_op reduction_op) { - rmm::device_uvector workspace(n_rows, stream); - rmm::device_uvector x_norm(n_rows, stream); - - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); - - raft::distance::fusedL2NN, value_idx>( - kvp, - X, - X, - x_norm.data(), - x_norm.data(), - n_rows, - n_rows, - n_cols, - workspace.data(), - reduction_op, - reduction_op, - true, - true, - stream); + auto stream = resource::get_cuda_stream(handle); + auto exec_policy = resource::get_thrust_policy(handle); + + auto sort_plan = raft::make_device_vector(handle, (value_idx)n_rows); + raft::linalg::map_offset(handle, sort_plan.view(), [] 
__device__(value_idx idx) { return idx; }); + + thrust::sort_by_key( + resource::get_thrust_policy(handle), colors, colors + n_rows, sort_plan.data_handle()); + + // Modify the reduction operation based on the sort plan. + reduction_op.gather(handle, sort_plan.data_handle()); + + auto X_mutable_view = + raft::make_device_matrix_view(const_cast(X), n_rows, n_cols); + auto sort_plan_const_view = + raft::make_device_vector_view(sort_plan.data_handle(), n_rows); + raft::matrix::gather(handle, X_mutable_view, sort_plan_const_view, (value_idx)col_batch_size); + + // Get the number of unique components from the array of colors + value_idx n_components = get_n_components(colors, n_rows, stream); + + // colors_group_idxs is an array containing the *end* indices of each color + // component in colors. That is, the value of colors_group_idxs[j] indicates + // the start of color j + 1, i.e., it is the inclusive scan of the sizes of + // the color components. + auto colors_group_idxs = raft::make_device_vector(handle, n_components + 1); + raft::sparse::convert::sorted_coo_to_csr( + colors, n_rows, colors_group_idxs.data_handle(), n_components + 1, stream); + + auto group_idxs_view = raft::make_device_vector_view( + colors_group_idxs.data_handle() + 1, n_components); + + auto x_norm = raft::make_device_vector(handle, (value_idx)n_rows); + raft::linalg::rowNorm( + x_norm.data_handle(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); + + auto adj = raft::make_device_matrix(handle, row_batch_size, n_components); + using OutT = raft::KeyValuePair; + using ParamT = raft::distance::masked_l2_nn_params; + + bool apply_sqrt = true; + bool init_out_buffer = true; + ParamT params{reduction_op, reduction_op, apply_sqrt, init_out_buffer}; + + auto X_full_view = raft::make_device_matrix_view(X, n_rows, n_cols); + + size_t n_batches = raft::ceildiv(n_rows, row_batch_size); + + for (size_t bid = 0; bid < n_batches; bid++) { + size_t batch_offset = bid * row_batch_size; + size_t 
rows_per_batch = min(row_batch_size, n_rows - batch_offset); + + auto X_batch_view = raft::make_device_matrix_view( + X + batch_offset * n_cols, rows_per_batch, n_cols); + + auto x_norm_batch_view = raft::make_device_vector_view( + x_norm.data_handle() + batch_offset, rows_per_batch); + + auto mask_op = [colors, + n_components = raft::util::FastIntDiv(n_components), + batch_offset] __device__(value_idx idx) { + value_idx row = idx / n_components; + value_idx col = idx % n_components; + return colors[batch_offset + row] != col; + }; + + auto adj_vector_view = raft::make_device_vector_view( + adj.data_handle(), rows_per_batch * n_components); + + raft::linalg::map_offset(handle, adj_vector_view, mask_op); + + auto adj_view = raft::make_device_matrix_view( + adj.data_handle(), rows_per_batch, n_components); + + auto kvp_view = + raft::make_device_vector_view, value_idx>( + kvp + batch_offset, rows_per_batch); + + raft::distance::masked_l2_nn(handle, + params, + X_batch_view, + X_full_view, + x_norm_batch_view, + x_norm.view(), + adj_view, + group_idxs_view, + kvp_view); + } + + // Transform the keys so that they correctly point to the unpermuted indices. + thrust::transform(exec_policy, + kvp, + kvp + n_rows, + kvp, + [sort_plan = sort_plan.data_handle()] __device__(OutT KVP) { + OutT res; + res.value = KVP.value; + res.key = sort_plan[KVP.key]; + return res; + }); + + // Undo permutation of the rows of X by scattering in place. + raft::matrix::scatter(handle, X_mutable_view, sort_plan_const_view, (value_idx)col_batch_size); + + // Undo permutation of the key-value pair and color vectors. This is not done + // inplace, so using two temporary vectors. 
+ auto tmp_colors = raft::make_device_vector(handle, n_rows); + auto tmp_kvp = raft::make_device_vector(handle, n_rows); + + thrust::scatter(exec_policy, kvp, kvp + n_rows, sort_plan.data_handle(), tmp_kvp.data_handle()); + thrust::scatter( + exec_policy, colors, colors + n_rows, sort_plan.data_handle(), tmp_colors.data_handle()); + reduction_op.scatter(handle, sort_plan.data_handle()); + + raft::copy_async(colors, tmp_colors.data_handle(), n_rows, stream); + raft::copy_async(kvp, tmp_kvp.data_handle(), n_rows, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); + thrust::transform(exec_policy, kvp, kvp + n_rows, nn_colors, extract_colors_op); } /** @@ -239,22 +352,22 @@ void perform_1nn(raft::KeyValuePair* kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx* colors, +void sort_by_color(raft::resources const& handle, + value_idx* colors, value_idx* nn_colors, raft::KeyValuePair* kvp, value_idx* src_indices, - size_t n_rows, - cudaStream_t stream) + size_t n_rows) { + auto exec_policy = resource::get_thrust_policy(handle); thrust::counting_iterator arg_sort_iter(0); - thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); + thrust::copy(exec_policy, arg_sort_iter, arg_sort_iter + n_rows, src_indices); auto keys = thrust::make_zip_iterator( thrust::make_tuple(colors, nn_colors, (KeyValuePair*)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); - // get all the colors in contiguous locations so we can map them to warps. 
- thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); + thrust::sort_by_key(exec_policy, keys, keys + n_rows, vals, TupleComp()); } template @@ -285,9 +398,7 @@ __global__ void min_components_by_color_kernel(value_idx* out_rows, * @tparam value_idx * @tparam value_t * @param[out] coo output edge list - * @param[in] out_indptr output indptr for ordering edge list - * @param[in] colors_indptr indptr of source components - * @param[in] colors_nn components of nearest neighbors to each source component + * @param[in] out_index output indptr for ordering edge list * @param[in] indices indices of source vertices for each component * @param[in] kvp indices and distances of each destination vertex for each component * @param[in] n_colors number of components @@ -324,12 +435,24 @@ void min_components_by_color(raft::sparse::COO& coo, * @param[out] out output edge list containing nearest cross-component * edges. * @param[in] X original (row-major) dense matrix for which knn graph should be constructed. - * @param[in] colors array containing component number for each row of X + * @param[in] orig_colors array containing component number for each row of X * @param[in] n_rows number of rows in X * @param[in] n_cols number of cols in X + * @param[in] reduction_op reduction operation for computing nearest neighbors. The reduction + * operation must have `gather` and `scatter` functions defined + * @param[in] row_batch_size the batch size for computing nearest neighbors. This parameter controls + * the number of samples for which the nearest neighbors are computed at once. Therefore, it affects + * the memory consumption mainly by reducing the size of the adjacency matrix for masked nearest + * neighbors computation. default 0 indicates that no batching is done + * @param[in] col_batch_size the input data is sorted and 'unsorted' based on color. An additional + * scratch space buffer of shape (n_rows, col_batch_size) is created for this. 
Usually, this + * parameter affects the memory consumption more drastically than the row_batch_size with a marginal + * increase in compute time as the col_batch_size is reduced. default 0 indicates that no batching + * is done + * @param[in] metric distance metric */ template -void connect_components( +void cross_component_nn( raft::resources const& handle, raft::sparse::COO& out, const value_t* X, @@ -337,6 +460,8 @@ void connect_components( size_t n_rows, size_t n_cols, red_op reduction_op, + size_t row_batch_size, + size_t col_batch_size, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { auto stream = resource::get_cuda_stream(handle); @@ -345,13 +470,16 @@ void connect_components( "Fixing connectivities for an unconnected k-NN graph only " "supports L2SqrtExpanded currently."); + if (row_batch_size == 0 || row_batch_size > n_rows) { row_batch_size = n_rows; } + + if (col_batch_size == 0 || col_batch_size > n_cols) { col_batch_size = n_cols; } + rmm::device_uvector colors(n_rows, stream); - raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, true); - - value_idx n_components = get_n_components(colors.data(), n_rows, stream); + constexpr bool zero_based = true; + raft::label::make_monotonic( + colors.data(), const_cast(orig_colors), n_rows, stream, zero_based); /** * First compute 1-nn for all colors where the color of each data point @@ -361,13 +489,15 @@ void connect_components( rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), + perform_1nn(handle, + temp_inds_dists.data(), nn_colors.data(), colors.data(), X, n_rows, n_cols, - stream, + row_batch_size, + col_batch_size, reduction_op); /** @@ -376,7 +506,7 @@ void connect_components( // max_color + 1 = number of connected
components // sort nn_colors by key w/ original colors sort_by_color( - colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); + handle, colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows); /** * Take the min for any duplicate colors diff --git a/cpp/include/raft/sparse/neighbors/detail/knn.cuh b/cpp/include/raft/sparse/neighbors/detail/knn.cuh index 7d7bcba443..f2be427367 100644 --- a/cpp/include/raft/sparse/neighbors/detail/knn.cuh +++ b/cpp/include/raft/sparse/neighbors/detail/knn.cuh @@ -231,7 +231,8 @@ class sparse_knn_t { /** * Compute distances */ - size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); + uint64_t dense_size = + (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows(); rmm::device_uvector batch_dists(dense_size, resource::get_cuda_stream(handle)); RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); @@ -390,7 +391,7 @@ class sparse_knn_t { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config(handle); + raft::sparse::distance::detail::distances_config_t dist_config(handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; dist_config.b_nnz = idx_batch_nnz; diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh index 61378d71d8..00c5317b5c 100644 --- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh @@ -126,7 +126,6 @@ void knn_graph(raft::resources const& handle, // pass value_idx through to knn. 
rmm::device_uvector int64_indices(nnz, stream); - uint32_t knn_start = curTimeMillis(); raft::spatial::knn::brute_force_knn(handle, inputs, sizes, diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/cross_component_nn.cuh similarity index 87% rename from cpp/include/raft/sparse/selection/connect_components.cuh rename to cpp/include/raft/sparse/selection/cross_component_nn.cuh index 9bc3f1553a..e115d6c061 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/cross_component_nn.cuh @@ -19,7 +19,7 @@ */ /** - * DISCLAIMER: this file is deprecated: use connect_components.cuh instead + * DISCLAIMER: this file is deprecated: use cross_component_nn.cuh instead */ #pragma once @@ -28,10 +28,10 @@ " is deprecated and will be removed in a future release." \ " Please use the sparse/spatial version instead.") -#include +#include namespace raft::linkage { -using raft::sparse::neighbors::connect_components; +using raft::sparse::neighbors::cross_component_nn; using raft::sparse::neighbors::FixConnectivitiesRedOp; using raft::sparse::neighbors::get_n_components; } // namespace raft::linkage \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 850b741dfd..1ce041d8da 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -466,7 +466,7 @@ struct batch_load_iterator { if (source_ == nullptr) { return; } if (needs_copy_) { if (size() > 0) { - RAFT_LOG_DEBUG("batch_load_iterator::copy(offset = %zu, size = %zu, row_width = %zu)", + RAFT_LOG_TRACE("batch_load_iterator::copy(offset = %zu, size = %zu, row_width = %zu)", size_t(offset()), size_t(size()), size_t(row_width())); diff --git a/cpp/include/raft/spatial/knn/knn.cuh b/cpp/include/raft/spatial/knn/knn.cuh index 7b088316a3..3c089b1d22 100644 --- 
a/cpp/include/raft/spatial/knn/knn.cuh +++ b/cpp/include/raft/spatial/knn/knn.cuh @@ -50,8 +50,8 @@ namespace raft::spatial::knn { * @param translations */ template -inline void knn_merge_parts(value_t* in_keys, - idx_t* in_values, +inline void knn_merge_parts(const value_t* in_keys, + const idx_t* in_values, value_t* out_keys, idx_t* out_values, size_t n_samples, diff --git a/cpp/include/raft/util/bitonic_sort.cuh b/cpp/include/raft/util/bitonic_sort.cuh index 46670d39bd..eb4f546f7d 100644 --- a/cpp/include/raft/util/bitonic_sort.cuh +++ b/cpp/include/raft/util/bitonic_sort.cuh @@ -60,17 +60,17 @@ _RAFT_DEVICE _RAFT_FORCEINLINE void conditional_assign(bool cond, T& ptr, T x) * 3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 48 49 50 ... * ` * - * Here is a small usage example of device code, which sorts the arrays of length 6 (= 3 * 2) + * Here is a small usage example of device code, which sorts the arrays of length 8 (= 4 * 2) * grouped in pairs of threads in ascending order: * @code{.cpp} - * // Fill an array of three ints in each thread of a warp. + * // Fill an array of four ints in each thread of a warp. * int i = laneId(); - * int arr[3] = {i+1, i+5, i}; + * int arr[4] = {i+1, i+5, i, i+7}; * // Sort the arrays in groups of two threads. 
- * bitonic<3>(ascending=true, warp_width=2).sort(arr); + * bitonic<4>(ascending=true, warp_width=2).sort(arr); * // As a result, - * // for every even thread (`i == 2j`): arr == {2j, 2j+1, 2j+5} - * // for every odd thread (`i == 2j+1`): arr == {2j+1, 2j+2, 2j+6} + * // for every even thread (`i == 2j`): arr == {2j, 2j+1, 2j+5, 2j+7} + * // for every odd thread (`i == 2j+1`): arr == {2j+1, 2j+2, 2j+6, 2j+8} * @endcode * * @tparam Size diff --git a/cpp/include/raft/util/cuda_rt_essentials.hpp b/cpp/include/raft/util/cuda_rt_essentials.hpp index e5f3af4e61..77612f97bc 100644 --- a/cpp/include/raft/util/cuda_rt_essentials.hpp +++ b/cpp/include/raft/util/cuda_rt_essentials.hpp @@ -23,6 +23,8 @@ #include #include +#include + namespace raft { /** @@ -58,3 +60,38 @@ struct cuda_error : public raft::exception { throw raft::cuda_error(msg); \ } \ } while (0) + +/** + * @brief Debug macro to check for CUDA errors + * + * In a non-release build, this macro will synchronize the specified stream + * before error checking. In both release and non-release builds, this macro + * checks for any pending CUDA errors from previous calls. If an error is + * reported, an exception is thrown detailing the CUDA error that occurred. + * + * The intent of this macro is to provide a mechanism for synchronous and + * deterministic execution for debugging asynchronous CUDA execution. It should + * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an + * asynchronous kernel launch. + */ +#ifndef NDEBUG +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); +#else +#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */ +#define RAFT_CUDA_TRY_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ + } while (0) diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh index 0523dcc81c..e718ca3545 100644 --- a/cpp/include/raft/util/cuda_utils.cuh +++ b/cpp/include/raft/util/cuda_utils.cuh @@ -20,6 +20,11 @@ #include #include +#if defined(_RAFT_HAS_CUDA) +#include +#include +#endif + #include #include #include @@ -79,6 +84,35 @@ DI void myAtomicReduce(float* address, float val, ReduceLambda op) } while (assumed != old); } +// Needed for atomicCas on ushort +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +template +DI void myAtomicReduce(__half* address, __half val, ReduceLambda op) +{ + unsigned short int* address_as_uint = (unsigned short int*)address; + unsigned short int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS(address_as_uint, assumed, __half_as_ushort(op(val, __ushort_as_half(assumed)))); + } while (assumed != old); +} +#endif + +// Needed for nv_bfloat16 support +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) +template +DI void myAtomicReduce(nv_bfloat16* address, nv_bfloat16 val, ReduceLambda op) +{ + unsigned short int* address_as_uint = (unsigned short int*)address; + unsigned short int old = *address_as_uint, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_uint, assumed, __bfloat16_as_ushort(op(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); +} +#endif + template DI void myAtomicReduce(int* address, int val, ReduceLambda op) { diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp index f3b083ac4a..743ffd743c 100644 --- a/cpp/include/raft/util/cudart_utils.hpp +++ b/cpp/include/raft/util/cudart_utils.hpp @@ -34,41 +34,6 @@ #include 
#include -/** - * @brief Debug macro to check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream - * before error checking. In both release and non-release builds, this macro - * checks for any pending CUDA errors from previous calls. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - */ -#ifndef NDEBUG -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); -#else -#define RAFT_CHECK_CUDA(stream) RAFT_CUDA_TRY(cudaPeekAtLastError()); -#endif - -// /** -// * @brief check for cuda runtime API errors but log error instead of raising -// * exception. -// */ -#define RAFT_CUDA_TRY_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ - } while (0) - namespace raft { /** Helper method to get to know warp size in device code */ diff --git a/cpp/include/raft_runtime/neighbors/cagra.hpp b/cpp/include/raft_runtime/neighbors/cagra.hpp new file mode 100644 index 0000000000..6f56302776 --- /dev/null +++ b/cpp/include/raft_runtime/neighbors/cagra.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace raft::runtime::neighbors::cagra { + +// Using device and host_matrix_view avoids needing to typedef multiple mdspans based on accessors +#define RAFT_INST_CAGRA_FUNCS(T, IdxT) \ + auto build(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset) \ + ->raft::neighbors::cagra::index; \ + \ + auto build(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset) \ + ->raft::neighbors::cagra::index; \ + \ + void build_device(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset, \ + raft::neighbors::cagra::index& idx); \ + \ + void build_host(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset, \ + raft::neighbors::cagra::index& idx); \ + \ + void search(raft::resources const& handle, \ + raft::neighbors::cagra::search_params const& params, \ + const raft::neighbors::cagra::index& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + void serialize_file(raft::resources const& handle, \ + const std::string& filename, \ + const raft::neighbors::cagra::index& index); \ + \ + void deserialize_file(raft::resources const& handle, \ + const std::string& filename, \ + raft::neighbors::cagra::index* index); \ + void
serialize(raft::resources const& handle, \ + std::string& str, \ + const raft::neighbors::cagra::index& index); \ + \ + void deserialize(raft::resources const& handle, \ + const std::string& str, \ + raft::neighbors::cagra::index* index); + +RAFT_INST_CAGRA_FUNCS(float, uint32_t); +RAFT_INST_CAGRA_FUNCS(int8_t, uint32_t); +RAFT_INST_CAGRA_FUNCS(uint8_t, uint32_t); + +#undef RAFT_INST_CAGRA_FUNCS + +#define RAFT_INST_CAGRA_OPTIMIZE(IdxT) \ + void optimize_device(raft::resources const& res, \ + raft::device_matrix_view knn_graph, \ + raft::host_matrix_view new_graph); \ + \ + void optimize_host(raft::resources const& res, \ + raft::host_matrix_view knn_graph, \ + raft::host_matrix_view new_graph); + +RAFT_INST_CAGRA_OPTIMIZE(uint32_t); + +#undef RAFT_INST_CAGRA_OPTIMIZE + +} // namespace raft::runtime::neighbors::cagra diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh index 013a61886f..b72e67580a 100644 --- a/cpp/internal/raft_internal/matrix/select_k.cuh +++ b/cpp/internal/raft_internal/matrix/select_k.cuh @@ -101,10 +101,15 @@ void select_k_impl(const resources& handle, if (in_idx == nullptr) { // NB: std::nullopt prevents automatic inference of the template parameters. 
return matrix::select_k( - handle, in_span, std::nullopt, out_span, out_idx_span, select_min); + handle, in_span, std::nullopt, out_span, out_idx_span, select_min, true); } else { - return matrix::select_k( - handle, in_span, std::make_optional(in_idx_span), out_span, out_idx_span, select_min); + return matrix::select_k(handle, + in_span, + std::make_optional(in_idx_span), + out_span, + out_idx_span, + select_min, + true); } } case Algo::kRadix8bits: diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh index 3ad055272b..8565735672 100644 --- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -78,7 +79,8 @@ __global__ void naive_distance_kernel(EvalT* dist, * when either distance or brute_force_knn support 8-bit int inputs. */ template -void naive_knn(EvalT* dist_topk, +void naive_knn(raft::resources const& handle, + EvalT* dist_topk, IdxT* indices_topk, const DataT* x, const DataT* y, @@ -86,12 +88,12 @@ void naive_knn(EvalT* dist_topk, size_t input_len, size_t dim, uint32_t k, - raft::distance::DistanceType type, - rmm::cuda_stream_view stream) + raft::distance::DistanceType type) { rmm::mr::device_memory_resource* mr = nullptr; auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024); + auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1); // maximum reasonable grid size in `y` direction auto grid_y = @@ -109,7 +111,8 @@ void naive_knn(EvalT* dist_topk, naive_distance_kernel<<>>( dist.data(), x + offset * dim, y, batch_size, input_len, dim, type); - matrix::detail::select_k(dist.data(), + matrix::detail::select_k(handle, + dist.data(), nullptr, batch_size, input_len, @@ -117,7 +120,6 @@ void naive_knn(EvalT* dist_topk, dist_topk + offset * k, indices_topk + offset * k, type != raft::distance::DistanceType::InnerProduct, - stream, 
mr); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); diff --git a/cpp/internal/raft_internal/neighbors/refine_helper.cuh b/cpp/internal/raft_internal/neighbors/refine_helper.cuh index 67217d1e0e..ee06d90851 100644 --- a/cpp/internal/raft_internal/neighbors/refine_helper.cuh +++ b/cpp/internal/raft_internal/neighbors/refine_helper.cuh @@ -80,7 +80,8 @@ class RefineHelper { { candidates = raft::make_device_matrix(handle_, p.n_queries, p.k0); rmm::device_uvector distances_tmp(p.n_queries * p.k0, stream_); - naive_knn(distances_tmp.data(), + naive_knn(handle_, + distances_tmp.data(), candidates.data_handle(), queries.data_handle(), dataset.data_handle(), @@ -88,8 +89,7 @@ class RefineHelper { p.n_rows, p.dim, p.k0, - p.metric, - stream_); + p.metric); resource::sync_stream(handle_, stream_); } @@ -112,7 +112,8 @@ class RefineHelper { { rmm::device_uvector distances_dev(p.n_queries * p.k, stream_); rmm::device_uvector indices_dev(p.n_queries * p.k, stream_); - naive_knn(distances_dev.data(), + naive_knn(handle_, + distances_dev.data(), indices_dev.data(), queries.data_handle(), dataset.data_handle(), @@ -120,8 +121,7 @@ class RefineHelper { p.n_rows, p.dim, p.k, - p.metric, - stream_); + p.metric); true_refined_distances_host.resize(p.n_queries * p.k); true_refined_indices_host.resize(p.n_queries * p.k); raft::copy(true_refined_indices_host.data(), indices_dev.data(), indices_dev.size(), stream_); diff --git a/cpp/src/matrix/detail/select_k_double_int64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu index 022627283a..c75a5b5261 100644 --- a/cpp/src/matrix/detail/select_k_double_int64_t.cu +++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - 
rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(double, int64_t); diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu index 22c6989337..171c8a1ae7 100644 --- a/cpp/src/matrix/detail/select_k_double_uint32_t.cu +++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu @@ -17,17 +17,18 @@ #include // uint32_t #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(double, uint32_t); diff --git a/cpp/src/matrix/detail/select_k_float_int32.cu b/cpp/src/matrix/detail/select_k_float_int32.cu index 42094bbb67..a21444dc0c 100644 --- a/cpp/src/matrix/detail/select_k_float_int32.cu +++ b/cpp/src/matrix/detail/select_k_float_int32.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, 
\ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(float, int); diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu index 1f1d686048..9542874ec0 100644 --- a/cpp/src/matrix/detail/select_k_float_int64_t.cu +++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(float, int64_t); diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu index 3bb47acbf2..fbf311d9bd 100644 --- a/cpp/src/matrix/detail/select_k_float_uint32_t.cu +++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, 
\ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(float, uint32_t); diff --git a/cpp/src/matrix/detail/select_k_half_int64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu index cf4e15959d..fdbfd66c46 100644 --- a/cpp/src/matrix/detail/select_k_half_int64_t.cu +++ b/cpp/src/matrix/detail/select_k_half_int64_t.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(__half, int64_t); diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu index b18887bfc0..48a3e91f9d 100644 --- a/cpp/src/matrix/detail/select_k_half_uint32_t.cu +++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu @@ -16,17 +16,18 @@ #include -#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ - template void raft::matrix::detail::select_k(const T* in_val, \ - const IdxT* in_idx, \ - size_t 
batch_size, \ - size_t len, \ - int k, \ - T* out_val, \ - IdxT* out_idx, \ - bool select_min, \ - rmm::cuda_stream_view stream, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_matrix_detail_select_k(T, IdxT) \ + template void raft::matrix::detail::select_k(raft::resources const& handle, \ + const T* in_val, \ + const IdxT* in_idx, \ + size_t batch_size, \ + size_t len, \ + int k, \ + T* out_val, \ + IdxT* out_idx, \ + bool select_min, \ + rmm::mr::device_memory_resource* mr, \ + bool sorted) instantiate_raft_matrix_detail_select_k(__half, uint32_t); diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py new file mode 100644 index 0000000000..784d116503 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_00_generate.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +header = """ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \\ + template void select_and_run( \\ + raft::device_matrix_view dataset, \\ + raft::device_matrix_view graph, \\ + INDEX_T* const topk_indices_ptr, \\ + DISTANCE_T* const topk_distances_ptr, \\ + const DATA_T* const queries_ptr, \\ + const uint32_t num_queries, \\ + const INDEX_T* dev_seed_ptr, \\ + uint32_t* const num_executed_iterations, \\ + uint32_t topk, \\ + uint32_t block_size, \\ + uint32_t result_buffer_size, \\ + uint32_t smem_size, \\ + int64_t hash_bitlen, \\ + INDEX_T* hashmap_ptr, \\ + uint32_t num_cta_per_query, \\ + uint32_t num_random_samplings, \\ + uint64_t rand_xor_mask, \\ + uint32_t num_seeds, \\ + size_t itopk_size, \\ + size_t search_width, \\ + size_t min_iterations, \\ + size_t max_iterations, \\ + cudaStream_t stream); + +""" + +trailer = """ +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search +""" + +mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] +# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] +# mxelem = [64, 128, 256] +load_types = ["uint4"] +search_types = dict( + float_uint32=( + "float", + "uint32_t", + "float", + ), # data_t, vec_idx_t, distance_t + int8_uint32=("int8_t", "uint32_t", "float"), + uint8_uint32=("uint8_t", "uint32_t", "float"), + float_uint64=("float", "uint64_t", "float"), +) +# knn +for type_path, (data_t, idx_t, distance_t) in search_types.items(): + for (mxdim, team) in mxdim_team: + path = f"search_multi_cta_{type_path}_dim{mxdim}_t{team}.cu" + with open(path,
"w") as f: + f.write(header) + f.write( + f"instantiate_kernel_selection({team}, {mxdim}, {data_t}, {idx_t}, {distance_t});\n" + ) + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..2a4e7ac607 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim1024_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, float, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu new file mode 100644 index 0000000000..115ce3b48b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim128_t8.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(8, 128, float, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu new file mode 100644 index 0000000000..c5e704a85f --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim256_t16.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(16, 256, float, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu new file mode 100644 index 0000000000..3469facf39 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint32_dim512_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t 
max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 512, float, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu new file mode 100644 index 0000000000..327bfc73b4 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, float, uint64_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu new file mode 100644 index 0000000000..1abe0cd8af --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(8, 128, float, uint64_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu new file mode 100644 index 0000000000..dd61810d06 --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(16, 256, float, uint64_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu new file mode 100644 index 0000000000..8e12bab514 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t 
max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 512, float, uint64_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..d946ac9c79 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim1024_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, int8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu new file mode 100644 index 0000000000..e4d7b44d1e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim128_t8.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(8, 128, int8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu new file mode 100644 index 0000000000..b8dc3b38a8 --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim256_t16.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(16, 256, int8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu new file mode 100644 index 0000000000..749b35bad6 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_int8_uint32_dim512_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, 
\ + cudaStream_t stream); + +instantiate_kernel_selection(32, 512, int8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..428d460ba8 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim1024_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, uint8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu new file mode 100644 index 0000000000..28a20b865e --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim128_t8.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(8, 128, uint8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu new file mode 100644 index 0000000000..e85a84ae8e --- /dev/null +++ 
b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim256_t16.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(16, 256, uint8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git 
a/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu new file mode 100644 index 0000000000..232b62ebcd --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_uint8_uint32_dim512_t32.cu @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_multi_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_multi_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::multi_cta_search { + +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t 
max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 512, uint8_t, uint32_t, float); + +#undef instantiate_kernel_selection + +} // namespace raft::neighbors::cagra::detail::multi_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py new file mode 100644 index 0000000000..cf61a45b4a --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_00_generate.py @@ -0,0 +1,110 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +header = """ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \\ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \\ + template void select_and_run( \\ + raft::device_matrix_view dataset, \\ + raft::device_matrix_view graph, \\ + INDEX_T* const topk_indices_ptr, \\ + DISTANCE_T* const topk_distances_ptr, \\ + const DATA_T* const queries_ptr, \\ + const uint32_t num_queries, \\ + const INDEX_T* dev_seed_ptr, \\ + uint32_t* const num_executed_iterations, \\ + uint32_t topk, \\ + uint32_t num_itopk_candidates, \\ + uint32_t block_size, \\ + uint32_t smem_size, \\ + int64_t hash_bitlen, \\ + INDEX_T* hashmap_ptr, \\ + size_t small_hash_bitlen, \\ + size_t small_hash_reset_interval, \\ + uint32_t num_random_samplings, \\ + uint64_t rand_xor_mask, \\ + uint32_t num_seeds, \\ + size_t itopk_size, \\ + size_t search_width, \\ + size_t min_iterations, \\ + size_t max_iterations, \\ + cudaStream_t stream); + +""" + +trailer = """ +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search +""" + +mxdim_team = [(128, 8), (256, 16), (512, 32), (1024, 32)] +# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)] +# itopk_candidates = [64, 128, 256] +# itopk_size = [64, 128, 256, 512] +# mxelem = [64, 128, 256] + +# rblock = [(256, 4), (512, 2), (1024, 1)] +# rcandidates = [32] +# rsize = [256, 512] + +search_types = dict( + float_uint32=("float", "uint32_t", "float"), # data_t, idx_t, distance_t + int8_uint32=("int8_t", "uint32_t", "float"), + uint8_uint32=("uint8_t", "uint32_t", "float"), + float_uint64=("float", "uint64_t", "float"), +) + +# knn +for type_path, (data_t, idx_t, distance_t) in search_types.items(): + for (mxdim, team) in mxdim_team: + path
= f"search_single_cta_{type_path}_dim{mxdim}_t{team}.cu" + with open(path, "w") as f: + f.write(header) + f.write( + f"instantiate_single_cta_select_and_run({team}, {mxdim},{data_t}, {idx_t}, {distance_t});\n" + ) + + f.write(trailer) + # For pasting into CMakeLists.txt + print(f"src/neighbors/detail/cagra/{path}") diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..eb45d4ff08 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim1024_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 1024, float, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu new file mode 100644 index 0000000000..049715aa20 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim128_t8.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(8, 128, float, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu new file mode 100644 index 0000000000..6028c283db --- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim256_t16.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(16, 256, float, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} //
namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu new file mode 100644 index 0000000000..2566e9cbd9 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint32_dim512_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 512, float, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu new file mode 100644 index 0000000000..4cd96ad9c0 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu new file mode 100644 index 0000000000..822a2efb2f --- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float); + +#undef instantiate_single_cta_select_and_run + +} //
namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu new file mode 100644 index 0000000000..80d1f76b9b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu new file mode 100644 index 0000000000..06c3eaf10b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..b4c30ac943 --- /dev/null
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim1024_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 1024, int8_t, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} //
namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu new file mode 100644 index 0000000000..c8d0df3ac4 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim128_t8.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, 
\ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(8, 128, int8_t, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu new file mode 100644 index 0000000000..19ecee91af --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim256_t16.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(16, 256, int8_t, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu new file mode 100644 index 0000000000..52c4eb7d6b --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_int8_uint32_dim512_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 512, int8_t, uint32_t, float); + +#undef instantiate_single_cta_select_and_run + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu new file mode 100644 index 0000000000..4675e17084 ---
/dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim1024_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 1024, uint8_t, uint32_t, float); + +#undef 
instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu new file mode 100644 index 0000000000..e73e1071ee --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim128_t8.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(8, 128, uint8_t, uint32_t, float); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu new file mode 100644 index 0000000000..01e26b5f29 --- /dev/null +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim256_t16.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(16, 256, uint8_t, uint32_t, float); + +#undef instantiate_single_cta_search_kernel + +} // namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu new file mode 100644 index 0000000000..b0534b555f --- /dev/null 
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_uint8_uint32_dim512_t32.cu @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by search_single_cta_00_generate.py + * + * Make changes there and run in this directory: + * + * > python search_single_cta_00_generate.py + * + */ + +#include + +namespace raft::neighbors::cagra::detail::single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 512, uint8_t, uint32_t, float); + +#undef instantiate_single_cta_search_kernel + +} // 
namespace raft::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu index 4dfa2a707c..a1d6cca7d5 100644 --- a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu +++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu @@ -15,22 +15,28 @@ */ #include +#include -#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT) \ - template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - const uint32_t* coarse_query_results, \ - const uint32_t n_queries, \ - const raft::distance::DistanceType metric, \ - const uint32_t n_probes, \ - const uint32_t k, \ - const bool select_min, \ - IdxT* neighbors, \ - float* distances, \ - uint32_t& grid_dim_x, \ +#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( \ + T, AccT, IdxT, IvfSampleFilterT) \ + template void \ + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const uint32_t queries_offset, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IvfSampleFilterT sample_filter, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ rmm::cuda_stream_view stream) -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(float, float, int64_t); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + float, float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); #undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan diff --git 
a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu index 2d54248e4d..514301562d 100644 --- a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu +++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu @@ -15,22 +15,28 @@ */ #include +#include -#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT) \ - template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - const uint32_t* coarse_query_results, \ - const uint32_t n_queries, \ - const raft::distance::DistanceType metric, \ - const uint32_t n_probes, \ - const uint32_t k, \ - const bool select_min, \ - IdxT* neighbors, \ - float* distances, \ - uint32_t& grid_dim_x, \ +#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( \ + T, AccT, IdxT, IvfSampleFilterT) \ + template void \ + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const uint32_t queries_offset, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IvfSampleFilterT sample_filter, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ rmm::cuda_stream_view stream) -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(int8_t, int32_t, int64_t); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + int8_t, int32_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); #undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu 
b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu index 75fe52f3c7..32698a8e80 100644 --- a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu +++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu @@ -15,22 +15,28 @@ */ #include +#include -#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT) \ - template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - const uint32_t* coarse_query_results, \ - const uint32_t n_queries, \ - const raft::distance::DistanceType metric, \ - const uint32_t n_probes, \ - const uint32_t k, \ - const bool select_min, \ - IdxT* neighbors, \ - float* distances, \ - uint32_t& grid_dim_x, \ +#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( \ + T, AccT, IdxT, IvfSampleFilterT) \ + template void \ + raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan( \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + const uint32_t* coarse_query_results, \ + const uint32_t n_queries, \ + const uint32_t queries_offset, \ + const raft::distance::DistanceType metric, \ + const uint32_t n_probes, \ + const uint32_t k, \ + const bool select_min, \ + IvfSampleFilterT sample_filter, \ + IdxT* neighbors, \ + float* distances, \ + uint32_t& grid_dim_x, \ rmm::cuda_stream_view stream) -instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(uint8_t, uint32_t, int64_t); +instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan( + uint8_t, uint32_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); #undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan diff --git a/cpp/src/neighbors/detail/ivf_flat_search.cu b/cpp/src/neighbors/detail/ivf_flat_search.cu index 001281c8fc..9d39607750 100644 --- a/cpp/src/neighbors/detail/ivf_flat_search.cu +++ 
b/cpp/src/neighbors/detail/ivf_flat_search.cu @@ -15,21 +15,26 @@ */ #include +#include -#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT) \ - template void raft::neighbors::ivf_flat::detail::search( \ - raft::resources const& handle, \ - const search_params& params, \ - const raft::neighbors::ivf_flat::index& index, \ - const T* queries, \ - uint32_t n_queries, \ - uint32_t k, \ - IdxT* neighbors, \ - float* distances, \ - rmm::mr::device_memory_resource* mr) +#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT, IvfSampleFilterT) \ + template void raft::neighbors::ivf_flat::detail::search( \ + raft::resources const& handle, \ + const search_params& params, \ + const raft::neighbors::ivf_flat::index& index, \ + const T* queries, \ + uint32_t n_queries, \ + uint32_t k, \ + IdxT* neighbors, \ + float* distances, \ + rmm::mr::device_memory_resource* mr, \ + IvfSampleFilterT sample_filter) -instantiate_raft_neighbors_ivf_flat_detail_search(float, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_search(int8_t, int64_t); -instantiate_raft_neighbors_ivf_flat_detail_search(uint8_t, int64_t); +instantiate_raft_neighbors_ivf_flat_detail_search( + float, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_search( + int8_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); +instantiate_raft_neighbors_ivf_flat_detail_search( + uint8_t, int64_t, raft::neighbors::filtering::none_ivf_sample_filter); #undef instantiate_raft_neighbors_ivf_flat_detail_search diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py index ac547626bb..5132048d40 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py @@ -41,8 +41,8 @@ #include #include -#define 
instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT, SampleFilterT) \\ - template auto raft::neighbors::ivf_pq::detail::compute_similarity_select( \\ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT, IvfSampleFilterT) \\ + template auto raft::neighbors::ivf_pq::detail::compute_similarity_select( \\ const cudaDeviceProp& dev_props, \\ bool manage_local_topk, \\ int locality_hint, \\ @@ -52,12 +52,11 @@ uint32_t precomp_data_count, \\ uint32_t n_queries, \\ uint32_t n_probes, \\ - uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected; \\ + uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected; \\ \\ - template void raft::neighbors::ivf_pq::detail::compute_similarity_run( \\ - raft::neighbors::ivf_pq::detail::selected s, \\ + template void raft::neighbors::ivf_pq::detail::compute_similarity_run( \\ + raft::neighbors::ivf_pq::detail::selected s, \\ rmm::cuda_stream_view stream, \\ - uint32_t n_rows, \\ uint32_t dim, \\ uint32_t n_probes, \\ uint32_t pq_dim, \\ @@ -75,7 +74,7 @@ const float* queries, \\ const uint32_t* index_list, \\ float* query_kths, \\ - SampleFilterT sample_filter, \\ + IvfSampleFilterT sample_filter, \\ LutT* lut_scores, \\ OutT* _out_scores, \\ uint32_t* _out_indices); @@ -104,6 +103,6 @@ path = f"ivf_pq_compute_similarity_{path_key}.cu" with open(path, "w") as f: f.write(header) - f.write(f"instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select({OutT}, {LutT}, raft::neighbors::ivf_pq::detail::NoneSampleFilter);\n") + f.write(f"instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select({OutT}, {LutT}, raft::neighbors::filtering::none_ivf_sample_filter);\n") f.write(trailer) print(f"src/neighbors/detail/{path}") diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu index 67b67df19f..bfc07b0321 100644 --- 
a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu @@ -27,52 +27,51 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + 
raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - float, float, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + float, float, raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu index 1c97a1c9ba..537868b590 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu @@ -27,54 +27,53 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - 
raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define 
COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu index 14e2d19fe7..59b64b892d 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu @@ -27,54 +27,53 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define 
instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu index 7fd3a8d0b2..f9e899f8e9 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu @@ -27,52 +27,51 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - 
OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + 
raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - float, half, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + float, half, raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu index 01df4d87e3..bf699d7af6 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu @@ -27,54 +27,53 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - 
raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu index 251515a552..9689ec88e1 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu @@ -27,54 +27,53 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ 
+ uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ + const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>, - raft::neighbors::ivf_pq::detail::NoneSampleFilter); + raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu index b29f4bca96..deed61dd3d 100644 --- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu +++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu @@ -27,52 +27,51 @@ #include #include -#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ - OutT, LutT, SampleFilterT) \ - template auto \ - raft::neighbors::ivf_pq::detail::compute_similarity_select( \ - const cudaDeviceProp& dev_props, \ - bool manage_local_topk, \ - int locality_hint, \ - double preferred_shmem_carveout, \ - uint32_t pq_bits, \ - uint32_t pq_dim, \ - uint32_t precomp_data_count, \ - uint32_t n_queries, \ - 
uint32_t n_probes, \ - uint32_t topk) \ - ->raft::neighbors::ivf_pq::detail::selected; \ - \ - template void \ - raft::neighbors::ivf_pq::detail::compute_similarity_run( \ - raft::neighbors::ivf_pq::detail::selected s, \ - rmm::cuda_stream_view stream, \ - uint32_t n_rows, \ - uint32_t dim, \ - uint32_t n_probes, \ - uint32_t pq_dim, \ - uint32_t n_queries, \ - uint32_t queries_offset, \ - raft::distance::DistanceType metric, \ - raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ - uint32_t topk, \ - uint32_t max_samples, \ - const float* cluster_centers, \ - const float* pq_centers, \ - const uint8_t* const* pq_dataset, \ - const uint32_t* cluster_labels, \ - const uint32_t* _chunk_indices, \ - const float* queries, \ - const uint32_t* index_list, \ - float* query_kths, \ - SampleFilterT sample_filter, \ - LutT* lut_scores, \ - OutT* _out_scores, \ +#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( \ + OutT, LutT, IvfSampleFilterT) \ + template auto \ + raft::neighbors::ivf_pq::detail::compute_similarity_select( \ + const cudaDeviceProp& dev_props, \ + bool manage_local_topk, \ + int locality_hint, \ + double preferred_shmem_carveout, \ + uint32_t pq_bits, \ + uint32_t pq_dim, \ + uint32_t precomp_data_count, \ + uint32_t n_queries, \ + uint32_t n_probes, \ + uint32_t topk) \ + ->raft::neighbors::ivf_pq::detail::selected; \ + \ + template void \ + raft::neighbors::ivf_pq::detail::compute_similarity_run( \ + raft::neighbors::ivf_pq::detail::selected s, \ + rmm::cuda_stream_view stream, \ + uint32_t dim, \ + uint32_t n_probes, \ + uint32_t pq_dim, \ + uint32_t n_queries, \ + uint32_t queries_offset, \ + raft::distance::DistanceType metric, \ + raft::neighbors::ivf_pq::codebook_gen codebook_kind, \ + uint32_t topk, \ + uint32_t max_samples, \ + const float* cluster_centers, \ + const float* pq_centers, \ + const uint8_t* const* pq_dataset, \ + const uint32_t* cluster_labels, \ + const uint32_t* _chunk_indices, \ + const float* queries, \ 
+ const uint32_t* index_list, \ + float* query_kths, \ + IvfSampleFilterT sample_filter, \ + LutT* lut_scores, \ + OutT* _out_scores, \ uint32_t* _out_indices); #define COMMA , instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select( - half, half, raft::neighbors::ivf_pq::detail::NoneSampleFilter); + half, half, raft::neighbors::filtering::none_ivf_sample_filter); #undef COMMA diff --git a/cpp/src/neighbors/detail/refine_host_float_float.cpp b/cpp/src/neighbors/detail/refine_host_float_float.cpp new file mode 100644 index 0000000000..c596200c0a --- /dev/null +++ b/cpp/src/neighbors/detail/refine_host_float_float.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#define instantiate_raft_neighbors_refine(IdxT, DataT, DistanceT, ExtentsT) \ + template void raft::neighbors::detail::refine_host( \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + distance::DistanceType metric); + +instantiate_raft_neighbors_refine(int64_t, float, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/src/neighbors/detail/refine_host_int8_t_float.cpp b/cpp/src/neighbors/detail/refine_host_int8_t_float.cpp new file mode 100644 index 0000000000..334a3e8cb6 --- /dev/null +++ b/cpp/src/neighbors/detail/refine_host_int8_t_float.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#define instantiate_raft_neighbors_refine(IdxT, DataT, DistanceT, ExtentsT) \ + template void raft::neighbors::detail::refine_host( \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + distance::DistanceType metric); +instantiate_raft_neighbors_refine(int64_t, int8_t, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/src/neighbors/detail/refine_host_uint8_t_float.cpp b/cpp/src/neighbors/detail/refine_host_uint8_t_float.cpp new file mode 100644 index 0000000000..43d93e5f2e --- /dev/null +++ b/cpp/src/neighbors/detail/refine_host_uint8_t_float.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#define instantiate_raft_neighbors_refine(IdxT, DataT, DistanceT, ExtentsT) \ + template void raft::neighbors::detail::refine_host( \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbor_candidates, \ + raft::host_matrix_view indices, \ + raft::host_matrix_view distances, \ + distance::DistanceType metric); + +instantiate_raft_neighbors_refine(int64_t, uint8_t, float, int64_t); + +#undef instantiate_raft_neighbors_refine diff --git a/cpp/src/raft_runtime/neighbors/cagra_build.cu b/cpp/src/raft_runtime/neighbors/cagra_build.cu new file mode 100644 index 0000000000..225d645e4e --- /dev/null +++ b/cpp/src/raft_runtime/neighbors/cagra_build.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft::runtime::neighbors::cagra { + +#define RAFT_INST_CAGRA_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset) \ + ->raft::neighbors::cagra::index \ + { \ + return raft::neighbors::cagra::build(handle, params, dataset); \ + } \ + \ + auto build(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset) \ + ->raft::neighbors::cagra::index \ + { \ + return raft::neighbors::cagra::build(handle, params, dataset); \ + } \ + \ + void build_device(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::device_matrix_view dataset, \ + raft::neighbors::cagra::index& idx) \ + { \ + idx = build(handle, params, dataset); \ + } \ + \ + void build_host(raft::resources const& handle, \ + const raft::neighbors::cagra::index_params& params, \ + raft::host_matrix_view dataset, \ + raft::neighbors::cagra::index& idx) \ + { \ + idx = build(handle, params, dataset); \ + } + +RAFT_INST_CAGRA_BUILD(float, uint32_t); +RAFT_INST_CAGRA_BUILD(int8_t, uint32_t); +RAFT_INST_CAGRA_BUILD(uint8_t, uint32_t); + +#undef RAFT_INST_CAGRA_BUILD + +#define RAFT_INST_CAGRA_OPTIMIZE(IdxT) \ + void optimize_device(raft::resources const& handle, \ + raft::device_matrix_view knn_graph, \ + raft::host_matrix_view new_graph) \ + { \ + raft::neighbors::cagra::optimize(handle, knn_graph, new_graph); \ + } \ + void optimize_host(raft::resources const& handle, \ + raft::host_matrix_view knn_graph, \ + raft::host_matrix_view new_graph) \ + { \ + raft::neighbors::cagra::optimize(handle, knn_graph, new_graph); \ + } + +RAFT_INST_CAGRA_OPTIMIZE(uint32_t); + +#undef RAFT_INST_CAGRA_OPTIMIZE + +} // namespace raft::runtime::neighbors::cagra diff --git a/cpp/src/raft_runtime/neighbors/cagra_search.cu b/cpp/src/raft_runtime/neighbors/cagra_search.cu new 
file mode 100644 index 0000000000..149ae01392 --- /dev/null +++ b/cpp/src/raft_runtime/neighbors/cagra_search.cu @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::runtime::neighbors::cagra { + +#define RAFT_INST_CAGRA_SEARCH(T, IdxT) \ + void search(raft::resources const& handle, \ + raft::neighbors::cagra::search_params const& params, \ + const raft::neighbors::cagra::index& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances) \ + { \ + raft::neighbors::cagra::search(handle, params, index, queries, neighbors, distances); \ + } + +RAFT_INST_CAGRA_SEARCH(float, uint32_t); +RAFT_INST_CAGRA_SEARCH(int8_t, uint32_t); +RAFT_INST_CAGRA_SEARCH(uint8_t, uint32_t); + +#undef RAFT_INST_CAGRA_SEARCH + +} // namespace raft::runtime::neighbors::cagra diff --git a/cpp/src/raft_runtime/neighbors/cagra_serialize.cu b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu new file mode 100644 index 0000000000..be9788562a --- /dev/null +++ b/cpp/src/raft_runtime/neighbors/cagra_serialize.cu @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +namespace raft::runtime::neighbors::cagra { + +#define RAFT_INST_CAGRA_SERIALIZE(DTYPE) \ + void serialize_file(raft::resources const& handle, \ + const std::string& filename, \ + const raft::neighbors::cagra::index& index) \ + { \ + raft::neighbors::cagra::serialize(handle, filename, index); \ + }; \ + \ + void deserialize_file(raft::resources const& handle, \ + const std::string& filename, \ + raft::neighbors::cagra::index* index) \ + { \ + if (!index) { RAFT_FAIL("Invalid index pointer"); } \ + *index = raft::neighbors::cagra::deserialize(handle, filename); \ + }; \ + void serialize(raft::resources const& handle, \ + std::string& str, \ + const raft::neighbors::cagra::index& index) \ + { \ + std::stringstream os; \ + raft::neighbors::cagra::serialize(handle, os, index); \ + str = os.str(); \ + } \ + \ + void deserialize(raft::resources const& handle, \ + const std::string& str, \ + raft::neighbors::cagra::index* index) \ + { \ + std::istringstream is(str); \ + if (!index) { RAFT_FAIL("Invalid index pointer"); } \ + *index = raft::neighbors::cagra::deserialize(handle, is); \ + } + +RAFT_INST_CAGRA_SERIALIZE(float); +RAFT_INST_CAGRA_SERIALIZE(int8_t); +RAFT_INST_CAGRA_SERIALIZE(uint8_t); + +#undef RAFT_INST_CAGRA_SERIALIZE +} // namespace raft::runtime::neighbors::cagra diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index 248f4f1af4..075c51eddf 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ 
b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "23.06") +set(RAPIDS_VERSION "23.08") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 871869102c..efcd48cd1d 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -13,27 +13,38 @@ # ============================================================================= # ################################################################################################## -# * compiler function ----------------------------------------------------------------------------- +# enable testing ################################################################################ +# ################################################################################################## +enable_testing() +include(rapids-test) +rapids_test_init() function(ConfigureTest) set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) - set(oneValueArgs NAME) + set(oneValueArgs NAME GPUS PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) - cmake_parse_arguments(ConfigureTest "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(TEST_NAME ${ConfigureTest_NAME}) - - add_executable(${TEST_NAME} ${ConfigureTest_PATH}) + cmake_parse_arguments(_RAFT_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(NOT DEFINED _RAFT_TEST_GPUS AND NOT DEFINED _RAFT_TEST_PERCENT) + set(_RAFT_TEST_GPUS 1) + set(_RAFT_TEST_PERCENT 30) + endif() + if(NOT DEFINED _RAFT_TEST_GPUS) + set(_RAFT_TEST_GPUS 1) + endif() + if(NOT DEFINED _RAFT_TEST_PERCENT) + set(_RAFT_TEST_PERCENT 100) + endif() - message("TEST PATH: ${ConfigureTest_PATH}") + set(TEST_NAME ${_RAFT_TEST_NAME}) + add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( 
${TEST_NAME} PRIVATE raft raft_internal - $<$:raft::compiled> + $<$:raft::compiled> GTest::gtest GTest::gtest_main Threads::Threads @@ -41,35 +52,31 @@ function(ConfigureTest) $ $ ) - - add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_target_properties( ${TEST_NAME} - PROPERTIES # set target compile options + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON ) - target_compile_options( ${TEST_NAME} PRIVATE "$<$:${RAFT_CXX_FLAGS}>" "$<$:${RAFT_CUDA_FLAGS}>" ) - - if(ConfigureTest_EXPLICIT_INSTANTIATE_ONLY) + if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() target_include_directories(${TEST_NAME} PUBLIC "$") - install( - TARGETS ${TEST_NAME} - COMPONENT testing - DESTINATION bin/gtests/libraft - EXCLUDE_FROM_ALL + rapids_test_add( + NAME ${TEST_NAME} + COMMAND ${TEST_NAME} + GPUS ${_RAFT_TEST_GPUS} + PERCENT ${_RAFT_TEST_PERCENT} + INSTALL_COMPONENT_SET testing ) endfunction() @@ -90,7 +97,6 @@ if(BUILD_TESTS) test/cluster/cluster_solvers.cu test/cluster/linkage.cu test/cluster/kmeans_find_k.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) @@ -117,7 +123,6 @@ if(BUILD_TESTS) test/core/span.cu test/core/temporary_device_buffer.cu test/test.cpp - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) @@ -147,7 +152,6 @@ if(BUILD_TESTS) test/distance/masked_nn_compress_to_bits.cu test/distance/fused_l2_nn.cu test/distance/gram.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) @@ -181,12 +185,10 @@ if(BUILD_TESTS) # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined # * EXT_HEADERS_TEST_IMPLICIT: no macros defined. 
ConfigureTest( - NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB + NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureTest( - NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB - ) + ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB) ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES}) ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu) @@ -238,20 +240,26 @@ if(BUILD_TESTS) test/matrix/columnSort.cu test/matrix/diagonal.cu test/matrix/gather.cu + test/matrix/scatter.cu + test/matrix/eye.cu test/matrix/linewise_op.cu test/matrix/math.cu test/matrix/matrix.cu test/matrix/norm.cu test/matrix/reverse.cu - test/matrix/select_k.cu test/matrix/slice.cu test/matrix/triangular.cu test/sparse/spectral_matrix.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY) + + ConfigureTest( + NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY + ) + ConfigureTest( NAME RANDOM_TEST @@ -269,7 +277,7 @@ if(BUILD_TESTS) ConfigureTest( NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu - test/lap/lap.cu test/sparse/mst.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( @@ -295,17 +303,16 @@ if(BUILD_TESTS) ConfigureTest( NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu - test/sparse/gram.cu OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( NAME SPARSE_NEIGHBORS_TEST PATH - test/sparse/neighbors/connect_components.cu + test/sparse/neighbors/cross_component_nn.cu test/sparse/neighbors/brute_force.cu 
test/sparse/neighbors/knn_graph.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) @@ -314,10 +321,45 @@ if(BUILD_TESTS) NAME NEIGHBORS_TEST PATH + test/neighbors/knn.cu + test/neighbors/fused_l2_knn.cu + test/neighbors/tiled_knn.cu + test/neighbors/haversine.cu + test/neighbors/ball_cover.cu + test/neighbors/epsilon_neighborhood.cu + test/neighbors/refine.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + ) + + ConfigureTest( + NAME + NEIGHBORS_ANN_CAGRA_TEST + PATH test/neighbors/ann_cagra/test_float_uint32_t.cu test/neighbors/ann_cagra/test_int8_t_uint32_t.cu test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu test/neighbors/ann_cagra/test_float_int64_t.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu + LIB + EXPLICIT_INSTANTIATE_ONLY + GPUS + 1 + PERCENT + 100 + ) + + ConfigureTest( + NAME + NEIGHBORS_ANN_IVF_TEST + PATH test/neighbors/ann_ivf_flat/test_float_int64_t.cu test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu @@ -326,17 +368,17 @@ if(BUILD_TESTS) test/neighbors/ann_ivf_pq/test_float_int64_t.cu test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu - test/neighbors/knn.cu - test/neighbors/fused_l2_knn.cu - test/neighbors/tiled_knn.cu - test/neighbors/haversine.cu - test/neighbors/ball_cover.cu - test/neighbors/epsilon_neighborhood.cu - test/neighbors/refine.cu - test/neighbors/selection.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + GPUS + 1 + 
PERCENT + 100 + ) + + ConfigureTest( + NAME NEIGHBORS_SELECTION_TEST PATH test/neighbors/selection.cu LIB EXPLICIT_INSTANTIATE_ONLY + GPUS 1 PERCENT 50 ) ConfigureTest( @@ -368,7 +410,6 @@ if(BUILD_TESTS) test/stats/trustworthiness.cu test/stats/weighted_mean.cu test/stats/v_measure.cu - OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) @@ -386,3 +427,8 @@ if(BUILD_TESTS) test/util/reduction.cu ) endif() + +# ################################################################################################## +# Install tests #################################################################################### +# ################################################################################################## +rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libraft) diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu index e660dbef13..52ec2efe8e 100644 --- a/cpp/test/cluster/linkage.cu +++ b/cpp/test/cluster/linkage.cu @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -// XXX: We allow the instantiation of fused_l2_nn here: -// raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); -// raft::linkage::connect_components( +// XXX: We allow the instantiation of masked_l2_nn here: +// raft::linkage::FixConnectivitiesRedOp red_op(params.n_row); +// raft::linkage::cross_component_nn( // handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); // // TODO: consider adding this to libraft.so or creating an instance in a diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index 8c5e023df3..a1ad4385a7 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -274,39 +275,61 @@ TEST(Raft, WorkspaceResource) { raft::handle_t handle; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); - ASSERT_EQ(rmm::mr::get_current_device_resource(), resource::get_workspace_resource(handle)); + // The returned resource is always a limiting adaptor + auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); - std::shared_ptr pool = {nullptr}; - raft::handle_t handle2(rmm::cuda_stream_per_thread, pool, pool_mr); + // Let's create a pooled resource + auto pool_mr = std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}; - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle2)) != nullptr); - ASSERT_EQ(pool_mr, resource::get_workspace_resource(handle2)); + // A tiny workspace of 1MB + size_t max_size = 1024 * 1024; - delete pool_mr; -} - -TEST(Raft, WorkspaceResourceCopy) -{ - auto stream_pool = std::make_shared(10); + // Replace the resource + resource::set_workspace_resource(handle, pool_mr, max_size); + auto new_mr = resource::get_workspace_resource(handle); - handle_t handle(rmm::cuda_stream_per_thread, 
stream_pool); + // By this point, the orig_mr likely points to a non-existent resource; don't dereference! + ASSERT_NE(orig_mr, new_mr); + ASSERT_EQ(pool_mr.get(), new_mr->get_upstream()); + // We can safely reset pool_mr, because the shared_ptr to the pool memory stays in the resource + pool_mr.reset(); - auto pool_mr = new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource()); + auto stream = resource::get_cuda_stream(handle); + rmm::device_buffer buf(max_size / 2, stream, new_mr); - handle_t copied_handle(handle, pool_mr); + // Note, the underlying pool allocator likely uses more space than reported here + ASSERT_EQ(max_size, resource::get_workspace_total_bytes(handle)); + ASSERT_EQ(buf.size(), resource::get_workspace_used_bytes(handle)); + ASSERT_EQ(max_size - buf.size(), resource::get_workspace_free_bytes(handle)); - assert_handles_equal(handle, copied_handle); + // this should throw, because we partially used the space. + ASSERT_THROW((rmm::device_buffer{max_size, stream, new_mr}), rmm::bad_alloc); +} - // Assert the workspace_resources are what we expect - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(handle)) == nullptr); +TEST(Raft, WorkspaceResourceCopy) +{ + raft::handle_t res; + auto orig_mr = resource::get_workspace_resource(res); + auto orig_size = resource::get_workspace_total_bytes(res); - ASSERT_TRUE(dynamic_cast*>( - resource::get_workspace_resource(copied_handle)) != nullptr); + { + // create a new handle in the inner scope and update the workspace resource for it. 
+ raft::resources tmp_res(res); + resource::set_workspace_resource( + tmp_res, + std::shared_ptr{ + new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())}, + orig_size * 2); + + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); + + ASSERT_NE(orig_mr, resource::get_workspace_resource(tmp_res)); + ASSERT_NE(orig_size, resource::get_workspace_total_bytes(tmp_res)); + } + ASSERT_EQ(orig_mr, resource::get_workspace_resource(res)); + ASSERT_EQ(orig_size, resource::get_workspace_total_bytes(res)); } TEST(Raft, HandleCopy) diff --git a/cpp/test/core/math_device.cu b/cpp/test/core/math_device.cu index ff4b343d9e..15c7b2b33a 100644 --- a/cpp/test/core/math_device.cu +++ b/cpp/test/core/math_device.cu @@ -21,6 +21,11 @@ #include #include +#if _RAFT_HAS_CUDA +#include +#include +#endif + template __global__ void math_eval_kernel(OutT* out, OpT op, Args... args) { @@ -118,8 +123,32 @@ struct cos_test_op { } }; +struct cos_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { + return __float2half(raft::cos(__half2float(in))); + } +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { + return __float2bfloat16(raft::cos(__bfloat162float(in))); + } else // else is there to make sure raft::cos(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::cos(in); + } +}; + TEST(MathDevice, Cos) { + ASSERT_TRUE(raft::match(std::cos(12.34f), + __half2float(math_eval(cos_test_op_device{}, __float2half(12.34f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE( + raft::match(std::cos(12.34f), + __bfloat162float(math_eval(cos_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::cos(12.34f), math_eval(cos_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -134,14 +163,54 @@ struct 
exp_test_op { } }; +struct exp_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { + return __float2half(raft::exp(__half2float(in))); + } +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { + return __float2bfloat16(raft::exp(__bfloat162float(in))); + } else // else is there to make sure raft::exp(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::exp(in); + } +}; + TEST(MathDevice, Exp) { + ASSERT_TRUE(raft::match(std::exp(3.4f), + __half2float(math_eval(exp_test_op_device{}, __float2half(3.4f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE(raft::match(std::exp(3.4f), + __bfloat162float(math_eval(exp_test_op_device{}, __float2bfloat16(3.4f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( - std::exp(12.34f), math_eval(exp_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); + std::exp(3.4f), math_eval(exp_test_op{}, 3.4f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( - std::exp(12.34), math_eval(exp_test_op{}, 12.34), raft::CompareApprox(0.000001))); + std::exp(3.4), math_eval(exp_test_op{}, 3.4), raft::CompareApprox(0.000001))); } +struct log_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { + return __float2half(raft::log(__half2float(in))); + } +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { + return __float2bfloat16(raft::log(__bfloat162float(in))); + } else // else is there to make sure raft::log(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::log(in); + } +}; + struct log_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -152,6 +221,13 @@ struct log_test_op { TEST(MathDevice, Log) { + ASSERT_TRUE(raft::match(std::log(12.34f), + __half2float(math_eval(log_test_op_device{}, 
__float2half(12.34f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE( + raft::match(std::log(12.34f), + __bfloat162float(math_eval(log_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::log(12.34f), math_eval(log_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -277,6 +353,23 @@ TEST(MathDevice, Sgn) ASSERT_TRUE(raft::match(1, math_eval(sgn_test_op{}, 12.34f), raft::Compare())); } +struct sin_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { + return __float2half(raft::sin(__half2float(in))); + } +#elif (__CUDA_ARCH__ < 800) + if constexpr (std::is_same_v) { + return __float2bfloat16(raft::sin(__bfloat162float(in))); + } else // else is there to make sure raft::sin(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sin(in); + } +}; + struct sin_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -287,6 +380,13 @@ struct sin_test_op { TEST(MathDevice, Sin) { + ASSERT_TRUE(raft::match(std::sin(12.34f), + __half2float(math_eval(sin_test_op_device{}, __float2half(12.34f))), + raft::CompareApprox(0.01f))); + ASSERT_TRUE( + raft::match(std::sin(12.34f), + __bfloat162float(math_eval(sin_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.1f))); ASSERT_TRUE(raft::match( std::sin(12.34f), math_eval(sin_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( @@ -319,6 +419,23 @@ TEST(MathDevice, SinCos) ASSERT_TRUE(raft::match(std::cos(12.34), cd.value(stream), raft::CompareApprox(0.0001f))); } +struct sqrt_test_op_device { + template + constexpr RAFT_DEVICE_INLINE_FUNCTION auto operator()(const Type& in) const + { +#if (__CUDA_ARCH__ < 530) + if constexpr (std::is_same_v) { + return __float2half(raft::sqrt(__half2float(in))); + } +#elif (__CUDA_ARCH__ < 800) + if constexpr 
(std::is_same_v) { + return __float2bfloat16(raft::sqrt(__bfloat162float(in))); + } else // else is there to make sure raft::sqrt(in) is not compiled with __half / nv_bfloat16 +#endif + return raft::sqrt(in); + } +}; + struct sqrt_test_op { template constexpr RAFT_INLINE_FUNCTION auto operator()(const Type& in) const @@ -329,6 +446,13 @@ struct sqrt_test_op { TEST(MathDevice, Sqrt) { + ASSERT_TRUE(raft::match(std::sqrt(12.34f), + __half2float(math_eval(sqrt_test_op_device{}, __float2half(12.34f))), + raft::CompareApprox(0.001f))); + ASSERT_TRUE( + raft::match(std::sqrt(12.34f), + __bfloat162float(math_eval(sqrt_test_op_device{}, __float2bfloat16(12.34f))), + raft::CompareApprox(0.01f))); ASSERT_TRUE(raft::match( std::sqrt(12.34f), math_eval(sqrt_test_op{}, 12.34f), raft::CompareApprox(0.0001f))); ASSERT_TRUE(raft::match( diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index b3640a888a..d5fecd93c6 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -75,9 +75,14 @@ template class GramMatrixTest : public ::testing::TestWithParam { protected: GramMatrixTest() - : params(GetParam()), stream(0), x1(0, stream), x2(0, stream), gram(0, stream), gram_host(0) + : params(GetParam()), + handle(), + x1(0, resource::get_cuda_stream(handle)), + x2(0, resource::get_cuda_stream(handle)), + gram(0, resource::get_cuda_stream(handle)), + gram_host(0) { - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); + auto stream = resource::get_cuda_stream(handle); if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } if (params.ld2 == 0) { params.ld2 = params.is_row_major ? 
params.n_cols : params.n2; } @@ -99,7 +104,7 @@ class GramMatrixTest : public ::testing::TestWithParam { r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); } - ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } + ~GramMatrixTest() override {} void runTest() { @@ -127,6 +132,7 @@ class GramMatrixTest : public ::testing::TestWithParam { (*kernel)(handle, x1_span, x2_span, out_span); + auto stream = resource::get_cuda_stream(handle); naiveGramMatrixKernel(params.n1, params.n2, params.n_cols, @@ -142,16 +148,16 @@ class GramMatrixTest : public ::testing::TestWithParam { handle); ASSERT_TRUE(raft::devArrMatchHost( - gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); + gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f), stream)); } - raft::resources handle; - cudaStream_t stream = 0; GramMatrixInputs params; + raft::resources handle; rmm::device_uvector x1; rmm::device_uvector x2; rmm::device_uvector gram; + std::vector gram_host; }; diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index 022581c655..3e12f9171e 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -75,7 +75,9 @@ class MergeLabelsTest : public ::testing::TestWithParam params; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_scalar mask, m; + rmm::device_uvector mask; + + rmm::device_scalar m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/matrix/eye.cu b/cpp/test/matrix/eye.cu new file mode 100644 index 0000000000..33ed8a00ba --- /dev/null +++ b/cpp/test/matrix/eye.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include +#include +#include + +#include +#include + +namespace raft::matrix { + +template +struct InitInputs { + int n_row; + int n_col; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const InitInputs& dims) +{ + return os; +} + +template +class InitTest : public ::testing::TestWithParam> { + public: + InitTest() + : params(::testing::TestWithParam>::GetParam()), + stream(resource::get_cuda_stream(handle)) + { + } + + protected: + void test_eye() + { + ASSERT_TRUE(params.n_row == 4 && params.n_col == 5); + auto eyemat_col = + raft::make_device_matrix(handle, params.n_row, params.n_col); + raft::matrix::eye(handle, eyemat_col.view()); + std::vector eye_exp{1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}; + std::vector eye_act(params.n_col * params.n_row); + raft::copy(eye_act.data(), eyemat_col.data_handle(), eye_act.size(), stream); + resource::sync_stream(handle, stream); + ASSERT_TRUE(hostVecMatch(eye_exp, eye_act, raft::Compare())); + + auto eyemat_row = + raft::make_device_matrix(handle, params.n_row, params.n_col); + raft::matrix::eye(handle, eyemat_row.view()); + eye_exp = std::vector{1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0}; + raft::copy(eye_act.data(), eyemat_row.data_handle(), eye_act.size(), stream); + resource::sync_stream(handle, stream); + ASSERT_TRUE(hostVecMatch(eye_exp, eye_act, raft::Compare())); + } + + void SetUp() override { test_eye(); } + + protected: + raft::resources handle; + cudaStream_t stream; + + InitInputs params; +}; + +const 
std::vector> inputsf1 = {{4, 5}}; + +const std::vector> inputsd1 = {{4, 5}}; + +typedef InitTest InitTestF; +TEST_P(InitTestF, Result) {} + +typedef InitTest InitTestD; +TEST_P(InitTestD, Result) {} + +INSTANTIATE_TEST_SUITE_P(InitTests, InitTestF, ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(InitTests, InitTestD, ::testing::ValuesIn(inputsd1)); + +} // namespace raft::matrix diff --git a/cpp/test/matrix/gather.cu b/cpp/test/matrix/gather.cu index cab96576d2..b1228f05ca 100644 --- a/cpp/test/matrix/gather.cu +++ b/cpp/test/matrix/gather.cu @@ -72,10 +72,16 @@ struct GatherInputs { IdxT nrows; IdxT ncols; IdxT map_length; + IdxT col_batch_size; unsigned long long int seed; }; -template +template class GatherTest : public ::testing::TestWithParam> { protected: GatherTest() @@ -97,6 +103,8 @@ class GatherTest : public ::testing::TestWithParam> { IdxT map_length = params.map_length; IdxT len = params.nrows * params.ncols; + if (map_length > params.nrows) map_length = params.nrows; + // input matrix setup d_in.resize(params.nrows * params.ncols, stream); h_in.resize(params.nrows * params.ncols); @@ -143,6 +151,8 @@ class GatherTest : public ::testing::TestWithParam> { auto in_view = raft::make_device_matrix_view( d_in.data(), params.nrows, params.ncols); + auto inout_view = raft::make_device_matrix_view( + d_in.data(), params.nrows, params.ncols); auto out_view = raft::make_device_matrix_view( d_out_act.data(), map_length, params.ncols); auto map_view = raft::make_device_vector_view(d_map.data(), map_length); @@ -154,12 +164,23 @@ class GatherTest : public ::testing::TestWithParam> { handle, in_view, out_view, map_view, stencil_view, pred_op, transform_op); } else if (Conditional) { raft::matrix::gather_if(handle, in_view, out_view, map_view, stencil_view, pred_op); + } else if (MapTransform && Inplace) { + raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size, transform_op); } else if (MapTransform) { raft::matrix::gather(handle, 
in_view, map_view, out_view, transform_op); + } else if (Inplace) { + raft::matrix::gather(handle, inout_view, map_view, params.col_batch_size); } else { raft::matrix::gather(handle, in_view, map_view, out_view); } + if (Inplace) { + raft::copy_async(d_out_act.data(), + d_in.data(), + map_length * params.ncols, + raft::resource::get_cuda_stream(handle)); + } + resource::sync_stream(handle, stream); } @@ -173,39 +194,53 @@ class GatherTest : public ::testing::TestWithParam> { rmm::device_uvector d_map; }; -#define GATHER_TEST(test_type, test_name, test_inputs) \ - typedef RAFT_DEPAREN(test_type) test_name; \ - TEST_P(test_name, Result) \ - { \ - ASSERT_TRUE(devArrMatch(d_out_exp.data(), \ - d_out_act.data(), \ - params.map_length* params.ncols, \ - raft::Compare())); \ - } \ +#define GATHER_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE( \ + devArrMatch(d_out_exp.data(), d_out_act.data(), d_out_exp.size(), raft::Compare())); \ + } \ INSTANTIATE_TEST_CASE_P(GatherTests, test_name, ::testing::ValuesIn(test_inputs)) -const std::vector> inputs_i32 = - raft::util::itertools::product>({25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL}); +const std::vector> inputs_i32 = raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {11, 999}, {2, 3, 6}, {1234ULL}); const std::vector> inputs_i64 = raft::util::itertools::product>( - {25, 2000}, {6, 31, 129}, {11, 999}, {1234ULL}); + {25, 2000}, {6, 31, 129}, {11, 999}, {2, 3, 6}, {1234ULL}); +const std::vector> inplace_inputs_i32 = + raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {11, 999}, {0, 1, 2, 3, 6, 100}, {1234ULL}); +const std::vector> inplace_inputs_i64 = + raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {11, 999}, {0, 1, 2, 3, 6, 100}, {1234ULL}); -GATHER_TEST((GatherTest), GatherTestFU32I32, inputs_i32); -GATHER_TEST((GatherTest), +GATHER_TEST((GatherTest), GatherTestFU32I32, inputs_i32); 
+GATHER_TEST((GatherTest), GatherTransformTestFU32I32, inputs_i32); -GATHER_TEST((GatherTest), GatherIfTestFU32I32, inputs_i32); -GATHER_TEST((GatherTest), +GATHER_TEST((GatherTest), + GatherIfTestFU32I32, + inputs_i32); +GATHER_TEST((GatherTest), GatherIfTransformTestFU32I32, inputs_i32); -GATHER_TEST((GatherTest), +GATHER_TEST((GatherTest), GatherIfTransformTestDU32I32, inputs_i32); -GATHER_TEST((GatherTest), +GATHER_TEST((GatherTest), GatherIfTransformTestFU32I64, inputs_i64); -GATHER_TEST((GatherTest), +GATHER_TEST((GatherTest), GatherIfTransformTestFI64I64, inputs_i64); - +GATHER_TEST((GatherTest), + GatherInplaceTestFU32I32, + inplace_inputs_i32); +GATHER_TEST((GatherTest), + GatherInplaceTestFU32I64, + inplace_inputs_i64); +GATHER_TEST((GatherTest), + GatherInplaceTestFI64I64, + inplace_inputs_i64); } // end namespace raft \ No newline at end of file diff --git a/cpp/test/matrix/scatter.cu b/cpp/test/matrix/scatter.cu new file mode 100644 index 0000000000..3a1a40086e --- /dev/null +++ b/cpp/test/matrix/scatter.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace raft { + +template +void naiveScatter( + InputIteratorT in, IdxT D, IdxT N, MapIteratorT map, IdxT map_length, OutputIteratorT out) +{ + for (IdxT outRow = 0; outRow < map_length; ++outRow) { + typename std::iterator_traits::value_type map_val = map[outRow]; + IdxT outRowStart = map_val * D; + IdxT inRowStart = outRow * D; + for (IdxT i = 0; i < D; ++i) { + out[outRowStart + i] = in[inRowStart + i]; + } + } +} + +template +struct ScatterInputs { + IdxT nrows; + IdxT ncols; + IdxT col_batch_size; + unsigned long long int seed; +}; + +template +class ScatterTest : public ::testing::TestWithParam> { + protected: + ScatterTest() + : stream(resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + d_in(0, stream), + d_out_exp(0, stream), + d_map(0, stream) + { + } + + void SetUp() override + { + raft::random::RngState r(params.seed); + raft::random::RngState r_int(params.seed); + + IdxT len = params.nrows * params.ncols; + + // input matrix setup + d_in.resize(params.nrows * params.ncols, stream); + h_in.resize(params.nrows * params.ncols); + raft::random::uniform(handle, r, d_in.data(), len, MatrixT(-1.0), MatrixT(1.0)); + raft::update_host(h_in.data(), d_in.data(), len, stream); + + // map setup + d_map.resize(params.nrows, stream); + h_map.resize(params.nrows); + + auto exec_policy = raft::resource::get_thrust_policy(handle); + + thrust::counting_iterator permute_iter(0); + thrust::copy(exec_policy, permute_iter, permute_iter + params.nrows, d_map.data()); + + thrust::default_random_engine g; + thrust::shuffle(exec_policy, d_map.data(), d_map.data() + params.nrows, g); + + raft::update_host(h_map.data(), d_map.data(), params.nrows, stream); + resource::sync_stream(handle, stream); + + // expected and actual output matrix setup + h_out.resize(params.nrows * 
params.ncols); + d_out_exp.resize(params.nrows * params.ncols, stream); + + // launch scatter on the host and copy the results to device + naiveScatter(h_in.data(), params.ncols, params.nrows, h_map.data(), params.nrows, h_out.data()); + raft::update_device(d_out_exp.data(), h_out.data(), params.nrows * params.ncols, stream); + + auto inout_view = raft::make_device_matrix_view( + d_in.data(), params.nrows, params.ncols); + auto map_view = raft::make_device_vector_view(d_map.data(), params.nrows); + + raft::matrix::scatter(handle, inout_view, map_view, params.col_batch_size); + resource::sync_stream(handle, stream); + } + + protected: + raft::resources handle; + cudaStream_t stream = 0; + ScatterInputs params; + std::vector h_in, h_out; + std::vector h_map; + rmm::device_uvector d_in, d_out_exp; + rmm::device_uvector d_map; +}; + +#define SCATTER_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE( \ + devArrMatch(d_in.data(), d_out_exp.data(), d_out_exp.size(), raft::Compare())); \ + } \ + INSTANTIATE_TEST_CASE_P(ScatterTests, test_name, ::testing::ValuesIn(test_inputs)) + +const std::vector> inputs_i32 = + raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {0, 1, 2, 3, 6, 100}, {1234ULL}); +const std::vector> inputs_i64 = + raft::util::itertools::product>( + {25, 2000}, {6, 31, 129}, {0, 1, 2, 3, 6, 100}, {1234ULL}); + +SCATTER_TEST((ScatterTest), ScatterTestFI32, inputs_i32); +SCATTER_TEST((ScatterTest), ScatterTestFI64, inputs_i64); +} // end namespace raft \ No newline at end of file diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu index 702fd1c407..63f020b420 100644 --- a/cpp/test/matrix/select_k.cu +++ b/cpp/test/matrix/select_k.cu @@ -13,356 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#include "../test_utils.cuh" -#include - -#include - -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include +#include "select_k.cuh" namespace raft::matrix { -template -auto gen_simple_ids(uint32_t batch_size, uint32_t len) -> std::vector -{ - std::vector out(batch_size * len); - auto s = rmm::cuda_stream_default; - rmm::device_uvector out_d(out.size(), s); - sparse::iota_fill(out_d.data(), IdxT(batch_size), IdxT(len), s); - update_host(out.data(), out_d.data(), out.size(), s); - s.synchronize(); - return out; -} - -template -struct io_simple { - public: - bool not_supported = false; - - io_simple(const select::params& spec, - const std::vector& in_dists, - const std::vector& out_dists, - const std::vector& out_ids) - : in_dists_(in_dists), - in_ids_(gen_simple_ids(spec.batch_size, spec.len)), - out_dists_(out_dists), - out_ids_(out_ids) - { - } - - auto get_in_dists() -> std::vector& { return in_dists_; } - auto get_in_ids() -> std::vector& { return in_ids_; } - auto get_out_dists() -> std::vector& { return out_dists_; } - auto get_out_ids() -> std::vector& { return out_ids_; } - - private: - std::vector in_dists_; - std::vector in_ids_; - std::vector out_dists_; - std::vector out_ids_; -}; - -template -struct io_computed { - public: - bool not_supported = false; - - io_computed(const select::params& spec, - const select::Algo& algo, - const std::vector& in_dists, - const std::optional>& in_ids = std::nullopt) - : in_dists_(in_dists), - in_ids_(in_ids.value_or(gen_simple_ids(spec.batch_size, spec.len))), - out_dists_(spec.batch_size * spec.k), - out_ids_(spec.batch_size * spec.k) - { - // check if the size is supported by the algorithm - switch (algo) { - case select::Algo::kWarpAuto: - case select::Algo::kWarpImmediate: - case select::Algo::kWarpFiltered: - case select::Algo::kWarpDistributed: - case select::Algo::kWarpDistributedShm: { - if (spec.k > raft::matrix::detail::select::warpsort::kMaxCapacity) { - 
not_supported = true; - return; - } - } break; - default: break; - } - - resources handle{}; - auto stream = resource::get_cuda_stream(handle); - - rmm::device_uvector in_dists_d(in_dists_.size(), stream); - rmm::device_uvector in_ids_d(in_ids_.size(), stream); - rmm::device_uvector out_dists_d(out_dists_.size(), stream); - rmm::device_uvector out_ids_d(out_ids_.size(), stream); - - update_device(in_dists_d.data(), in_dists_.data(), in_dists_.size(), stream); - update_device(in_ids_d.data(), in_ids_.data(), in_ids_.size(), stream); - - select::select_k_impl(handle, - algo, - in_dists_d.data(), - spec.use_index_input ? in_ids_d.data() : nullptr, - spec.batch_size, - spec.len, - spec.k, - out_dists_d.data(), - out_ids_d.data(), - spec.select_min); - - update_host(out_dists_.data(), out_dists_d.data(), out_dists_.size(), stream); - update_host(out_ids_.data(), out_ids_d.data(), out_ids_.size(), stream); - - interruptible::synchronize(stream); - - auto p = topk_sort_permutation(out_dists_, out_ids_, spec.k, spec.select_min); - apply_permutation(out_dists_, p); - apply_permutation(out_ids_, p); - } - - auto get_in_dists() -> std::vector& { return in_dists_; } - auto get_in_ids() -> std::vector& { return in_ids_; } - auto get_out_dists() -> std::vector& { return out_dists_; } - auto get_out_ids() -> std::vector& { return out_ids_; } - - private: - std::vector in_dists_; - std::vector in_ids_; - std::vector out_dists_; - std::vector out_ids_; - - auto topk_sort_permutation(const std::vector& vec, - const std::vector& inds, - uint32_t k, - bool select_min) -> std::vector - { - std::vector p(vec.size()); - std::iota(p.begin(), p.end(), 0); - if (select_min) { - std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) { - const IdxT ik = i / k; - const IdxT jk = j / k; - if (ik == jk) { - if (vec[i] == vec[j]) { return inds[i] < inds[j]; } - return vec[i] < vec[j]; - } - return ik < jk; - }); - } else { - std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) { 
- const IdxT ik = i / k; - const IdxT jk = j / k; - if (ik == jk) { - if (vec[i] == vec[j]) { return inds[i] < inds[j]; } - return vec[i] > vec[j]; - } - return ik < jk; - }); - } - return p; - } - - template - void apply_permutation(std::vector& vec, const std::vector& p) // NOLINT - { - for (auto i = IdxT(vec.size()) - 1; i > 0; i--) { - auto j = p[i]; - while (j > i) - j = p[j]; - std::swap(vec[j], vec[i]); - } - } -}; - -template -using Params = std::tuple; - -template typename ParamsReader> -struct SelectK // NOLINT - : public testing::TestWithParam::params_t> { - const select::params spec; - const select::Algo algo; - typename ParamsReader::io_t ref; - io_computed res; - - explicit SelectK(Params::io_t> ps) - : spec(std::get<0>(ps)), - algo(std::get<1>(ps)), // NOLINT - ref(std::get<2>(ps)), // NOLINT - res(spec, algo, ref.get_in_dists(), ref.get_in_ids()) // NOLINT - { - } - - explicit SelectK(typename ParamsReader::params_t ps) - : SelectK(ParamsReader::read(ps)) - { - } - - SelectK() - : SelectK(testing::TestWithParam::params_t>::GetParam()) - { - } - - void run() - { - if (ref.not_supported || res.not_supported) { GTEST_SKIP(); } - ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare())); - - // If the dists (keys) are the same, different corresponding ids may end up in the selection due - // to non-deterministic nature of some implementations. 
- auto& in_ids = ref.get_in_ids(); - auto& in_dists = ref.get_in_dists(); - auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) { - if (i == j) return true; - auto ix_i = static_cast(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin()); - auto ix_j = static_cast(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin()); - if (static_cast(ix_i) >= in_ids.size() || static_cast(ix_j) >= in_ids.size()) - return false; - auto dist_i = in_dists[ix_i]; - auto dist_j = in_dists[ix_j]; - if (dist_i == dist_j) return true; - std::cout << "ERROR: ref[" << ix_i << "] = " << dist_i << " != " - << "res[" << ix_j << "] = " << dist_j << std::endl; - return false; - }; - ASSERT_TRUE(hostVecMatch(ref.get_out_ids(), res.get_out_ids(), compare_ids)); - } -}; - -template -struct params_simple { - using io_t = io_simple; - using input_t = - std::tuple, std::vector, std::vector>; - using params_t = std::tuple; - - static auto read(params_t ps) -> Params - { - auto ins = std::get<0>(ps); - auto algo = std::get<1>(ps); - return std::make_tuple( - std::get<0>(ins), - algo, - io_simple( - std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins))); - } -}; - -auto inputs_simple_f = testing::Values( - params_simple::input_t( - {5, 5, 5, true, true}, - {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, - 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, - {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, - 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}, - {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}), - params_simple::input_t( - {5, 5, 3, true, true}, - {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, - 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, - {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}, - {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}), - params_simple::input_t( - {5, 5, 5, true, false}, - {5.0, 
4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, - 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, - {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, - 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}, - {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}), - params_simple::input_t( - {5, 5, 3, true, false}, - {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, - 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, - {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}, - {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}), - params_simple::input_t( - {5, 7, 3, true, true}, - {5.0, 4.0, 3.0, 2.0, 1.3, 7.5, 19.0, 9.0, 2.0, 3.0, 3.0, 5.0, 6.0, 4.0, 2.0, 3.0, 5.0, 1.0, - 4.0, 1.0, 1.0, 5.0, 7.0, 2.5, 4.0, 7.0, 8.0, 8.0, 1.0, 3.0, 2.0, 5.0, 4.0, 1.1, 1.2}, - {1.3, 2.0, 3.0, 2.0, 3.0, 3.0, 1.0, 1.0, 1.0, 2.5, 4.0, 5.0, 1.0, 1.1, 1.2}, - {4, 3, 2, 1, 2, 3, 3, 5, 6, 2, 3, 0, 0, 5, 6}), - params_simple::input_t( - {1, 7, 3, true, true}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {3, 5, 6}), - params_simple::input_t( - {1, 7, 3, false, false}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {5.0, 4.0, 3.0}, {2, 4, 1}), - params_simple::input_t( - {1, 7, 3, false, true}, {2.0, 3.0, 5.0, 9.0, 4.0, 9.0, 9.0}, {9.0, 9.0, 9.0}, {3, 5, 6}), - params_simple::input_t( - {1, 130, 5, false, true}, - {19, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, - 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20}, - {20, 19, 18, 17, 16}, - {129, 0, 117, 116, 115}), - params_simple::input_t( - {1, 130, 15, false, true}, - {19, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 
0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, - 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20}, - {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6}, - {129, 0, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105})); - -using SimpleFloatInt = SelectK; -TEST_P(SimpleFloatInt, Run) { run(); } // NOLINT -INSTANTIATE_TEST_CASE_P( // NOLINT - SelectK, - SimpleFloatInt, - testing::Combine(inputs_simple_f, - testing::Values(select::Algo::kPublicApi, - select::Algo::kRadix8bits, - select::Algo::kRadix11bits, - select::Algo::kRadix11bitsExtraPass, - select::Algo::kWarpImmediate, - select::Algo::kWarpFiltered, - select::Algo::kWarpDistributed))); - -template -struct with_ref { - template - struct params_random { - using io_t = io_computed; - using params_t = std::tuple; - - static auto read(params_t ps) -> Params - { - auto spec = std::get<0>(ps); - auto algo = std::get<1>(ps); - std::vector dists(spec.len * spec.batch_size); - - raft::resources handle; - { - auto s = resource::get_cuda_stream(handle); - rmm::device_uvector dists_d(spec.len * spec.batch_size, s); - raft::random::RngState r(42); - normal(handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0)); - update_host(dists.data(), dists_d.data(), dists_d.size(), s); - s.synchronize(); - } - - return std::make_tuple(spec, algo, io_computed(spec, RefAlgo, dists)); - } - }; -}; - auto inputs_random_longlist = testing::Values(select::params{1, 130, 15, false}, select::params{1, 128, 15, false}, select::params{20, 700, 1, true}, @@ -411,7 +65,7 @@ auto inputs_random_largesize = testing::Values(select::params{100, 100000, 1, tr select::params{1, 1000000000, 256, false, false}); auto inputs_random_largek = testing::Values(select::params{100, 
100000, 1000, true}, - select::params{100, 100000, 2000, true}, + select::params{100, 100000, 2000, false}, select::params{100, 100000, 100000, true, false}, select::params{100, 100000, 2048, false}, select::params{100, 100000, 1237, true}); @@ -457,14 +111,4 @@ INSTANTIATE_TEST_CASE_P( // NOLINT select::Algo::kRadix8bits, select::Algo::kRadix11bits, select::Algo::kRadix11bitsExtraPass))); - -using ReferencedRandomFloatSizeT = - SelectK::params_random>; -TEST_P(ReferencedRandomFloatSizeT, LargeK) { run(); } // NOLINT -INSTANTIATE_TEST_CASE_P(SelectK, // NOLINT - ReferencedRandomFloatSizeT, - testing::Combine(inputs_random_largek, - testing::Values(select::Algo::kRadix11bits, - select::Algo::kRadix11bitsExtraPass))); - } // namespace raft::matrix diff --git a/cpp/test/matrix/select_k.cuh b/cpp/test/matrix/select_k.cuh new file mode 100644 index 0000000000..e0e0cad225 --- /dev/null +++ b/cpp/test/matrix/select_k.cuh @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" +#include + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace raft::matrix { + +template +auto gen_simple_ids(uint32_t batch_size, uint32_t len) -> std::vector +{ + std::vector out(batch_size * len); + auto s = rmm::cuda_stream_default; + rmm::device_uvector out_d(out.size(), s); + sparse::iota_fill(out_d.data(), IdxT(batch_size), IdxT(len), s); + update_host(out.data(), out_d.data(), out.size(), s); + s.synchronize(); + return out; +} + +template +struct io_simple { + public: + bool not_supported = false; + + io_simple(const select::params& spec, + const std::vector& in_dists, + const std::vector& out_dists, + const std::vector& out_ids) + : in_dists_(in_dists), + in_ids_(gen_simple_ids(spec.batch_size, spec.len)), + out_dists_(out_dists), + out_ids_(out_ids) + { + } + + auto get_in_dists() -> std::vector& { return in_dists_; } + auto get_in_ids() -> std::vector& { return in_ids_; } + auto get_out_dists() -> std::vector& { return out_dists_; } + auto get_out_ids() -> std::vector& { return out_ids_; } + + private: + std::vector in_dists_; + std::vector in_ids_; + std::vector out_dists_; + std::vector out_ids_; +}; + +template +struct io_computed { + public: + bool not_supported = false; + + io_computed(const select::params& spec, + const select::Algo& algo, + const std::vector& in_dists, + const std::optional>& in_ids = std::nullopt) + : in_dists_(in_dists), + in_ids_(in_ids.value_or(gen_simple_ids(spec.batch_size, spec.len))), + out_dists_(spec.batch_size * spec.k), + out_ids_(spec.batch_size * spec.k) + { + // check if the size is supported by the algorithm + switch (algo) { + case select::Algo::kWarpAuto: + case select::Algo::kWarpImmediate: + case select::Algo::kWarpFiltered: + case select::Algo::kWarpDistributed: + case select::Algo::kWarpDistributedShm: { + if (spec.k > raft::matrix::detail::select::warpsort::kMaxCapacity) { + not_supported = true; + 
return; + } + } break; + default: break; + } + + resources handle{}; + auto stream = resource::get_cuda_stream(handle); + + rmm::device_uvector in_dists_d(in_dists_.size(), stream); + rmm::device_uvector in_ids_d(in_ids_.size(), stream); + rmm::device_uvector out_dists_d(out_dists_.size(), stream); + rmm::device_uvector out_ids_d(out_ids_.size(), stream); + + update_device(in_dists_d.data(), in_dists_.data(), in_dists_.size(), stream); + update_device(in_ids_d.data(), in_ids_.data(), in_ids_.size(), stream); + + select::select_k_impl(handle, + algo, + in_dists_d.data(), + spec.use_index_input ? in_ids_d.data() : nullptr, + spec.batch_size, + spec.len, + spec.k, + out_dists_d.data(), + out_ids_d.data(), + spec.select_min); + + update_host(out_dists_.data(), out_dists_d.data(), out_dists_.size(), stream); + update_host(out_ids_.data(), out_ids_d.data(), out_ids_.size(), stream); + + interruptible::synchronize(stream); + + auto p = topk_sort_permutation(out_dists_, out_ids_, spec.k, spec.select_min); + apply_permutation(out_dists_, p); + apply_permutation(out_ids_, p); + } + + auto get_in_dists() -> std::vector& { return in_dists_; } + auto get_in_ids() -> std::vector& { return in_ids_; } + auto get_out_dists() -> std::vector& { return out_dists_; } + auto get_out_ids() -> std::vector& { return out_ids_; } + + private: + std::vector in_dists_; + std::vector in_ids_; + std::vector out_dists_; + std::vector out_ids_; + + auto topk_sort_permutation(const std::vector& vec, + const std::vector& inds, + uint32_t k, + bool select_min) -> std::vector + { + std::vector p(vec.size()); + std::iota(p.begin(), p.end(), 0); + if (select_min) { + std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) { + const IdxT ik = i / k; + const IdxT jk = j / k; + if (ik == jk) { + if (vec[i] == vec[j]) { return inds[i] < inds[j]; } + return vec[i] < vec[j]; + } + return ik < jk; + }); + } else { + std::sort(p.begin(), p.end(), [&vec, &inds, k](IdxT i, IdxT j) { + const IdxT ik = i / 
k; + const IdxT jk = j / k; + if (ik == jk) { + if (vec[i] == vec[j]) { return inds[i] < inds[j]; } + return vec[i] > vec[j]; + } + return ik < jk; + }); + } + return p; + } + + template + void apply_permutation(std::vector& vec, const std::vector& p) // NOLINT + { + for (auto i = IdxT(vec.size()) - 1; i > 0; i--) { + auto j = p[i]; + while (j > i) + j = p[j]; + std::swap(vec[j], vec[i]); + } + } +}; + +template +using Params = std::tuple; + +template typename ParamsReader> +struct SelectK // NOLINT + : public testing::TestWithParam::params_t> { + const select::params spec; + const select::Algo algo; + typename ParamsReader::io_t ref; + io_computed res; + + explicit SelectK(Params::io_t> ps) + : spec(std::get<0>(ps)), + algo(std::get<1>(ps)), // NOLINT + ref(std::get<2>(ps)), // NOLINT + res(spec, algo, ref.get_in_dists(), ref.get_in_ids()) // NOLINT + { + } + + explicit SelectK(typename ParamsReader::params_t ps) + : SelectK(ParamsReader::read(ps)) + { + } + + SelectK() + : SelectK(testing::TestWithParam::params_t>::GetParam()) + { + } + + void run() + { + if (ref.not_supported || res.not_supported) { GTEST_SKIP(); } + ASSERT_TRUE(hostVecMatch(ref.get_out_dists(), res.get_out_dists(), Compare())); + + // If the dists (keys) are the same, different corresponding ids may end up in the selection due + // to non-deterministic nature of some implementations. 
+ auto& in_ids = ref.get_in_ids(); + auto& in_dists = ref.get_in_dists(); + auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) { + if (i == j) return true; + auto ix_i = static_cast(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin()); + auto ix_j = static_cast(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin()); + if (static_cast(ix_i) >= in_ids.size() || static_cast(ix_j) >= in_ids.size()) + return false; + auto dist_i = in_dists[ix_i]; + auto dist_j = in_dists[ix_j]; + if (dist_i == dist_j) return true; + std::cout << "ERROR: ref[" << ix_i << "] = " << dist_i << " != " + << "res[" << ix_j << "] = " << dist_j << std::endl; + return false; + }; + ASSERT_TRUE(hostVecMatch(ref.get_out_ids(), res.get_out_ids(), compare_ids)); + } +}; + +template +struct params_simple { + using io_t = io_simple; + using input_t = + std::tuple, std::vector, std::vector>; + using params_t = std::tuple; + + static auto read(params_t ps) -> Params + { + auto ins = std::get<0>(ps); + auto algo = std::get<1>(ps); + return std::make_tuple( + std::get<0>(ins), + algo, + io_simple( + std::get<0>(ins), std::get<1>(ins), std::get<2>(ins), std::get<3>(ins))); + } +}; + +auto inputs_simple_f = testing::Values( + params_simple::input_t( + {5, 5, 5, true, true}, + {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, + 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, + {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}, + {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}), + params_simple::input_t( + {5, 5, 3, true, true}, + {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, + 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, + {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}, + {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}), + params_simple::input_t( + {5, 5, 5, true, false}, + {5.0, 
4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, + 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, + {1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0, 1.0, 2.0, 3.0, 4.0, 5.0}, + {4, 3, 2, 1, 0, 0, 1, 2, 3, 4, 3, 0, 1, 4, 2, 4, 2, 1, 3, 0, 0, 2, 1, 4, 3}), + params_simple::input_t( + {5, 5, 3, true, false}, + {5.0, 4.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 3.0, 5.0, + 1.0, 4.0, 5.0, 3.0, 2.0, 4.0, 1.0, 1.0, 3.0, 2.0, 5.0, 4.0}, + {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}, + {4, 3, 2, 0, 1, 2, 3, 0, 1, 4, 2, 1, 0, 2, 1}), + params_simple::input_t( + {5, 7, 3, true, true}, + {5.0, 4.0, 3.0, 2.0, 1.3, 7.5, 19.0, 9.0, 2.0, 3.0, 3.0, 5.0, 6.0, 4.0, 2.0, 3.0, 5.0, 1.0, + 4.0, 1.0, 1.0, 5.0, 7.0, 2.5, 4.0, 7.0, 8.0, 8.0, 1.0, 3.0, 2.0, 5.0, 4.0, 1.1, 1.2}, + {1.3, 2.0, 3.0, 2.0, 3.0, 3.0, 1.0, 1.0, 1.0, 2.5, 4.0, 5.0, 1.0, 1.1, 1.2}, + {4, 3, 2, 1, 2, 3, 3, 5, 6, 2, 3, 0, 0, 5, 6}), + params_simple::input_t( + {1, 7, 3, true, true}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {3, 5, 6}), + params_simple::input_t( + {1, 7, 3, false, false}, {2.0, 3.0, 5.0, 1.0, 4.0, 1.0, 1.0}, {5.0, 4.0, 3.0}, {2, 4, 1}), + params_simple::input_t( + {1, 7, 3, false, true}, {2.0, 3.0, 5.0, 9.0, 4.0, 9.0, 9.0}, {9.0, 9.0, 9.0}, {3, 5, 6}), + params_simple::input_t( + {1, 130, 5, false, true}, + {19, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20}, + {20, 19, 18, 17, 16}, + {129, 0, 117, 116, 115}), + params_simple::input_t( + {1, 130, 15, false, true}, + {19, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 
0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 4, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 20}, + {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6}, + {129, 0, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105})); + +using SimpleFloatInt = SelectK; +TEST_P(SimpleFloatInt, Run) { run(); } // NOLINT +INSTANTIATE_TEST_CASE_P( // NOLINT + SelectK, + SimpleFloatInt, + testing::Combine(inputs_simple_f, + testing::Values(select::Algo::kPublicApi, + select::Algo::kRadix8bits, + select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass, + select::Algo::kWarpImmediate, + select::Algo::kWarpFiltered, + select::Algo::kWarpDistributed))); + +template +struct with_ref { + template + struct params_random { + using io_t = io_computed; + using params_t = std::tuple; + + static auto read(params_t ps) -> Params + { + auto spec = std::get<0>(ps); + auto algo = std::get<1>(ps); + std::vector dists(spec.len * spec.batch_size); + + raft::resources handle; + { + auto s = resource::get_cuda_stream(handle); + rmm::device_uvector dists_d(spec.len * spec.batch_size, s); + raft::random::RngState r(42); + normal(handle, r, dists_d.data(), dists_d.size(), KeyT(10.0), KeyT(100.0)); + update_host(dists.data(), dists_d.data(), dists_d.size(), s); + s.synchronize(); + } + + return std::make_tuple(spec, algo, io_computed(spec, RefAlgo, dists)); + } + }; +}; + +} // namespace raft::matrix diff --git a/cpp/test/matrix/select_large_k.cu b/cpp/test/matrix/select_large_k.cu new file mode 100644 index 0000000000..2772e84eb3 --- /dev/null +++ b/cpp/test/matrix/select_large_k.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "select_k.cuh" + +namespace raft::matrix { + +auto inputs_random_largek = testing::Values(select::params{100, 100000, 1000, true}, + select::params{100, 100000, 2000, false}, + select::params{100, 100000, 100000, true, false}, + select::params{100, 100000, 2048, false}, + select::params{100, 100000, 1237, true}); + +using ReferencedRandomFloatSizeT = + SelectK::params_random>; +TEST_P(ReferencedRandomFloatSizeT, LargeK) { run(); } // NOLINT +INSTANTIATE_TEST_CASE_P(SelectK, // NOLINT + ReferencedRandomFloatSizeT, + testing::Combine(inputs_random_largek, + testing::Values(select::Algo::kRadix11bits, + select::Algo::kRadix11bitsExtraPass))); + +} // namespace raft::matrix diff --git a/cpp/test/matrix/slice.cu b/cpp/test/matrix/slice.cu index 332db379b7..fbf735aaf7 100644 --- a/cpp/test/matrix/slice.cu +++ b/cpp/test/matrix/slice.cu @@ -29,24 +29,29 @@ template struct SliceInputs { int rows, cols; unsigned long long int seed; + bool rowMajor; }; template ::std::ostream& operator<<(::std::ostream& os, const SliceInputs& I) { - os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << '}' << std::endl; + os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << ", " << I.rowMajor << '}' + << std::endl; return os; } // Col-major slice reference test template -void naiveSlice(const Type* in, Type* out, int rows, int cols, int x1, int y1, int x2, int y2) +void naiveSlice( + const Type* in, Type* 
out, int in_lda, int x1, int y1, int x2, int y2, bool row_major) { - int out_rows = x2 - x1; - // int out_cols = y2 - y1; + int out_lda = row_major ? y2 - y1 : x2 - x1; for (int j = y1; j < y2; ++j) { for (int i = x1; i < x2; ++i) { - out[(i - x1) + (j - y1) * out_rows] = in[i + j * rows]; + if (row_major) + out[(i - x1) * out_lda + (j - y1)] = in[j + i * in_lda]; + else + out[(i - x1) + (j - y1) * out_lda] = in[i + j * in_lda]; } } } @@ -67,6 +72,7 @@ class SliceTest : public ::testing::TestWithParam> { std::default_random_engine dre(rd()); raft::random::RngState r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; + auto lda = params.rowMajor ? cols : rows; uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); std::uniform_int_distribution rowGenerator(0, (rows / 2) - 1); @@ -83,12 +89,19 @@ class SliceTest : public ::testing::TestWithParam> { std::vector h_data(rows * cols); raft::update_host(h_data.data(), data.data(), rows * cols, stream); - naiveSlice(h_data.data(), exp_result.data(), rows, cols, row1, col1, row2, col2); - auto input = + naiveSlice(h_data.data(), exp_result.data(), lda, row1, col1, row2, col2, params.rowMajor); + auto input_F = raft::make_device_matrix_view(data.data(), rows, cols); - auto output = raft::make_device_matrix_view( + auto output_F = raft::make_device_matrix_view( d_act_result.data(), row2 - row1, col2 - col1); - slice(handle, input, output, slice_coordinates(row1, col1, row2, col2)); + auto input_C = + raft::make_device_matrix_view(data.data(), rows, cols); + auto output_C = raft::make_device_matrix_view( + d_act_result.data(), row2 - row1, col2 - col1); + if (params.rowMajor) + slice(handle, input_C, output_C, slice_coordinates(row1, col1, row2, col2)); + else + slice(handle, input_F, output_F, slice_coordinates(row1, col1, row2, col2)); raft::update_host(act_result.data(), d_act_result.data(), d_act_result.size(), stream); resource::sync_stream(handle, stream); @@ -104,26 +117,26 @@ class SliceTest 
: public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = {{32, 1024, 1234ULL}, - {64, 1024, 1234ULL}, - {128, 1024, 1234ULL}, - {256, 1024, 1234ULL}, - {512, 512, 1234ULL}, - {1024, 32, 1234ULL}, - {1024, 64, 1234ULL}, - {1024, 128, 1234ULL}, - {1024, 256, 1234ULL}}; +const std::vector> inputsf = {{32, 1024, 1234ULL, true}, + {64, 1024, 1234ULL, false}, + {128, 1024, 1234ULL, true}, + {256, 1024, 1234ULL, false}, + {512, 512, 1234ULL, true}, + {1024, 32, 1234ULL, false}, + {1024, 64, 1234ULL, true}, + {1024, 128, 1234ULL, false}, + {1024, 256, 1234ULL, true}}; const std::vector> inputsd = { - {32, 1024, 1234ULL}, - {64, 1024, 1234ULL}, - {128, 1024, 1234ULL}, - {256, 1024, 1234ULL}, - {512, 512, 1234ULL}, - {1024, 32, 1234ULL}, - {1024, 64, 1234ULL}, - {1024, 128, 1234ULL}, - {1024, 256, 1234ULL}, + {32, 1024, 1234ULL, true}, + {64, 1024, 1234ULL, false}, + {128, 1024, 1234ULL, true}, + {256, 1024, 1234ULL, false}, + {512, 512, 1234ULL, true}, + {1024, 32, 1234ULL, false}, + {1024, 64, 1234ULL, true}, + {1024, 128, 1234ULL, false}, + {1024, 256, 1234ULL, true}, }; typedef SliceTest SliceTestF; diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 63c8114de6..89cb070afc 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -45,16 +45,16 @@ namespace raft::neighbors::experimental::cagra { namespace { // For sort_knn_graph test template -void RandomSuffle(raft::host_matrix_view index) +void RandomSuffle(raft::host_matrix_view index) { for (IdxT i = 0; i < index.extent(0); i++) { uint64_t rand = i; IdxT* const row_ptr = index.data_handle() + i * index.extent(1); for (unsigned j = 0; j < index.extent(1); j++) { // Swap two indices at random - rand = raft::neighbors::experimental::cagra::detail::device::xorshift64(rand); + rand = raft::neighbors::cagra::detail::device::xorshift64(rand); const auto i0 = rand % index.extent(1); - rand = 
raft::neighbors::experimental::cagra::detail::device::xorshift64(rand); + rand = raft::neighbors::cagra::detail::device::xorshift64(rand); const auto i1 = rand % index.extent(1); const auto tmp = row_ptr[i0]; @@ -65,8 +65,8 @@ void RandomSuffle(raft::host_matrix_view index) } template -testing::AssertionResult CheckOrder(raft::host_matrix_view index_test, - raft::host_matrix_view dataset) +testing::AssertionResult CheckOrder(raft::host_matrix_view index_test, + raft::host_matrix_view dataset) { for (IdxT i = 0; i < index_test.extent(0); i++) { const DatatT* const base_vec = dataset.data_handle() + i * dataset.extent(1); @@ -134,7 +134,7 @@ struct AnnCagraInputs { int max_queries; int team_size; int itopk_size; - int num_parents; + int search_width; raft::distance::DistanceType metric; bool host_dataset; // std::optional @@ -146,7 +146,7 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p) std::vector algo = {"single-cta", "multi_cta", "multi_kernel", "auto"}; os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim << ", k=" << p.k << ", " << algo.at((int)p.algo) << ", max_queries=" << p.max_queries - << ", itopk_size=" << p.itopk_size << ", num_parents=" << p.num_parents + << ", itopk_size=" << p.itopk_size << ", search_width=" << p.search_width << ", metric=" << static_cast(p.metric) << (p.host_dataset ? 
", host" : ", device") << '}' << std::endl; return os; @@ -166,10 +166,6 @@ class AnnCagraTest : public ::testing::TestWithParam { protected: void testCagra() { - if (ps.dim * sizeof(DataT) % 8 != 0) { - GTEST_SKIP() - << "CAGRA requires the input data rows to be aligned at least to 8 bytes for now."; - } size_t queries_size = ps.n_queries * ps.k; std::vector indices_Cagra(queries_size); std::vector indices_naive(queries_size); @@ -179,7 +175,8 @@ class AnnCagraTest : public ::testing::TestWithParam { { rmm::device_uvector distances_naive_dev(queries_size, stream_); rmm::device_uvector indices_naive_dev(queries_size, stream_); - naive_knn(distances_naive_dev.data(), + naive_knn(handle_, + distances_naive_dev.data(), indices_naive_dev.data(), search_queries.data(), database.data(), @@ -187,8 +184,7 @@ class AnnCagraTest : public ::testing::TestWithParam { ps.n_rows, ps.dim, ps.k, - ps.metric, - stream_); + ps.metric); update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); resource::sync_stream(handle_); @@ -207,15 +203,15 @@ class AnnCagraTest : public ::testing::TestWithParam { search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - auto database_view = raft::make_device_matrix_view( + auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); { cagra::index index(handle_); if (ps.host_dataset) { - auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); - auto database_host_view = raft::make_host_matrix_view( + auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); index = cagra::build(handle_, index_params, database_host_view); } else { @@ -225,21 +221,21 @@ 
class AnnCagraTest : public ::testing::TestWithParam { } auto index = cagra::deserialize(handle_, "cagra_index"); - auto search_queries_view = raft::make_device_matrix_view( + auto search_queries_view = raft::make_device_matrix_view( search_queries.data(), ps.n_queries, ps.dim); auto indices_out_view = - raft::make_device_matrix_view(indices_dev.data(), ps.n_queries, ps.k); - auto dists_out_view = - raft::make_device_matrix_view(distances_dev.data(), ps.n_queries, ps.k); + raft::make_device_matrix_view(indices_dev.data(), ps.n_queries, ps.k); + auto dists_out_view = raft::make_device_matrix_view( + distances_dev.data(), ps.n_queries, ps.k); cagra::search( handle_, search_params, index, search_queries_view, indices_out_view, dists_out_view); - update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_); update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_); resource::sync_stream(handle_); } - // for (int i = 0; i < ps.n_queries; i++) { + + // for (int i = 0; i < min(ps.n_queries, 10); i++) { // // std::cout << "query " << i << std::end; // print_vector("T", indices_naive.data() + i * ps.k, ps.k, std::cout); // print_vector("C", indices_Cagra.data() + i * ps.k, ps.k, std::cout); @@ -247,7 +243,7 @@ class AnnCagraTest : public ::testing::TestWithParam { // print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout); // } double min_recall = ps.min_recall; - ASSERT_TRUE(eval_neighbours(indices_naive, + EXPECT_TRUE(eval_neighbours(indices_naive, indices_Cagra, distances_naive, distances_Cagra, @@ -255,7 +251,7 @@ class AnnCagraTest : public ::testing::TestWithParam { ps.k, 0.001, min_recall)); - ASSERT_TRUE(eval_distances(handle_, + EXPECT_TRUE(eval_distances(handle_, database.data(), search_queries.data(), indices_dev.data(), @@ -271,11 +267,8 @@ class AnnCagraTest : public ::testing::TestWithParam { void SetUp() override { - std::cout << "Resizing database: " << ps.n_rows * ps.dim << std::endl; 
database.resize(((size_t)ps.n_rows) * ps.dim, stream_); - std::cout << "Done.\nResizing queries" << std::endl; search_queries.resize(ps.n_queries * ps.dim, stream_); - std::cout << "Done.\nRuning rng" << std::endl; raft::random::Rng r(1234ULL); if constexpr (std::is_same{}) { r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); @@ -315,17 +308,17 @@ class AnnCagraSortTest : public ::testing::TestWithParam { { { // Step 1: Build a sorted KNN graph by CAGRA knn build - auto database_view = raft::make_device_matrix_view( + auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); - auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy( database_host.data_handle(), database.data(), database.size(), handle_.get_stream()); - auto database_host_view = raft::make_host_matrix_view( + auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); cagra::index_params index_params; auto knn_graph = - raft::make_host_matrix(ps.n_rows, index_params.intermediate_graph_degree); + raft::make_host_matrix(ps.n_rows, index_params.intermediate_graph_degree); if (ps.host_dataset) { cagra::build_knn_graph(handle_, database_host_view, knn_graph.view()); @@ -373,34 +366,34 @@ class AnnCagraSortTest : public ::testing::TestWithParam { inline std::vector generate_inputs() { - // Todo(tfeher): MULTI_CTA tests a bug, consider disabling that mode. 
+ // TODO(tfeher): test MULTI_CTA kernel with search_width > 1 to allow multiple CTA per queries std::vector inputs = raft::util::itertools::product( {100}, {1000}, - {8}, - {1, 16, 33}, // k - {search_algo::SINGLE_CTA, search_algo::MULTI_KERNEL}, - {1, 10, 100}, // query size + {1, 8, 17}, + {1, 16}, // k + {search_algo::SINGLE_CTA, search_algo::MULTI_CTA, search_algo::MULTI_KERNEL}, + {0, 1, 10, 100}, // query size {0}, - {64}, + {256}, {1}, {raft::distance::DistanceType::L2Expanded}, {false}, {0.995}); - auto inputs2 = - raft::util::itertools::product({100}, - {1000}, - {8, 64, 128, 192, 256, 512, 1024}, // dim - {16}, - {search_algo::AUTO}, - {10}, - {0}, - {64}, - {1}, - {raft::distance::DistanceType::L2Expanded}, - {false}, - {0.995}); + auto inputs2 = raft::util::itertools::product( + {100}, + {1000}, + {1, 3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // dim + {16}, // k + {search_algo::AUTO}, + {10}, + {0}, + {64}, + {1}, + {raft::distance::DistanceType::L2Expanded}, + {false}, + {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); inputs2 = raft::util::itertools::product({100}, diff --git a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh new file mode 100644 index 0000000000..f61e476652 --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include // RAFT_EXPLICIT + +namespace raft::neighbors::cagra::detail { + +namespace multi_cta_search { +#define instantiate_kernel_selection(TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + extern template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t block_size, \ + uint32_t result_buffer_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + uint32_t num_cta_per_query, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_kernel_selection(32, 1024, float, uint64_t, float); +instantiate_kernel_selection(8, 128, float, uint64_t, float); +instantiate_kernel_selection(16, 256, float, uint64_t, float); +instantiate_kernel_selection(32, 512, float, uint64_t, float); + +#undef instantiate_kernel_selection +} // namespace multi_cta_search + +namespace single_cta_search { + +#define instantiate_single_cta_select_and_run( \ + TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T) \ + extern template void select_and_run( \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view graph, \ + INDEX_T* const topk_indices_ptr, \ + DISTANCE_T* const topk_distances_ptr, \ + const DATA_T* const queries_ptr, \ + const uint32_t num_queries, \ + const INDEX_T* dev_seed_ptr, \ + uint32_t* const num_executed_iterations, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t 
hash_bitlen, \ + INDEX_T* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + uint32_t num_random_samplings, \ + uint64_t rand_xor_mask, \ + uint32_t num_seeds, \ + size_t itopk_size, \ + size_t search_width, \ + size_t min_iterations, \ + size_t max_iterations, \ + cudaStream_t stream); + +instantiate_single_cta_select_and_run(32, 1024, float, uint64_t, float); +instantiate_single_cta_select_and_run(8, 128, float, uint64_t, float); +instantiate_single_cta_select_and_run(16, 256, float, uint64_t, float); +instantiate_single_cta_select_and_run(32, 512, float, uint64_t, float); + +} // namespace single_cta_search +} // namespace raft::neighbors::cagra::detail diff --git a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu index e473a72b2b..fa3d76d066 100644 --- a/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_cagra/test_float_int64_t.cu @@ -16,8 +16,8 @@ #include -#undef RAFT_EXPLICIT_INSTANTIATE_ONLY #include "../ann_cagra.cuh" +#include "search_kernel_uint64_t.cuh" namespace raft::neighbors::experimental::cagra { diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 88bf53280b..d72d73680a 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -17,15 +17,27 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -36,6 +48,7 @@ #include +#include #include #include @@ -76,7 +89,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { { } - protected: void testIVFFlat() { size_t queries_size = ps.num_queries * ps.k; @@ -88,7 +100,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { { rmm::device_uvector distances_naive_dev(queries_size, stream_); 
rmm::device_uvector indices_naive_dev(queries_size, stream_); - naive_knn(distances_naive_dev.data(), + naive_knn(handle_, + distances_naive_dev.data(), indices_naive_dev.data(), search_queries.data(), database.data(), @@ -96,8 +109,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { ps.num_db_vecs, ps.dim, ps.k, - ps.metric, - stream_); + ps.metric); update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); resource::sync_stream(handle_); @@ -264,6 +276,136 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { } } + void testPacker() + { + ivf_flat::index_params index_params; + ivf_flat::search_params search_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = false; + search_params.n_probes = ps.nprobe; + + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + + auto idx = ivf_flat::build(handle_, index_params, database_view); + + const std::optional> no_opt = std::nullopt; + index extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx); + + auto list_sizes = raft::make_host_vector(idx.n_lists()); + update_host(list_sizes.data_handle(), + extend_index.list_sizes().data_handle(), + extend_index.n_lists(), + stream_); + resource::sync_stream(handle_); + + auto& lists = idx.lists(); + + // conservative memory allocation for codepacking + auto list_device_spec = list_spec{idx.dim(), false}; + + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; + + ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0); + } + + idx.recompute_internal_state(handle_); + + using interleaved_group = Pow2; + + for (uint32_t label = 
0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; + + if (list_size > 0) { + uint32_t padded_list_size = interleaved_group::roundUp(list_size); + uint32_t n_elems = padded_list_size * idx.dim(); + auto list_data = lists[label]->data; + auto list_inds = extend_index.lists()[label]->indices; + + // fetch the flat codes + auto flat_codes = make_device_matrix(handle_, list_size, idx.dim()); + + matrix::gather( + handle_, + make_device_matrix_view( + (const DataT*)database.data(), static_cast(ps.num_db_vecs), idx.dim()), + make_device_vector_view((const IdxT*)list_inds.data_handle(), + list_size), + flat_codes.view()); + + helpers::codepacker::pack( + handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view()); + + { + auto mask = make_device_vector(handle_, n_elems); + + linalg::map_offset(handle_, + mask.view(), + [dim = idx.dim(), + list_size, + padded_list_size, + chunk_size = util::FastIntDiv(idx.veclen())] __device__(auto i) { + uint32_t max_group_offset = interleaved_group::roundDown(list_size); + if (i < max_group_offset * dim) { return true; } + uint32_t surplus = (i - max_group_offset * dim); + uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size); + return ingroup_id < (list_size - max_group_offset); + }); + + // ensure that the correct number of indices are masked out + ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_), + mask.data_handle(), + mask.data_handle() + n_elems, + 0) == list_size * ps.dim); + + auto packed_list_data = make_device_vector(handle_, n_elems); + + linalg::map_offset(handle_, + packed_list_data.view(), + [mask = mask.data_handle(), + list_data = list_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return list_data[i]; + return DataT{0}; + }); + + auto extend_data = extend_index.lists()[label]->data; + auto extend_data_filtered = make_device_vector(handle_, n_elems); + linalg::map_offset(handle_, + extend_data_filtered.view(), + [mask 
= mask.data_handle(), + extend_data = extend_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return extend_data[i]; + return DataT{0}; + }); + + ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(), + extend_data_filtered.data_handle(), + n_elems, + raft::Compare(), + stream_)); + } + + auto unpacked_flat_codes = + make_device_matrix(handle_, list_size, idx.dim()); + + helpers::codepacker::unpack( + handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view()); + + ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), + unpacked_flat_codes.data_handle(), + list_size * ps.dim, + raft::Compare(), + stream_)); + } + } + } + void SetUp() override { database.resize(ps.num_db_vecs * ps.dim, stream_); diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index f0988ca988..3bfea283e5 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -21,7 +21,11 @@ namespace raft::neighbors::ivf_flat { typedef AnnIVFFlatTest AnnIVFFlatTestF; -TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); } +TEST_P(AnnIVFFlatTestF, AnnIVFFlat) +{ + this->testIVFFlat(); + this->testPacker(); +} INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs)); diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index de4453a034..e03d09ae50 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -186,7 +186,8 @@ class ivf_pq_test : public ::testing::TestWithParam { size_t queries_size = size_t{ps.num_queries} * size_t{ps.k}; rmm::device_uvector distances_naive_dev(queries_size, stream_); rmm::device_uvector indices_naive_dev(queries_size, stream_); - naive_knn(distances_naive_dev.data(), + naive_knn(handle_, + distances_naive_dev.data(), indices_naive_dev.data(), search_queries.data(), database.data(), @@ -194,8 +195,7 @@ class 
ivf_pq_test : public ::testing::TestWithParam { ps.num_db_vecs, ps.dim, ps.k, - ps.index_params.metric, - stream_); + ps.index_params.metric); distances_ref.resize(queries_size); update_host(distances_ref.data(), distances_naive_dev.data(), queries_size, stream_); indices_ref.resize(queries_size); diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu index 5d63338b45..6030e2a1a6 100644 --- a/cpp/test/neighbors/selection.cu +++ b/cpp/test/neighbors/selection.cu @@ -441,7 +441,7 @@ auto inputs_random_largesize = testing::Values(SelectTestSpec{100, 100000, 1, tr SelectTestSpec{1, 100000000, 256, false, false}); auto inputs_random_largek = testing::Values(SelectTestSpec{100, 100000, 1000, true}, - SelectTestSpec{100, 100000, 2000, true}, + SelectTestSpec{100, 100000, 2000, false}, SelectTestSpec{100, 100000, 100000, true, false}, SelectTestSpec{100, 100000, 2048, false}, SelectTestSpec{100, 100000, 1237, true}); @@ -482,6 +482,11 @@ INSTANTIATE_TEST_CASE_P(SelectionTest, * SelectionTest/ReferencedRandomFloatSizeT.LargeK/0 * Indicices do not match! 
ref[91628] = 131.359 != res[36504] = 158.438 * Actual: false (actual=36504 != expected=91628 @38999; + * + * SelectionTest/ReferencedRandomFloatSizeT.LargeK/1 + * ERROR: ref[57977] = 58.9079 != res[21973] = 54.9354 + * Actual: false (actual=21973 != expected=57977 @107999; + * */ typedef SelectionTest::params_random> ReferencedRandomFloatSizeT; diff --git a/cpp/test/random/rng_discrete.cu b/cpp/test/random/rng_discrete.cu index 799f44735e..d1293f34ea 100644 --- a/cpp/test/random/rng_discrete.cu +++ b/cpp/test/random/rng_discrete.cu @@ -193,15 +193,16 @@ const std::vector> inputs_i64 = { {1, 10000, 5, 5, GenPhilox, 1234ULL}, }; -#define RNG_DISCRETE_TEST(test_type, test_name, test_inputs) \ - typedef RAFT_DEPAREN(test_type) test_name; \ - TEST_P(test_name, Result) \ - { \ - ASSERT_TRUE(devArrMatchHost(exp_histogram.data(), \ - histogram.data(), \ - exp_histogram.size(), \ - CompareApprox(tolerance))); \ - } \ +#define RNG_DISCRETE_TEST(test_type, test_name, test_inputs) \ + typedef RAFT_DEPAREN(test_type) test_name; \ + TEST_P(test_name, Result) \ + { \ + ASSERT_TRUE(devArrMatchHost(exp_histogram.data(), \ + histogram.data(), \ + exp_histogram.size(), \ + CompareApprox(tolerance), \ + stream)); \ + } \ INSTANTIATE_TEST_CASE_P(ReduceTests, test_name, ::testing::ValuesIn(test_inputs)) RNG_DISCRETE_TEST((RngDiscreteTest), RngDiscreteTestI32FI32, inputs_i32); diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 2b7e8233a5..c729334d00 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -245,7 +245,7 @@ class SparseDistanceCOOSPMVTest // output data rmm::device_uvector out_dists, out_dists_ref; - raft::sparse::distance::distances_config_t dist_config; + raft::sparse::distance::detail::distances_config_t dist_config; SparseDistanceCOOSPMVInputs params; }; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index debb439345..6b4e5c7cfa 100644 --- a/cpp/test/sparse/distance.cu +++ 
b/cpp/test/sparse/distance.cu @@ -61,7 +61,6 @@ class SparseDistanceTest public: SparseDistanceTest() : params(::testing::TestWithParam>::GetParam()), - dist_config(handle), indptr(0, resource::get_cuda_stream(handle)), indices(0, resource::get_cuda_stream(handle)), data(0, resource::get_cuda_stream(handle)), @@ -74,24 +73,25 @@ class SparseDistanceTest { make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr.data(); - dist_config.b_indices = indices.data(); - dist_config.b_data = data.data(); - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr.data(); - dist_config.a_indices = indices.data(); - dist_config.a_data = data.data(); - - int out_size = dist_config.a_nrows * dist_config.b_nrows; + int out_size = static_cast(params.indptr_h.size() - 1) * + static_cast(params.indptr_h.size() - 1); out_dists.resize(out_size, resource::get_cuda_stream(handle)); - pairwiseDistance(out_dists.data(), dist_config, params.metric, params.metric_arg); + auto out = raft::make_device_matrix_view( + out_dists.data(), + static_cast(params.indptr_h.size() - 1), + static_cast(params.indptr_h.size() - 1)); + + auto x_structure = raft::make_device_compressed_structure_view( + indptr.data(), + indices.data(), + static_cast(params.indptr_h.size() - 1), + params.n_cols, + static_cast(params.indices_h.size())); + auto x = raft::make_device_csr_matrix_view(data.data(), x_structure); + + pairwise_distance(handle, x, x, out, params.metric, params.metric_arg); RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); } @@ -127,7 +127,7 @@ class SparseDistanceTest update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), - resource::get_cuda_stream(dist_config.handle)); + resource::get_cuda_stream(handle)); } 
raft::resources handle; @@ -140,7 +140,6 @@ class SparseDistanceTest rmm::device_uvector out_dists, out_dists_ref; SparseDistanceInputs params; - raft::sparse::distance::distances_config_t dist_config; }; const std::vector> inputs_i32_f = { diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index 87cebd3519..7b4736a08c 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -157,6 +157,8 @@ class GramMatrixTest : public ::testing::TestWithParam { raft::random::Rng r(42137ULL); r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } @@ -204,7 +206,6 @@ class GramMatrixTest : public ::testing::TestWithParam { raft::update_device(indices, indices_host.data(), nnz, stream); raft::update_device(data, data_host.data(), nnz, stream); resource::sync_stream(handle, stream); - return nnz; } @@ -273,7 +274,9 @@ class GramMatrixTest : public ::testing::TestWithParam { (*kernel)(handle, x1_csr, x2_csr, out_span); } } - + // Something in gram is executing not on the 'stream' and therefore + // a full device sync is required + RAFT_CUDA_TRY(cudaDeviceSynchronize()); naiveGramMatrixKernel(params.n1, params.n2, params.n_cols, @@ -287,11 +290,10 @@ class GramMatrixTest : public ::testing::TestWithParam { params.kernel, stream, handle); - resource::sync_stream(handle, stream); ASSERT_TRUE(raft::devArrMatchHost( - gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f))); + gram_host.data(), gram.data(), gram.size(), raft::CompareApprox(1e-6f), stream)); } raft::resources handle; diff --git a/cpp/test/sparse/neighbors/connect_components.cu b/cpp/test/sparse/neighbors/connect_components.cu deleted file mode 100644 index 373963b653..0000000000 --- a/cpp/test/sparse/neighbors/connect_components.cu +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright 
(c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// XXX: We allow the instantiation of fused_l2_nn here: -// raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); -// raft::linkage::connect_components( -// handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); -// -// TODO: consider adding this to libraft.so or creating an instance in a -// separate translation unit for this test. -#undef RAFT_EXPLICIT_INSTANTIATE_ONLY - -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "../../test_utils.cuh" - -namespace raft { -namespace sparse { - -using namespace std; - -template -struct ConnectComponentsInputs { - value_idx n_row; - value_idx n_col; - std::vector data; - - int c; -}; - -template -class ConnectComponentsTest - : public ::testing::TestWithParam> { - protected: - void basicTest() - { - raft::resources handle; - - auto stream = resource::get_cuda_stream(handle); - - params = ::testing::TestWithParam>::GetParam(); - - raft::sparse::COO out_edges(resource::get_cuda_stream(handle)); - - rmm::device_uvector data(params.n_row * params.n_col, - resource::get_cuda_stream(handle)); - - raft::copy(data.data(), params.data.data(), data.size(), resource::get_cuda_stream(handle)); - - rmm::device_uvector indptr(params.n_row + 1, stream); - - /** - * 
1. Construct knn graph - */ - raft::sparse::COO knn_graph_coo(stream); - - raft::sparse::neighbors::knn_graph(handle, - data.data(), - params.n_row, - params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, - knn_graph_coo, - params.c); - - raft::sparse::convert::sorted_coo_to_csr( - knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream); - - /** - * 2. Construct MST, sorted by weights - */ - rmm::device_uvector colors(params.n_row, stream); - - auto mst_coo = raft::mst::mst(handle, - indptr.data(), - knn_graph_coo.cols(), - knn_graph_coo.vals(), - params.n_row, - knn_graph_coo.nnz, - colors.data(), - stream, - false, - true); - - /** - * 3. connect_components to fix connectivities - */ - raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); - raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); - - /** - * Construct final edge list - */ - rmm::device_uvector indptr2(params.n_row + 1, stream); - - raft::sparse::convert::sorted_coo_to_csr( - out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, stream); - - auto output_mst = raft::mst::mst(handle, - indptr2.data(), - out_edges.cols(), - out_edges.vals(), - params.n_row, - out_edges.nnz, - colors.data(), - stream, - false, - false); - - resource::sync_stream(handle, stream); - - // The sum of edges for both MST runs should be n_rows - 1 - final_edges = output_mst.n_edges + mst_coo.n_edges; - } - - void SetUp() override { basicTest(); } - - void TearDown() override {} - - protected: - ConnectComponentsInputs params; - - value_idx final_edges; -}; - -const std::vector> fix_conn_inputsf2 = { - // Test n_clusters == n_points - {10, - 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, - 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, - 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 
0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, - 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, - 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, - 0.76166195, 0.66613745}, - -1}, - // Test n_points == 100 - {100, - 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 
2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 
2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 
7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 
8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 
6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 
7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 
9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 
2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 - - }, - -4}}; - -typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) -{ - /** - * Verify the src & dst vertices on each edge have different colors - */ - EXPECT_TRUE(final_edges == params.n_row - 1); -} - -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, - ConnectComponentsTestF_Int, - ::testing::ValuesIn(fix_conn_inputsf2)); -}; // namespace sparse -}; // end namespace raft diff --git a/cpp/test/sparse/neighbors/cross_component_nn.cu b/cpp/test/sparse/neighbors/cross_component_nn.cu new file mode 100644 index 0000000000..7cadf25e88 --- /dev/null +++ b/cpp/test/sparse/neighbors/cross_component_nn.cu @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// XXX: We allow the instantiation of masked_l2_nn here: +// raft::linkage::FixConnectivitiesRedOp red_op(params.n_row); +// raft::linkage::cross_component_nn( +// handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); +// +// TODO: consider adding this to libraft.so or creating an instance in a +// separate translation unit for this test. +// +// TODO: edge case testing. Reference: https://github.com/rapidsai/raft/issues/1669 + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../../test_utils.cuh" + +namespace raft { +namespace sparse { + +using namespace std; + +template +struct ConnectComponentsInputs { + value_idx n_row; + value_idx n_col; + std::vector data; + + int c; +}; + +template +class ConnectComponentsTest + : public ::testing::TestWithParam> { + protected: + void basicTest() + { + raft::resources handle; + + auto stream = resource::get_cuda_stream(handle); + + params = ::testing::TestWithParam>::GetParam(); + + raft::sparse::COO out_edges(resource::get_cuda_stream(handle)); + raft::sparse::COO out_edges_batched(resource::get_cuda_stream(handle)); + + rmm::device_uvector data(params.n_row * params.n_col, + resource::get_cuda_stream(handle)); + + raft::copy(data.data(), params.data.data(), data.size(), resource::get_cuda_stream(handle)); + + rmm::device_uvector indptr(params.n_row + 1, stream); + + /** + * 1. 
Construct knn graph + */ + raft::sparse::COO knn_graph_coo(stream); + + raft::sparse::neighbors::knn_graph(handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + knn_graph_coo, + params.c); + + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream); + + /** + * 2. Construct MST, sorted by weights + */ + rmm::device_uvector colors(params.n_row, stream); + + auto mst_coo = raft::mst::mst(handle, + indptr.data(), + knn_graph_coo.cols(), + knn_graph_coo.vals(), + params.n_row, + knn_graph_coo.nnz, + colors.data(), + stream, + false, + true); + + /** + * 3. cross_component_nn to fix connectivities + */ + raft::linkage::FixConnectivitiesRedOp red_op(params.n_row); + raft::linkage::cross_component_nn(handle, + out_edges, + data.data(), + colors.data(), + params.n_row, + params.n_col, + red_op, + params.n_row, + params.n_col); + + raft::linkage::cross_component_nn(handle, + out_edges_batched, + data.data(), + colors.data(), + params.n_row, + params.n_col, + red_op, + params.n_row / 2, + params.n_col / 2); + + ASSERT_TRUE(out_edges.nnz == out_edges_batched.nnz); + + ASSERT_TRUE( + devArrMatch(out_edges.rows(), out_edges_batched.rows(), out_edges.nnz, Compare())); + + ASSERT_TRUE( + devArrMatch(out_edges.cols(), out_edges_batched.cols(), out_edges.nnz, Compare())); + + ASSERT_TRUE(devArrMatch( + out_edges.vals(), out_edges_batched.vals(), out_edges.nnz, CompareApprox(1e-4))); + + /** + * Construct final edge list + */ + rmm::device_uvector indptr2(params.n_row + 1, stream); + + raft::sparse::convert::sorted_coo_to_csr( + out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, stream); + + auto output_mst = raft::mst::mst(handle, + indptr2.data(), + out_edges.cols(), + out_edges.vals(), + params.n_row, + out_edges.nnz, + colors.data(), + stream, + false, + false); + + resource::sync_stream(handle, stream); + + // The sum of edges for both MST 
runs should be n_rows - 1 + final_edges = output_mst.n_edges + mst_coo.n_edges; + } + + void SetUp() override { basicTest(); } + + void TearDown() override {} + + protected: + ConnectComponentsInputs params; + + value_idx final_edges; +}; + +const std::vector> fix_conn_inputsf2 = { + // Test n_clusters == n_points + {10, + 5, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + 0.76166195, 0.66613745}, + -1}, + // Test n_points == 100 + {100, + 10, + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 
7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 
2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 
9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 
5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 
6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 
2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 
5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 
7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 + + }, + -4}}; + +typedef ConnectComponentsTest ConnectComponentsTestF_Int; +TEST_P(ConnectComponentsTestF_Int, Result) +{ + /** + * Verify the src & dst vertices on each edge have different colors + */ + EXPECT_TRUE(final_edges == params.n_row - 1); +} + +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, + ConnectComponentsTestF_Int, + ::testing::ValuesIn(fix_conn_inputsf2)); + +template +struct MutualReachabilityFixConnectivitiesRedOp { + value_t* core_dists; + value_idx m; + + DI MutualReachabilityFixConnectivitiesRedOp() : m(0) {} + + MutualReachabilityFixConnectivitiesRedOp(value_t* core_dists_, value_idx m_) + : 
core_dists(core_dists_), m(m_){}; + + typedef typename raft::KeyValuePair KVP; + DI void operator()(value_idx rit, KVP* out, const KVP& other) const + { + if (rit < m && other.value < std::numeric_limits::max()) { + value_t core_dist_rit = core_dists[rit]; + value_t core_dist_other = max(core_dist_rit, max(core_dists[other.key], other.value)); + + value_t core_dist_out; + if (out->key > -1) { + core_dist_out = max(core_dist_rit, max(core_dists[out->key], out->value)); + } else { + core_dist_out = out->value; + } + + bool smaller = core_dist_other < core_dist_out; + out->key = smaller ? other.key : out->key; + out->value = smaller ? core_dist_other : core_dist_out; + } + } + + DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) const + { + if (rit < m && a.key > -1) { + value_t core_dist_rit = core_dists[rit]; + value_t core_dist_a = max(core_dist_rit, max(core_dists[a.key], a.value)); + + value_t core_dist_b; + if (b.key > -1) { + core_dist_b = max(core_dist_rit, max(core_dists[b.key], b.value)); + } else { + core_dist_b = b.value; + } + + return core_dist_a < core_dist_b ? 
KVP(a.key, core_dist_a) : KVP(b.key, core_dist_b); + } + + return b; + } + + DI void init(value_t* out, value_t maxVal) const { *out = maxVal; } + DI void init(KVP* out, value_t maxVal) const + { + out->key = -1; + out->value = maxVal; + } + + DI void init_key(value_t& out, value_idx idx) const { return; } + DI void init_key(KVP& out, value_idx idx) const { out.key = idx; } + + DI value_t get_value(KVP& out) const { return out.value; } + DI value_t get_value(value_t& out) const { return out; } + + void gather(const raft::resources& handle, value_idx* map) + { + auto tmp_core_dists = raft::make_device_vector(handle, m); + thrust::gather(raft::resource::get_thrust_policy(handle), + map, + map + m, + core_dists, + tmp_core_dists.data_handle()); + raft::copy_async( + core_dists, tmp_core_dists.data_handle(), m, raft::resource::get_cuda_stream(handle)); + } + + void scatter(const raft::resources& handle, value_idx* map) + { + auto tmp_core_dists = raft::make_device_vector(handle, m); + thrust::scatter(raft::resource::get_thrust_policy(handle), + core_dists, + core_dists + m, + map, + tmp_core_dists.data_handle()); + raft::copy_async( + core_dists, tmp_core_dists.data_handle(), m, raft::resource::get_cuda_stream(handle)); + } +}; + +template +struct ConnectComponentsMutualReachabilityInputs { + value_idx n_row; + value_idx n_col; + std::vector data; + std::vector core_dists; + std::vector colors; + std::vector expected_rows; + std::vector expected_cols; + std::vector expected_vals; +}; + +template +class ConnectComponentsEdgesTest + : public ::testing::TestWithParam> { + protected: + void basicTest() + { + raft::resources handle; + + auto stream = resource::get_cuda_stream(handle); + + params = ::testing::TestWithParam< + ConnectComponentsMutualReachabilityInputs>::GetParam(); + + raft::sparse::COO out_edges_unbatched(resource::get_cuda_stream(handle)); + raft::sparse::COO out_edges_batched(resource::get_cuda_stream(handle)); + + rmm::device_uvector data(params.n_row * 
params.n_col, + resource::get_cuda_stream(handle)); + rmm::device_uvector core_dists(params.n_row, resource::get_cuda_stream(handle)); + rmm::device_uvector colors(params.n_row, resource::get_cuda_stream(handle)); + + raft::copy(data.data(), params.data.data(), data.size(), resource::get_cuda_stream(handle)); + raft::copy(core_dists.data(), + params.core_dists.data(), + core_dists.size(), + resource::get_cuda_stream(handle)); + raft::copy( + colors.data(), params.colors.data(), colors.size(), resource::get_cuda_stream(handle)); + + /** + * 3. cross_component_nn to fix connectivities + */ + MutualReachabilityFixConnectivitiesRedOp red_op(core_dists.data(), + params.n_row); + + raft::linkage::cross_component_nn(handle, + out_edges_unbatched, + data.data(), + colors.data(), + params.n_row, + params.n_col, + red_op, + params.n_row, + params.n_col); + + raft::linkage::cross_component_nn(handle, + out_edges_batched, + data.data(), + colors.data(), + params.n_row, + params.n_col, + red_op, + 11, + 1); + + ASSERT_TRUE(out_edges_unbatched.nnz == out_edges_batched.nnz && + out_edges_unbatched.nnz == params.expected_rows.size()); + + ASSERT_TRUE(devArrMatch(out_edges_unbatched.rows(), + params.expected_rows.data(), + out_edges_unbatched.nnz, + Compare())); + + ASSERT_TRUE(devArrMatch(out_edges_unbatched.cols(), + params.expected_cols.data(), + out_edges_unbatched.nnz, + Compare())); + + ASSERT_TRUE(devArrMatch(out_edges_unbatched.vals(), + params.expected_vals.data(), + out_edges_unbatched.nnz, + CompareApprox(1e-4))); + + ASSERT_TRUE(devArrMatch(out_edges_batched.rows(), + params.expected_rows.data(), + out_edges_batched.nnz, + Compare())); + + ASSERT_TRUE(devArrMatch(out_edges_batched.cols(), + params.expected_cols.data(), + out_edges_batched.nnz, + Compare())); + + ASSERT_TRUE(devArrMatch(out_edges_batched.vals(), + params.expected_vals.data(), + out_edges_batched.nnz, + CompareApprox(1e-4))); + } + + void SetUp() override { basicTest(); } + + void TearDown() override {} + 
+ protected: + ConnectComponentsMutualReachabilityInputs params; +}; + +const std::vector> mr_fix_conn_inputsf2 = { + {100, + 2, + {-7.72642, -8.39496, 5.4534, 0.742305, -2.97867, 9.55685, 6.04267, 0.571319, -6.52184, + -6.31932, 3.64934, 1.40687, -2.17793, 9.98983, 4.42021, 2.33028, 4.73696, 2.94181, + -3.66019, 9.38998, -3.05358, 9.12521, -6.65217, -5.57297, -6.35769, -6.58313, -3.61553, + 7.81808, -1.77073, 9.18565, -7.95052, -6.39764, -6.60294, -6.05293, -2.58121, 10.0178, + -7.76348, -6.72638, -6.40639, -6.95294, -2.97262, 8.54856, -6.95673, -6.53896, -7.32614, + -6.02371, -2.1478, 10.5523, -2.54502, 10.5789, -2.96984, 10.0714, 3.22451, 1.55252, + -6.25396, -7.73727, -7.85431, -6.09303, -8.11658, -8.20057, -7.55965, -6.64786, 4.936, + 2.23423, 4.44752, 2.27472, -5.72103, -7.70079, -0.929985, 9.78172, -3.10984, 8.72259, + -2.44167, 7.58954, -2.18511, 8.6292, 5.55528, 2.30192, 4.73164, -0.0143992, -8.2573, + -7.81793, -2.98837, 8.82863, 4.60517, 0.804492, -3.83738, 9.21115, -2.62485, 8.71318, + 3.57758, 2.44676, -8.48711, -6.69548, -6.70645, -6.49479, -6.86663, -5.42658, 3.83139, + 1.47141, 2.02013, 2.79507, 4.64499, 1.73858, -1.69667, 10.3705, -6.61974, -6.09829, + -6.05757, -4.98332, -7.10309, -6.16611, -3.52203, 9.32853, -2.26724, 7.10101, 6.11777, + 1.4549, -4.23412, 8.452, -6.58655, -7.59446, 3.93783, 1.64551, -7.12502, -7.63385, + 2.72111, 1.94666, -7.14428, -4.15994, -6.66553, -8.12585, 4.70011, 4.43641, -7.76914, + -7.69592, 4.11012, 2.48644, 4.89743, 1.89872, 4.29716, 1.17089, -6.62913, -6.53366, + -8.07093, -6.22356, -2.16558, 7.25125, 4.73953, 1.46969, -5.91625, -6.46733, 5.43091, + 1.06378, -6.82142, -8.02308, 6.52606, 2.14775, 3.08922, 2.04173, -2.14756, 8.36917, + 3.85663, 1.65111, -1.68665, 7.79344, -5.01385, -6.40628, -2.52269, 7.95658, -2.30033, + 7.05462, -1.04355, 8.78851, 3.72045, 3.5231, -3.98772, 8.29444, 4.24777, 0.509655, + 4.72693, 1.67416, 5.7827, 2.7251, -3.41722, 7.60198, 5.22674, 4.16363, -3.1109, + 10.8666, -3.18612, 9.62596, 
-1.4782, 9.94557, 4.47859, 2.37722, -5.79658, -5.82631, + -3.34842, 8.70507}, + {0.978428, 1.01917, 0.608673, 1.45629, 0.310713, 0.689461, 0.701126, 0.63296, 0.774788, + 0.701648, 0.513282, 0.757651, 0.45638, 0.973111, 0.901396, 0.613692, 0.482497, 0.688143, + 0.72428, 0.666345, 0.58232, 0.554756, 0.710315, 0.903611, 0.694115, 0.796099, 0.639759, + 0.798998, 0.639839, 1.30727, 0.663729, 0.57476, 0.571348, 1.14662, 1.26518, 0.485068, + 0.78207, 0.791621, 1.01678, 1.28509, 1.14715, 0.381395, 0.850507, 0.788511, 0.588341, + 0.878516, 0.928669, 0.405874, 0.776421, 0.612274, 1.84963, 0.57476, 0.95226, 0.488078, + 1.24868, 0.515136, 0.589378, 0.903632, 1.01678, 1.09964, 0.666345, 0.713265, 0.877168, + 1.10053, 1.96887, 1.03574, 2.03728, 0.969553, 0.774788, 0.586338, 0.65168, 0.435472, + 0.664396, 0.790584, 0.678637, 0.715964, 0.865494, 0.978428, 1.59242, 0.861109, 0.833259, + 0.65168, 0.903632, 1.49599, 0.76347, 0.960453, 1.1848, 1.37398, 0.928957, 1.07848, + 0.661798, 1.21104, 1.04579, 1.89047, 1.24288, 0.529553, 0.903611, 0.620897, 0.882467, + 0.647189}, + {0, 1, 2, 1, 0, 1, 2, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 2, + 2, 1, 0, 0, 0, 0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 0, 2, 1, 2, 2, 1, 0, 0, 0, 1, + 1, 1, 2, 0, 0, 0, 2, 2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 2, 1, + 0, 1, 0, 1, 1, 2, 1, 2, 0, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 1, 0, 2}, + {50, 54, 57, 63, 82, 87}, + {57, 63, 50, 54, 87, 82}, + {6.0764, 11.1843, 6.0764, 11.1843, 6.89004, 6.89004}}, + {1000, + 2, + {-6.59634, -7.13901, -6.13753, -6.58082, 5.19821, 2.04918, -2.96856, 8.16444, + -2.76879, 7.51114, -6.82261, -6.61152, 5.02008, 2.58376, 5.55621, 2.31966, + 4.86379, 3.33731, 5.84639, 1.15623, -2.17159, 8.60241, -4.97844, -6.94077, + -2.31014, 8.41407, 5.5582, 0.402669, 5.25265, 0.919754, 5.85298, 2.11489, + -3.29245, 8.69222, -1.9621, 8.81209, -1.53408, 8.86723, -2.18227, 8.79519, + 4.60519, 2.20738, -6.4759, -6.9043, -7.18766, -6.10045, -9.00148, -7.48793, + 4.01674, 1.41769, -2.45347, 
10.1085, -3.20892, 9.22827, -3.18612, 9.62596, + 4.81977, 3.36517, 4.90693, 2.8628, -6.44269, -5.68946, -8.30144, -5.37878, + 4.61485, 2.79094, -1.98726, 9.31127, -3.66019, 9.38998, -6.58607, -8.23669, + -7.46015, -6.29153, 4.08468, 3.85433, -6.36842, -5.50645, -6.83602, -5.18506, + -0.627173, 10.3597, 3.98846, 1.48928, -2.9968, 8.58173, -7.2144, -7.28376, + -0.660242, 10.1409, -4.23528, -8.38308, -3.15984, 8.52716, -2.40987, 9.76567, + -8.7548, -6.76508, 4.56971, 0.312209, -7.5487, -5.8402, -1.6096, 9.32159, + 5.04813, 0.270586, -7.6525, -6.47306, -1.79758, 7.88964, -9.0153, -3.74236, + -3.5715, 9.48788, -1.65154, 8.85435, -3.47412, 9.70034, 6.31245, 2.39219, + 4.03851, 2.29295, -3.17098, 9.86672, -6.90693, -7.81338, -6.22373, -6.68537, + -3.22204, 9.12072, -0.365254, 9.6482, -7.76712, -7.31757, 4.15669, 3.54716, + 4.1937, 0.083629, -3.03896, 9.52755, -6.29293, -7.35501, -2.95926, 9.63714, + 4.02709, 1.58547, 4.56828, 1.93595, 5.6242, 1.75918, -7.36237, -7.83344, + 5.32177, 3.81988, -2.43183, 8.153, -1.97939, 10.4559, -3.49492, 9.51833, + 3.39602, 1.28026, -2.42215, 8.71528, -3.57682, 8.87191, -2.77385, 11.7345, + 5.71351, 0.946654, -6.50253, -6.90937, 4.08239, 0.603367, -5.64134, -6.85884, + -2.76177, 7.7665, -2.25165, 8.93984, -3.49071, 9.47639, -1.06792, 7.57842, + 5.15754, 1.24743, 3.63574, 1.20537, -6.07969, -8.49642, 4.12227, 2.19696, + -7.17144, -8.4433, -1.92234, 11.2047, 3.23237, 1.19535, 3.85389, 0.641937, + 4.82665, 1.21779, -7.68923, -6.45605, -7.00816, -8.76196, -5.12894, 9.83619, + -5.66247, -5.35879, 3.05598, 2.73358, 6.06038, 1.40242, -1.69568, 7.78342, + 5.13391, 2.23384, -2.96984, 10.0714, -5.36618, -6.2493, 5.55896, 1.6829, + 3.55882, 2.58911, 5.36155, 0.844118, -0.0634456, 9.14351, 4.88368, 1.40909, + -7.04675, -6.59753, -7.78333, -6.55575, 5.39881, 2.25436, -2.85189, 8.64285, + -2.22821, 8.39159, 3.88591, 1.69249, -7.55481, -7.02463, 4.60032, 2.65467, + -6.90615, -7.76198, -6.76005, -7.85318, 4.15044, 3.01733, -7.18884, -7.63227, + 4.68874, 
2.01376, 3.51716, 2.35558, -3.81367, 9.68396, 4.42644, 3.4639, + 4.81758, 0.637825, -6.20705, -4.98023, -1.68603, 9.0876, -4.99504, -5.33687, + -1.77073, 9.18565, 4.86433, 3.02027, 4.20538, 1.664, 4.59042, 2.64799, + -3.09856, 9.86389, -3.02306, 7.95507, -6.32402, -6.79053, -7.67205, -7.18807, + -8.10918, -6.38341, -1.67979, 6.80315, 4.00249, 3.16219, -2.54391, 7.84561, + -3.22764, 8.80084, -2.63712, 8.05875, -2.41744, 7.02672, -6.71117, -5.56251, + 5.18348, 1.60256, -7.40824, -6.29375, -4.22233, 10.3682, 4.8509, 1.87646, + -2.99456, 9.09616, 5.1332, 2.15801, -2.27358, 9.78515, -6.73874, -8.64855, + 4.96124, 2.39509, -3.70949, 8.67978, -4.13674, 9.06237, 2.80367, 2.48116, + -0.876786, 7.58414, -3.7005, 9.67084, 6.48652, 0.903085, 6.28189, 2.98299, + -6.07922, -6.12582, -5.67921, -7.537, 4.55014, 3.41329, -1.63688, 9.19763, + -4.02439, 10.3812, 5.23053, 3.08187, -2.2951, 7.76855, -6.24491, -5.77041, + 6.02415, 2.53708, -6.91286, -7.08823, 4.83193, 1.66405, -7.07454, -5.74634, + -2.09576, 10.8911, 3.29543, 1.05452, -3.49973, 8.44799, 5.2922, 0.396778, + -2.54502, 10.5789, -6.38865, -6.14523, -1.75221, 8.09212, -9.30387, -5.99606, + -2.98113, 10.1032, -6.2017, -7.36802, 4.63628, 0.814805, -1.81905, 8.61307, + 4.88926, 3.55062, 3.08325, 2.57918, -2.51717, 10.4942, -5.75358, -6.9315, + 6.36742, 2.40949, 5.74806, 0.933264, 4.74408, 1.91058, -7.41496, -6.97064, + -2.98414, 8.36096, 6.72825, 1.83358, -2.95349, 9.39159, -3.35599, 7.49944, + 6.18738, 3.76905, -3.17182, 9.58488, 5.17863, 1.0525, -3.0397, 8.43847, + -2.23874, 8.96405, 3.04689, 2.41364, 6.14064, 2.82339, -6.33334, -6.87369, + -7.92444, -8.84647, 3.65129, 0.86958, 5.29842, 3.98337, -2.06538, 9.78892, + -6.89494, -6.30082, -2.52144, 8.11703, -8.11398, -7.47257, 5.3381, 2.36666, + -6.93452, -6.59456, -7.50634, -6.01772, 6.23438, 1.12621, -2.15218, 8.32138, + -7.04777, -7.3522, -2.52771, 8.72563, -2.77907, 8.03552, 4.29123, 1.62391, + -8.07551, -6.43551, -3.28202, 8.77747, -2.21308, 9.27534, -8.25153, -8.49367, + 
-3.54644, 8.82395, -8.05867, -5.69243, 4.46681, 1.98875, 3.8362, 3.61229, + -6.96231, -7.00186, 5.18993, 1.00483, -5.35116, -6.37227, 5.23298, 1.66362, + -5.68306, -7.03864, -9.03144, -7.59926, -6.10127, -7.4313, 4.83572, 0.994797, + -7.32695, -5.59909, 0.569683, 10.1339, 3.35957, 2.84563, -2.4122, 9.60944, + 5.00855, 1.57983, -2.57528, 7.80327, 3.96349, 3.77411, 4.59429, 2.21651, + -6.54765, -6.68961, 4.76798, 1.29212, -1.67351, 7.88458, 5.63615, 1.47941, + -2.5301, 9.13161, 4.26075, 1.76959, 4.67788, 2.0932, 4.39955, 1.59835, + 3.91274, 1.72565, -4.1786, 9.55765, -7.34566, -8.47481, 4.8364, 2.68217, + -7.36848, -7.99973, -5.84708, -5.7534, 5.37252, 1.89245, -2.1707, 8.599, + -1.3299, 9.0818, -6.79122, -5.40258, 5.56391, 1.78827, -0.194539, 7.14702, + 4.60489, 3.74397, 5.50995, 2.46885, -3.98772, 8.29444, -5.21837, -7.33721, + -1.63959, 10.3699, -5.92932, -5.1695, -5.88358, -7.6369, 4.11716, 3.02218, + -6.54114, -7.17551, 3.97179, 2.96521, -6.75325, -4.94118, 5.26169, 0.402945, + 3.25031, 0.327771, -0.44845, 10.7696, -2.15141, 9.57507, 7.04329, 1.91555, + -3.74615, 7.69383, -7.52318, -5.85015, -6.80419, -8.48208, -4.57664, 8.92517, + 4.57574, 2.30193, 4.84098, 3.02382, -9.43355, -5.94579, -3.52203, 9.32853, + 3.43018, 2.5731, -6.15725, -7.25294, -6.69861, -8.17694, -2.40955, 8.51081, + -4.82342, -7.98332, -7.10611, -6.51274, 5.86755, 0.763529, -6.56045, -5.53966, + -3.61553, 7.81808, 4.3825, 0.304586, -6.52818, -5.80996, 4.59972, 0.542395, + -6.90603, -6.59995, -6.3585, -6.23489, -6.01915, -7.46319, -5.38694, -7.15123, + -7.83475, -6.45651, 5.89564, 1.07856, -5.15266, -7.27975, -6.97978, -7.08378, + 5.83493, 0.449983, -2.62374, 10.2521, -7.34494, -6.98606, -6.79719, -8.33766, + 3.54757, 1.65676, -8.40528, -5.61753, -5.85556, -6.28758, 4.66862, 3.25162, + -6.26047, -4.82261, 4.61552, 4.11544, -1.36637, 9.76622, 4.2517, 2.14359, + -2.45099, 7.87132, -0.376164, 7.0622, 4.34493, 3.22091, 6.95921, 2.36649, + -6.70319, -7.24714, -5.56932, -5.48443, -7.43149, -4.32191, 
-3.23956, 9.23074, + -5.77255, -7.00049, 4.96601, 0.722056, -7.88617, -5.74023, 4.18757, -0.45071, + -7.12569, -7.72336, 5.27366, 2.38697, 3.93487, 1.9174, 3.19186, -0.225636, + -3.41722, 7.60198, -3.08286, 8.46743, -5.87905, -7.55073, -5.26425, -7.20243, + -2.97867, 9.55685, -1.23153, 8.42272, -2.33602, 9.3996, -3.33819, 8.45411, + -3.58009, 9.49676, 3.78152, 2.67348, -1.54582, 9.42707, -4.04331, 10.292, + 3.3452, 3.134, -2.75494, 8.74156, -3.26555, 7.59203, -7.27139, -7.80252, + 3.5293, 3.72544, 6.11642, 3.35326, 4.01611, 3.8872, 4.89591, 2.95586, + -7.06677, -5.89438, 4.19438, 3.42655, -6.11355, -5.65318, -7.59645, -8.74665, + -5.80362, -6.8588, 3.80453, 4.11832, 5.70655, 3.14247, -4.98084, 8.21739, + -1.87642, 11.285, 4.39864, 2.32523, -3.48388, 9.80137, 4.02836, 0.566509, + -2.41212, 9.98293, -5.40846, -7.08943, 4.01506, 1.99926, -3.43613, 8.95476, + -7.24458, -7.71932, 6.02204, 2.62188, -6.29999, -6.55431, 6.19038, 0.974816, + 3.55882, 3.02632, -7.06011, -3.687, -1.55877, 8.43738, -5.14711, -4.64881, + 4.7167, 0.690177, -7.90381, -5.02602, 4.17218, 2.31967, -0.643423, 9.48812, + -7.95237, -6.64086, -4.05986, 9.08285, -6.24158, -6.37927, -6.6105, -7.2233, + -6.21675, -5.70664, -3.29967, 9.48575, 3.41775, 2.68617, -2.24948, 8.10997, + -2.24931, 9.79611, -9.0523, -6.03269, -2.2587, 9.36073, 5.20965, 2.42088, + -3.10159, 8.1503, -6.67906, -5.73147, 4.0687, 2.54575, -1.24229, 8.30662, + -2.09627, 8.45056, -7.87801, -6.57832, 4.72216, 3.03865, -0.929985, 9.78172, + -8.56307, -7.68598, -7.05257, -5.1684, -7.09076, -7.86729, 4.61432, 3.1459, + -6.34133, -5.8076, -3.82943, 10.8457, -8.46082, -5.98507, 5.34763, 1.4107, + -1.68714, 10.9111, -1.67886, 8.1582, -0.623012, 9.18886, -4.21258, 8.95874, + -2.16744, 10.8905, -6.57158, -7.27176, 2.14047, 4.26411, -8.44217, -7.40916, + 5.29008, 1.87399, 4.31824, 4.04992, -3.77008, 9.93215, -2.72688, 10.1131, + -6.14278, -7.16144, -3.92457, 8.59364, -5.92649, -6.59299, 4.68369, 1.82617, + -6.89905, -7.18329, 3.95173, 4.22561, 
-7.66453, -6.23183, -2.44167, 7.58954, + -6.36603, -7.41281, -6.45081, -6.187, -6.6125, -6.37138, 5.46036, 2.48044, + -2.14756, 8.36917, -2.3889, 9.52872, 3.80752, 2.44459, -3.98778, 10.158, + -6.63887, -4.27843, -8.65266, -5.61819, -7.97003, -5.46918, -5.9604, -7.54825, + -0.916011, 8.50307, -3.69246, 6.97505, -7.98533, -7.09503, -2.30033, 7.05462, + 4.76218, 2.51647, -7.04981, -7.33334, 3.66401, 3.02681, -2.50408, 8.7797, + 7.19996, 1.87711, 4.01291, 3.78562, -0.356015, 8.24694, -0.958046, 9.12996, + 4.60675, 3.76773, 6.21945, 1.45031, 4.27744, 0.8535, -4.72232, -7.48582, + 6.03923, 2.8978, -3.26833, 9.16468, -7.97059, -7.29092, -2.3998, 9.74005, + -2.66721, 8.58741, -7.36269, -6.73332, -7.87893, -7.38488, 4.65023, 0.661333, + -4.8171, -7.94764, -4.11564, 9.21775, 4.80633, 2.46562, -2.72887, 9.3714, + -5.26735, -5.5652, 4.9826, 2.42992, -6.17018, -7.3156, 4.38084, 1.77682, + 5.35084, 2.41743, -2.61796, 9.416, 5.27229, 2.94572, -7.52315, -5.95227, + -1.45077, 7.25555, -3.79916, 7.71921, -2.23251, 9.84147, 3.70054, 1.82908, + -1.93831, 10.1499, -6.18324, -5.9248, -3.33142, 9.25797, -6.08536, -8.1344, + 5.95727, 2.17077, 4.87366, 0.417274, -6.529, -6.39092, -9.24256, -7.88984, + -6.36652, -7.13966, -3.90777, 9.57726, -7.06252, -5.50523, -2.26423, 8.50734, + -2.84498, 10.6833, 5.0391, 2.62037, -2.74815, 8.10672, 3.35945, 3.72796, + -4.11668, 9.19892, 5.66903, 2.44577, -1.63807, 8.68826, -7.42587, -6.48831, + 6.17063, 3.19193, -2.28511, 9.02688, -7.10088, -7.15692, 4.46293, 1.17487, + -5.91017, -6.45292, -2.26724, 7.10101, -2.43339, 8.33712, -4.63309, 8.48853, + -3.31769, 8.51253, -2.49078, 10.6907, -1.30798, 8.60621, 6.30535, 2.98754, + -5.79384, -6.78213, -1.93213, 8.81124, 4.55773, 3.09047, 6.37584, 2.17108, + 4.3927, 1.29119, -3.2245, 9.69388, -1.69634, 9.64392, 2.799, 0.693593, + -2.1426, 8.07441, -8.4505, -8.00688, 4.736, 1.51089, -2.5863, 9.35544, + -2.94924, 9.14503, 6.2054, 1.90742, 5.67172, 0.487609, -5.69071, -6.17181, + -8.24651, -7.10488, -7.34424, 
-6.67895, -6.71977, -7.90778, -1.82294, 7.40157, + -9.40991, -7.16611, -4.37999, 8.66277, -1.42615, 10.0681, -2.00828, 8.03673, + -7.50228, -6.6855, -5.65859, -6.29801, -8.02335, -6.77155, -3.40761, 9.50621, + -2.82447, 9.77326, -1.5938, 9.34304, -3.5213, 7.35943, -3.36961, 8.62973, + -7.01708, -5.92724, 5.20886, 3.60157, -1.71817, 8.1049, -2.46363, 8.36269, + -2.77809, 7.90776, -2.75459, 8.26055, -2.03596, 8.94146, -4.53434, 9.20074, + -7.44387, -6.69556, -6.90099, -7.62732, 3.29169, 2.71643, 6.08686, 2.16972, + -2.31111, 8.86993, -5.75046, 7.9899, 4.69951, 1.32623, 4.71851, -0.025031, + -6.42374, -4.71511, -8.04974, -8.68209, -3.16103, 9.06168, -6.18267, -7.21393, + -7.94202, -6.4518, -7.07697, -7.03138, 3.93554, 0.564708, -1.20372, 9.03529, + -7.10611, -7.83955, -7.47529, -5.50567, -6.15453, -6.36393, -2.98024, 9.24634, + -7.75761, -7.70699, -3.08597, 9.76968, -8.04954, -9.75237, 5.2534, 0.950377, + 5.63789, -0.923086, -5.7065, -6.51047, -8.02132, -7.07377, -8.28594, -6.96322, + -7.70722, -6.79397, -2.4962, 10.4678, 5.02846, 4.46617, 4.02648, 1.6707, + -0.319395, 8.20599, 4.74525, 0.639144, -1.0313, 8.49602, 4.08766, 2.6061, + 3.63826, 1.69207, 2.55795, 3.66963, 5.2826, 3.30232, -1.04355, 8.78851, + -6.84762, -7.63353, -4.70868, -7.056, 3.53651, -0.179721, -3.38482, 7.63149, + -5.9265, -6.36702, -0.986074, 9.5532, -2.42261, 8.85861, -7.42835, -6.78726, + -4.02857, 8.53005, -8.22675, -7.85172, -5.57529, -8.5426, 6.03009, 2.53098, + -7.10448, -7.53011, -3.4988, 8.8885, -2.62485, 8.71318, -6.39489, -7.72647, + 3.93789, 1.31027, 4.27627, 1.91622, -0.923181, 7.77647, -5.16017, 10.1058, + -6.44307, -5.97617, -7.24495, -6.69543, 6.27331, 0.826824, -6.55655, -7.13246, + 5.66245, 4.41292, -2.13805, 8.4103, 5.23463, 2.82659, -4.86624, -6.74357, + -6.14082, -6.26474, -2.67048, 9.41834, -1.26311, 6.9409, -7.20231, -7.13094, + -1.35109, 9.80595, 3.9906, 0.749229, -6.75696, -5.25543, 4.84826, -0.0685652, + -7.4914, -6.91715, 4.46725, 2.85683, -2.95571, 9.87068, 6.32381, 
1.51429, + -6.81177, -6.02734, -2.57188, 9.96943, -4.28792, 10.5103, 3.65025, 2.91394, + -7.11856, -7.24693, -6.98693, -6.43239, 4.7651, 1.54376, 4.00092, 0.65008, + -7.14816, -7.7713, -7.58803, -8.39382, 4.3321, 2.19232, -7.89545, -6.81843, + -2.11475, 8.5933, -0.743743, 9.41927, 3.64849, -0.18022, -1.68665, 7.79344, + 4.00214, 1.44217, -6.96799, -7.25012, -1.58302, 10.9237, -6.68524, -7.23328, + 4.65831, 2.32075, 4.62024, 2.52566, -4.23412, 8.452, -0.822056, 9.89593, + -7.19868, -7.67614, -3.32742, 11.1067, 5.27861, 0.830165, 4.48982, 2.09875, + -6.58087, -7.6319, -0.880582, 7.63418, -7.01088, -6.80326, -7.31601, -6.98972, + -6.85883, -7.60811, 6.14328, 2.85053, -7.49206, -6.51861, -2.28174, 10.3214, + 4.81074, 1.78919, -5.58987, -6.20693, 4.08096, 2.35038, -1.5029, 8.43739, + 4.11536, 2.46254, -3.28299, 7.76963, 4.31953, 2.39734, 4.91146, 0.696421, + -1.4782, 9.94557, -3.34842, 8.70507, -6.97822, -6.86126, 4.10012, 1.19486, + -2.50395, 9.06127, 4.41891, 2.00006, -2.73266, 9.72829, 3.5436, 0.533119, + 5.78864, 0.233456, -6.62589, -6.41242, -2.21942, 11.0897, -6.76636, -8.31839, + -2.71732, 8.52129, -5.20972, -6.48544, 3.26056, 1.24224, 3.45228, 2.28299, + 4.72171, 1.87428, -7.52585, -5.1048, 5.0695, 2.18086, -6.55646, -7.02771, + 3.23727, 3.72275, 3.41411, 0.508795, -7.80698, -6.64174, -5.90443, -6.37902, + -0.387041, 10.0468, -1.3506, 8.1936, -6.08614, -8.62864, -5.91478, -5.26453, + -2.61623, 7.97904, 4.45459, 1.84335, -6.66643, -7.63208, 3.6729, 1.92546, + -1.32976, 8.54511, 6.31758, 1.41958, 4.63381, 2.81166, -7.01394, -6.0693, + -2.7786, 9.73183, -2.90131, 7.55077, -7.13842, -5.28146, 6.71514, 1.28398, + -6.98408, -7.04893, -3.03946, 8.22141, -2.76417, 10.5183, -7.35347, -6.89456, + 4.19345, 2.16726, -2.02819, 9.23817, 4.97076, 2.8067, -0.544473, 9.04955, + 4.90727, 2.29487, -6.31871, -7.17559, 3.71665, 0.621485, 4.7903, 2.33813, + -6.47994, -7.53147, -6.80958, -5.71823, -8.07326, -5.96096, 4.77342, 1.8207, + 5.71856, 1.93466, -2.70156, 9.31583, -2.1478, 
10.5523, 4.78855, 1.63608, + 5.53507, 2.60834, -7.00058, -6.46058, 5.4738, 2.43235, -1.34603, 9.02452, + -7.5337, -8.71074, -7.30893, -7.57253, -5.33752, -4.87402, -7.01364, -6.86542, + -7.93331, -7.94791, -5.69392, -6.16116, -7.32291, -7.76491, -6.41965, -7.55783, + -7.87996, -7.55785, -6.69005, -5.87906, 3.92147, 2.86809, -1.5552, 9.66568, + 5.07989, 1.47112, -7.48524, -5.0541, -1.82724, 8.70402, -2.00421, 9.88004, + -2.62153, 8.79332, -7.52111, -6.44819, 4.06424, 2.09518, -6.65494, -5.94752, + 6.93878, 1.61033, -3.95728, 7.60682, 5.67016, 2.21196, -7.81507, -5.79413, + -2.41152, 8.24128, -3.83738, 9.21115, 4.5516, 4.55288, -5.75551, -5.93258, + 4.56545, 2.59384, -7.45614, -9.47115, -2.39568, 9.67642, 5.57816, 1.45712, + -7.48184, -6.41134, -1.99415, 12.867, -8.35854, -6.69675, -7.52559, -7.6793, + 5.7454, 3.1602, 2.94692, 1.87483, -8.77324, -6.66682, -3.21125, 8.68662, + -6.25806, -7.24972, 5.17639, 1.0747, -2.44897, 11.4775, -3.30172, 8.89955, + -2.85191, 8.21201, -8.85893, -6.1322, 4.08957, 1.30155, -5.88132, -7.31173, + -7.10309, -7.22943, -2.46068, 8.18334, -7.01226, -7.85464, 4.75411, 2.12347, + -3.42862, 10.5642, 7.16681, 1.4423, 5.42568, 2.39863, -6.00833, -8.22609, + -1.7619, 9.62466, -2.49527, 8.99016, -2.98837, 8.82863, -2.97262, 8.54856, + -1.34142, 9.26871, -5.99652, -6.95795, -1.87061, 7.35277, -8.68277, -8.46425, + -7.01808, -8.10441, -7.04269, -7.62501, -7.69783, -6.88348, -2.19829, 10.4896, + 4.67396, 1.2032, -5.58263, -6.90298, -5.69224, -4.29055, 4.77285, 1.27305, + -3.33469, 8.6929, -2.54195, 8.47086, 4.46492, 1.21742, 5.41158, -0.875373, + -8.68069, -7.42278, -3.88687, 8.07646, 4.6682, 2.00293, -8.29799, -8.64092, + -1.86382, 10.3829, -6.51234, -5.04193, 4.54458, 2.25219, -1.93264, 9.32554, + -3.06285, 7.81641, -6.90714, -5.10786, 4.69653, 2.50286, 6.43757, 2.61401, + -1.85483, 8.9587, 4.60224, 3.07647, 4.4492, 2.1906, 5.02181, 2.40321, + -2.22923, 7.8888, 5.68943, 1.43793, -6.71097, -6.43817, -5.00633, -5.80006, + -2.43763, 8.53663, 5.72577, 
2.44787, -6.57079, -5.17789, -5.77867, -4.92176, + -6.57222, -6.06437, 3.96639, 2.25216, -7.95177, -9.80146, 4.92574, 2.30763, + -7.6221, -8.20013, -6.4132, -6.91575, 4.01432, 2.36897, 3.0833, 1.54505, + -1.99416, 9.52807, -7.85128, -8.25973, -0.86423, 8.76525, -6.31412, -8.64087, + -8.07355, -6.73717, -2.52821, 8.01176, -5.82357, -6.65687, -7.08865, -7.73063, + -5.56251, -6.99818, -2.12513, 8.98159, -6.89834, -7.26863, -7.92654, -6.34346, + 4.86201, 1.49442, 4.92905, 4.42847, -5.57789, -5.3186, 4.34232, 3.34888, + 2.64614, 2.34723, -4.10363, 8.41491, -2.18648, 8.18706, -3.39871, 8.19848, + -2.66098, 9.6026, -6.95927, -6.42774, -5.61392, -7.74628, 5.60376, 4.18369, + 5.28536, 4.13642, 4.8428, 0.457426, -6.33816, -6.12095, -2.4394, 8.62897, + 4.56938, 2.45967, 4.0582, 0.958413, 5.62164, 1.64834, 5.73119, 2.58231, + 4.66806, 1.96405, -6.71905, -6.87706, -2.18503, 8.88414, -6.03901, -6.33338, + -8.38435, -6.12005, 0.0641622, 9.0735, 5.19967, 3.05395, -5.48716, -7.13016, + -6.85541, -5.46789, -1.88353, 8.15713, 4.27891, 3.1325, -2.75816, 9.98586, + -2.03022, 9.34795, -7.66741, -7.50096, -3.39305, 9.16801, -8.49476, -5.71537, + -1.68378, 9.8278, -7.41559, -6.07205, -3.15577, 7.93274, 5.22381, 1.61388, + 3.65739, 1.74854, 4.94251, 1.21889, -7.12832, -5.27276, -9.58286, -6.20223, + -2.21613, 8.29993, 5.34799, 2.92987, 4.09496, 2.37231, -7.25183, -5.79136, + -6.46981, -7.12137, -6.28607, -9.8205, 4.52865, 1.06926, -3.10984, 8.72259, + 3.61865, 2.68153, -5.96604, -7.68329, 3.11435, 1.28126, -1.1064, 7.61243, + -2.17688, 8.2658, -3.27246, 7.2094, -5.55143, -6.32388, -1.69667, 10.3705, + -2.16558, 7.25125, -6.36572, -6.70053, 4.12259, 3.38252, -4.80554, -7.79949, + -5.23966, -6.13798, 4.21969, 1.69139, -1.98985, 10.547, -2.52269, 7.95658, + -6.75642, -6.32862, -3.51521, 7.8001, 4.70435, -0.00229688, 6.25359, 2.4267, + 5.82935, 0.745562, 5.24778, 2.15978, 5.48052, 1.32055, -3.05358, 9.12521, + -3.18922, 9.24654, 4.47276, 2.11988, 5.36751, 2.02512, -2.18511, 8.6292, + -2.48469, 
9.51228, 5.57556, 3.24472, -2.58121, 10.0178, -6.12629, -6.49895, + -4.54732, 8.0062, -4.20166, 10.5438, -7.61422, -7.69036, -4.42797, 8.98777, + 4.45301, 1.53344, 4.59296, 2.45021, -6.81264, -6.36417, 4.62346, 3.16156, + -5.93007, -8.36501, -2.78425, 6.71237, -6.17141, -6.64689, -5.20608, 8.95999, + -7.30598, -5.73166, 4.39572, 2.93726, -1.89503, 9.77179, -5.683, -7.48989, + 4.80924, 0.559455, -2.17793, 9.98983, 5.23728, 2.67434, -7.03976, -6.20877, + 3.90435, 3.20926, -7.78536, -7.53388, -1.00684, 9.08838, -5.26741, -5.98327, + 3.28002, 2.71942, -1.47166, 8.50427, -2.32733, 9.26251, 5.16271, 1.39947, + -6.59093, -6.61979, -2.44492, 7.93654, -1.05805, 9.97356, -3.1109, 10.8666, + 3.38834, 3.41693, 4.83098, 2.01961, -2.74013, 9.71049, -3.34892, 8.41489, + 4.94768, 0.263001, 3.57477, 1.66795, 5.78915, 1.26999, -4.81812, -5.67174, + -1.88508, 9.64263, 3.69048, 4.60555, 4.03037, 1.7862, -7.4418, -7.08933}, + {0.127717, 0.211407, 0.195547, 0.21633, 0.39671, 0.229008, 0.20839, 0.169236, 0.314314, + 0.322473, 0.169506, 0.45499, 0.147819, 0.296502, 0.15198, 0.356444, 0.0992833, 0.220833, + 0.296206, 0.178067, 0.135359, 0.189725, 0.243099, 0.519986, 0.168105, 0.273465, 0.126033, + 0.18045, 0.282832, 0.193901, 0.213704, 0.425046, 0.203191, 0.228674, 0.209267, 0.355039, + 0.212918, 0.315495, 0.294112, 0.257576, 0.5786, 0.186019, 0.171919, 0.171919, 0.449151, + 1.34947, 0.171919, 0.16341, 0.641387, 0.342115, 0.267343, 0.246125, 0.277612, 0.181462, + 0.22944, 1.95598, 0.164897, 0.235803, 0.228273, 0.314629, 0.127403, 0.241241, 0.189362, + 0.151691, 0.130085, 0.526707, 0.217069, 0.282306, 0.531523, 0.177035, 0.169776, 0.20395, + 0.177165, 0.146628, 0.280013, 0.223033, 0.50947, 0.184133, 0.295329, 0.183219, 0.28166, + 0.179348, 0.276462, 1.00283, 0.248147, 0.214453, 0.231732, 0.170672, 0.256893, 0.133271, + 0.151137, 0.500823, 0.23678, 0.376983, 0.362061, 0.140013, 0.388863, 0.398552, 0.38015, + 0.190081, 0.167115, 0.206884, 0.473849, 1.05117, 0.435665, 0.323618, 0.326201, 
0.32226, + 0.201787, 0.246496, 0.28325, 0.226596, 0.238153, 0.277268, 0.674629, 0.179433, 0.175651, + 0.154778, 0.178195, 0.192796, 0.103571, 0.227621, 0.201124, 0.160525, 0.160964, 0.240099, + 0.258027, 0.134127, 0.127717, 0.341378, 0.311595, 0.282306, 0.168988, 0.40775, 0.246125, + 0.583131, 0.236804, 0.238633, 0.194824, 0.169315, 0.244227, 0.249511, 0.189725, 0.305662, + 0.301415, 0.658641, 0.250944, 0.151792, 0.141383, 0.143843, 0.563347, 0.184216, 0.204155, + 0.221764, 0.314908, 0.144518, 0.228808, 0.255785, 0.163457, 0.424705, 0.170202, 0.312598, + 0.300629, 0.532614, 0.661392, 0.228273, 0.543432, 0.257175, 0.258994, 0.281413, 0.273897, + 0.246837, 0.293489, 0.25533, 0.260492, 0.213704, 0.3091, 0.17103, 0.172285, 0.241399, + 0.35999, 0.372243, 0.269191, 0.390239, 0.31761, 0.200593, 0.22197, 0.752914, 0.266571, + 0.13102, 0.268659, 0.293723, 0.356294, 0.296258, 0.264531, 0.15468, 0.358535, 0.243711, + 0.112147, 0.121659, 0.197101, 0.515292, 0.245628, 0.279863, 0.789807, 0.195156, 0.196073, + 0.149564, 0.118675, 0.389373, 0.233821, 0.176128, 0.481088, 0.360027, 0.553152, 0.208207, + 0.171608, 0.160489, 0.334298, 0.139426, 0.168603, 0.266199, 0.326458, 0.103571, 0.171208, + 0.130961, 0.190887, 0.177229, 0.241651, 0.115152, 0.196753, 0.481088, 0.230965, 0.354631, + 0.14591, 0.328543, 0.141544, 0.195888, 0.290379, 0.245954, 0.184547, 0.575214, 0.186929, + 0.28527, 0.292213, 1.20033, 0.281528, 0.15625, 0.211524, 0.186398, 0.298061, 0.147393, + 0.245349, 0.164527, 0.224771, 0.222382, 0.251643, 0.148835, 0.135359, 0.204967, 0.193024, + 0.486309, 0.389686, 0.211921, 0.307405, 0.38666, 0.26802, 0.16605, 0.323134, 0.268397, + 0.217894, 0.974118, 0.371618, 0.156201, 0.305787, 0.339305, 0.371032, 0.381765, 0.22747, + 0.24906, 0.100884, 0.253192, 0.314253, 0.388289, 0.580947, 1.00267, 0.241998, 0.489101, + 0.341501, 0.247423, 0.328311, 0.440281, 0.14927, 0.244469, 0.846828, 0.191725, 0.217429, + 0.123403, 0.322875, 0.145373, 0.757259, 0.190086, 0.316286, 0.268397, 
0.296721, 0.440472, + 0.186848, 0.232134, 0.180239, 0.219724, 0.205886, 0.250975, 0.145636, 0.312476, 0.366418, + 0.128135, 0.315235, 0.264531, 0.161815, 0.31631, 0.296489, 0.37171, 0.197217, 0.195625, + 0.479579, 0.443037, 0.323347, 0.193616, 0.160251, 0.8952, 0.256291, 0.593345, 0.177165, + 0.409514, 0.847863, 0.111448, 0.210031, 0.251347, 0.351953, 0.705204, 0.117901, 0.182343, + 0.230179, 0.83632, 0.22104, 0.145163, 0.200326, 0.23431, 0.21868, 0.253575, 0.186562, + 0.192757, 0.172716, 0.27396, 0.258581, 0.327892, 0.376138, 0.223477, 0.302375, 0.145845, + 0.436902, 0.421794, 0.328543, 0.19246, 0.238889, 0.254866, 0.284674, 0.457849, 0.202937, + 0.392568, 0.453083, 0.782713, 0.465401, 0.178623, 0.304863, 0.190081, 0.228641, 0.255135, + 0.245037, 0.217526, 0.109584, 0.276462, 0.182301, 0.38582, 0.349942, 1.3889, 0.30235, + 0.796353, 0.160168, 0.643204, 0.153752, 0.410268, 0.186439, 0.256834, 0.185783, 0.0957629, + 0.226596, 0.197951, 0.17123, 0.192836, 0.18405, 0.575784, 0.228874, 0.201787, 0.241209, + 0.217386, 0.195751, 0.291585, 0.144531, 0.14176, 0.157635, 0.410268, 0.476338, 0.308148, + 0.148077, 0.152093, 0.196791, 0.568087, 0.414026, 0.250587, 0.473463, 0.293645, 0.396768, + 0.2766, 0.38664, 0.135034, 1.50827, 0.472527, 0.268418, 0.40383, 0.375914, 0.246496, + 0.176474, 0.340405, 0.220833, 0.138782, 0.159009, 0.444219, 0.259582, 0.33638, 0.195586, + 0.210974, 0.200288, 0.148129, 0.0974216, 0.211588, 0.280081, 0.44113, 0.773921, 0.553848, + 0.448079, 0.183136, 0.380854, 0.685021, 0.308767, 0.553276, 0.181578, 0.164759, 0.313889, + 0.137886, 0.545387, 0.278449, 0.736895, 0.360054, 0.358929, 0.457315, 0.343278, 0.507662, + 0.280829, 0.113886, 0.23146, 0.160584, 0.192796, 0.147561, 0.241272, 0.168988, 0.730511, + 0.27836, 0.179847, 0.22555, 0.418069, 0.158348, 0.128965, 0.179454, 0.126366, 0.164434, + 0.273633, 0.309556, 0.500823, 0.367852, 0.192875, 0.230262, 0.32724, 0.249969, 0.142618, + 0.494229, 0.36108, 0.227931, 0.23113, 0.742825, 0.190126, 0.33741, 
0.280598, 0.145268, + 0.378423, 0.211921, 0.183594, 0.59201, 0.279563, 0.195683, 0.248101, 0.199754, 0.342494, + 0.174343, 0.14149, 0.28085, 0.175781, 0.518738, 0.17223, 0.489904, 0.181167, 0.354286, + 0.297824, 0.280829, 0.219412, 0.22814, 0.195625, 0.313949, 0.294708, 0.211551, 0.236255, + 0.666933, 0.204808, 0.52591, 0.180725, 0.186889, 0.246589, 0.410575, 0.338348, 0.206219, + 0.361766, 0.158143, 0.280816, 0.4149, 0.773082, 0.340046, 0.369672, 0.256923, 0.167195, + 0.197217, 0.252339, 0.172716, 0.191526, 0.263085, 0.345698, 0.168286, 0.243099, 0.434631, + 0.22944, 0.161862, 0.206589, 0.23457, 0.181924, 0.419063, 0.183427, 0.186152, 0.236352, + 0.306336, 0.149002, 1.50086, 0.188231, 0.442757, 0.485602, 0.466662, 0.17329, 0.141329, + 0.180619, 0.160061, 0.192569, 0.270999, 0.117901, 0.362693, 0.217561, 0.208975, 0.233658, + 0.175173, 1.10307, 0.14625, 1.31124, 0.237608, 0.286784, 0.325112, 0.2485, 0.259641, + 0.553152, 0.179039, 0.780781, 0.174758, 0.297824, 0.2558, 0.235949, 0.952186, 0.356744, + 0.312646, 0.189362, 0.574524, 0.705204, 0.213168, 0.225956, 0.424165, 0.169506, 0.137109, + 0.352451, 0.454554, 0.653302, 0.31261, 0.194412, 0.23719, 0.137886, 0.31498, 0.199085, + 0.203875, 0.597248, 1.10036, 0.196869, 0.22104, 0.451345, 0.105613, 0.683928, 0.135204, + 0.25533, 0.607871, 0.219724, 0.184464, 0.725001, 0.160061, 0.333407, 0.192569, 0.234147, + 0.47178, 0.161815, 0.242455, 0.215305, 0.410575, 0.242376, 0.211335, 0.462804, 0.275065, + 0.126878, 0.170404, 0.179433, 0.147244, 0.109584, 0.352905, 0.158215, 0.197604, 0.172407, + 0.407506, 0.645446, 0.313061, 0.165602, 0.136663, 0.55444, 0.15527, 0.133128, 0.125912, + 0.340405, 0.44521, 0.122783, 0.814526, 0.243773, 0.15743, 0.266743, 0.684458, 0.22221, + 0.181294, 0.193901, 0.258802, 0.167195, 0.292056, 0.132309, 0.227671, 0.117334, 0.271758, + 0.146185, 0.225042, 0.225964, 0.194863, 0.290274, 0.138438, 0.196714, 0.266012, 0.267771, + 0.162544, 0.244258, 0.358038, 0.522617, 0.192875, 0.45066, 0.330396, 
0.223477, 0.42967, + 0.350884, 0.404655, 0.123155, 0.431583, 0.191675, 0.147354, 0.609034, 0.459487, 0.187337, + 0.215128, 0.604169, 0.330165, 0.494229, 0.40775, 0.167377, 0.192648, 0.234635, 0.275578, + 0.253094, 0.420063, 0.228299, 0.206478, 0.20395, 0.377656, 0.317393, 0.478623, 0.159009, + 0.217034, 0.300933, 0.139754, 0.153901, 0.261077, 0.22834, 0.449609, 0.157672, 0.176474, + 0.285704, 0.180186, 0.212738, 0.266428, 0.388313, 0.0954637, 0.298093, 0.251643, 0.330696, + 0.159572, 0.210666, 0.149411, 0.139618, 0.338472, 0.450304, 0.208793, 0.583609, 0.185865, + 0.400576, 0.21626, 0.174867, 0.239144, 0.249113, 0.200402, 0.275065, 0.238793, 0.205784, + 0.4475, 0.231262, 0.259082, 0.20934, 0.16806, 0.193616, 0.213811, 0.395632, 0.482465, + 0.274649, 0.307405, 0.165866, 0.334275, 0.683337, 0.368825, 0.14625, 0.780742, 0.163457, + 0.226596, 0.138713, 1.79155, 0.400443, 0.233658, 0.426399, 0.623024, 0.670955, 0.123588, + 0.110899, 0.173751, 0.651068, 0.199983, 0.190887, 0.541435, 0.21324, 0.266571, 0.134638, + 0.179348, 0.145636, 0.170929, 0.623252, 0.587738, 0.109688, 0.515314, 0.217666, 0.213311, + 0.249144, 0.187947, 0.270999, 0.268311, 0.469782, 0.763609, 0.32124, 0.146315, 0.265223, + 0.298694, 0.197623, 0.21349, 0.845778, 0.175466, 0.123588, 0.17223, 0.258603, 1.17119, + 0.538142, 0.407675, 0.120288, 0.587238, 0.244664, 0.333956, 0.132812, 0.21399, 0.302375, + 0.275882, 0.134284, 0.377555, 0.228541, 0.187307, 0.143804, 0.180545, 0.222451, 0.239638, + 0.188028, 0.46334, 0.175868, 0.242392, 0.314762, 0.44473, 0.21962, 0.175966, 1.12364, + 0.138837, 0.400576, 0.18184, 0.137706, 0.409763, 0.216894, 0.466662, 0.376604, 0.487155, + 0.283143, 0.118547, 0.221591, 0.122783, 0.179007, 0.16628, 0.180999, 0.239845, 0.169607, + 0.578402, 0.396537, 0.222288, 0.563237, 0.371238, 0.138658, 0.324336, 0.191526, 0.168603, + 0.357715, 0.640905, 0.460706, 0.220902, 0.240797, 0.164062, 0.157853, 0.34457, 0.196092, + 0.289353, 0.104597, 0.259641, 0.126878, 0.175781, 0.441458, 
0.820108, 0.261864, 0.23431, + 0.254506, 0.271955, 0.227529, 0.22834, 0.196753, 0.224906, 0.193783, 0.419481, 0.236933, + 0.229706, 0.29785, 0.222947, 0.177606, 0.216911, 0.305188, 0.933438, 0.116666, 0.278483, + 0.0973824, 0.271224, 0.127717, 1.28139, 0.276283, 0.180704, 0.234554, 0.285984, 0.290172, + 0.49594, 0.135879, 0.436784, 0.206219, 0.342215, 0.374165, 0.182217, 0.274864, 0.625, + 0.356925, 0.194324, 0.342215, 0.113012, 0.155123, 0.254207, 0.438919, 0.262548, 0.302299, + 0.179528, 0.312744, 0.168513, 0.142618, 0.150543, 0.231361, 0.166004, 0.186725, 0.38848, + 0.179857, 0.182301, 0.629476, 0.44113, 0.289669, 0.328543, 0.279938, 0.14625, 0.187174, + 0.157635, 0.396749, 0.798931, 0.201541, 0.778619, 0.265883, 0.258027, 0.218576, 0.266571, + 0.160168, 0.230303, 0.273633, 0.233298, 0.30175, 0.217069, 0.345145, 0.397901, 0.224499, + 0.248101, 0.241335, 0.222947, 0.237094, 0.176518, 0.380032, 0.634775, 0.426193, 0.16362, + 0.231097, 0.219898, 0.343789, 0.275578, 0.282022, 0.628542, 0.232184, 0.848367, 0.200754, + 0.179177}, + {0, 0, 2, 3, 3, 0, 2, 2, 2, 2, 3, 0, 3, 2, 2, 2, 3, 3, 3, 3, 2, 0, 0, 0, 2, 3, 3, 3, 2, 2, 0, 0, + 2, 3, 3, 0, 0, 2, 0, 0, 3, 2, 3, 0, 3, 0, 3, 3, 0, 2, 0, 3, 2, 0, 3, 0, 3, 3, 3, 2, 2, 3, 0, 0, + 3, 3, 0, 2, 2, 3, 0, 3, 2, 2, 2, 0, 2, 3, 3, 3, 2, 3, 3, 3, 2, 0, 2, 0, 3, 3, 3, 3, 2, 2, 0, 2, + 0, 3, 2, 2, 2, 0, 0, 3, 0, 2, 2, 3, 2, 3, 0, 2, 2, 2, 3, 2, 0, 0, 2, 3, 3, 2, 0, 2, 0, 0, 2, 0, + 2, 2, 3, 2, 2, 0, 3, 0, 3, 2, 2, 2, 3, 3, 0, 0, 0, 3, 2, 3, 3, 3, 3, 0, 2, 0, 3, 2, 3, 2, 3, 0, + 2, 3, 3, 2, 3, 3, 2, 2, 0, 0, 2, 3, 3, 2, 3, 0, 2, 0, 2, 0, 3, 2, 3, 2, 3, 0, 3, 0, 3, 0, 2, 3, + 2, 2, 3, 0, 2, 2, 2, 0, 3, 2, 3, 3, 2, 3, 2, 3, 3, 2, 2, 0, 0, 2, 2, 3, 0, 3, 0, 2, 0, 0, 2, 3, + 0, 3, 3, 2, 0, 3, 3, 0, 3, 0, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 3, 2, 3, 2, 3, 2, 2, 0, 2, 3, 2, + 3, 2, 2, 2, 2, 3, 0, 2, 0, 0, 2, 3, 3, 0, 2, 3, 2, 2, 3, 0, 3, 0, 0, 2, 0, 2, 0, 2, 2, 3, 3, 2, + 3, 0, 0, 3, 2, 2, 0, 3, 2, 0, 0, 3, 0, 0, 2, 0, 3, 2, 0, 2, 0, 0, 0, 0, 
0, 2, 0, 0, 2, 3, 0, 0, + 2, 0, 0, 2, 0, 2, 3, 2, 3, 3, 2, 2, 0, 0, 0, 3, 0, 2, 0, 2, 0, 2, 2, 2, 3, 3, 0, 0, 3, 3, 3, 3, + 3, 2, 3, 3, 2, 3, 3, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 3, 3, 2, 3, 2, 3, 0, 2, 3, 0, 2, 0, 2, + 2, 0, 3, 0, 2, 0, 2, 3, 0, 3, 0, 0, 0, 3, 2, 3, 3, 0, 3, 2, 3, 0, 2, 3, 3, 0, 2, 3, 0, 0, 0, 2, + 0, 3, 0, 2, 3, 3, 3, 3, 3, 0, 2, 0, 2, 2, 3, 3, 0, 3, 0, 2, 0, 2, 0, 3, 0, 0, 0, 2, 3, 3, 2, 3, + 0, 0, 0, 0, 3, 3, 0, 3, 2, 0, 2, 3, 2, 2, 3, 3, 2, 2, 2, 0, 2, 3, 0, 3, 3, 0, 0, 2, 0, 3, 2, 3, + 0, 2, 0, 2, 2, 3, 2, 0, 3, 3, 3, 2, 3, 0, 3, 0, 2, 2, 0, 0, 0, 3, 0, 3, 3, 2, 3, 2, 3, 2, 3, 0, + 2, 3, 0, 2, 0, 3, 3, 3, 3, 3, 3, 2, 0, 3, 2, 2, 2, 3, 3, 2, 3, 0, 2, 3, 3, 2, 2, 0, 0, 0, 0, 3, + 0, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 0, 2, 3, 3, 3, 3, 3, 3, 0, 0, 2, 2, 3, 3, 2, 2, 0, 0, 3, 0, + 0, 0, 2, 3, 0, 0, 0, 3, 0, 3, 0, 2, 2, 0, 0, 0, 0, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 0, 0, 2, 3, + 0, 3, 3, 0, 3, 0, 0, 2, 0, 3, 3, 0, 2, 2, 3, 3, 0, 0, 2, 0, 2, 3, 2, 0, 0, 3, 3, 0, 3, 2, 0, 2, + 0, 2, 3, 2, 0, 3, 3, 2, 0, 0, 2, 2, 0, 0, 2, 0, 3, 3, 2, 3, 2, 0, 3, 0, 2, 2, 3, 3, 0, 3, 2, 2, + 0, 3, 0, 0, 0, 2, 0, 3, 2, 0, 2, 3, 2, 3, 2, 2, 3, 3, 0, 2, 3, 2, 3, 2, 2, 0, 3, 0, 3, 0, 2, 2, + 2, 0, 2, 0, 2, 2, 0, 0, 3, 3, 0, 0, 3, 2, 0, 2, 3, 2, 2, 0, 3, 3, 0, 2, 0, 3, 3, 0, 2, 3, 2, 3, + 2, 0, 2, 2, 0, 0, 0, 2, 2, 3, 3, 2, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 2, 0, 3, 3, + 3, 0, 2, 0, 2, 3, 2, 0, 3, 3, 2, 0, 2, 0, 3, 2, 0, 3, 0, 0, 2, 2, 0, 3, 0, 2, 3, 3, 3, 0, 2, 0, + 0, 3, 0, 2, 3, 2, 2, 0, 3, 3, 3, 3, 3, 0, 3, 0, 0, 0, 0, 3, 2, 0, 0, 2, 3, 3, 2, 2, 0, 3, 2, 0, + 3, 0, 2, 3, 3, 0, 2, 2, 3, 2, 2, 2, 3, 2, 0, 0, 3, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 3, 0, 3, 0, + 0, 3, 0, 0, 0, 3, 0, 0, 2, 2, 0, 2, 2, 3, 3, 3, 3, 0, 0, 2, 2, 2, 0, 3, 2, 2, 2, 2, 2, 0, 3, 0, + 0, 3, 2, 0, 0, 3, 2, 3, 3, 0, 3, 0, 3, 0, 3, 2, 2, 2, 0, 0, 3, 2, 2, 0, 0, 0, 2, 3, 2, 0, 2, 3, + 3, 3, 0, 3, 3, 0, 2, 0, 0, 2, 3, 3, 0, 3, 2, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, 2, 3, 0, 3, 3, 0, 3, + 2, 2, 0, 2, 
0, 3, 0, 3, 0, 2, 3, 0, 2, 3, 2, 0, 2, 0, 3, 0, 2, 3, 3, 2, 0, 3, 3, 3, 2, 2, 3, 3, + 2, 2, 2, 0, 3, 2, 2, 0}, + {271, 271, 329, 343, 387, 426, 426, 601}, + {426, 601, 426, 387, 343, 271, 329, 271}, + {3.70991, 4.43491, 3.76334, 9.43944, 9.43944, 3.70991, 3.76334, 4.43491}}}; + +typedef ConnectComponentsEdgesTest ConnectComponentsEdgesTestF_Int; +TEST_P(ConnectComponentsEdgesTestF_Int, Result) { EXPECT_TRUE(true); } + +INSTANTIATE_TEST_CASE_P(ConnectComponentsEdgesTest, + ConnectComponentsEdgesTestF_Int, + ::testing::ValuesIn(mr_fix_conn_inputsf2)); + +}; // namespace sparse +}; // end namespace raft diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu index 5e8a67c8f6..56f798b617 100644 --- a/cpp/test/util/device_atomics.cu +++ b/cpp/test/util/device_atomics.cu @@ -51,16 +51,16 @@ TEST(Raft, AtomicIncWarp) // Write all 1M thread indices to a unique location in `out_device` test_atomic_inc_warp_kernel<<>>(counter.data(), out_device.data()); - // Copy data to host - RAFT_CUDA_TRY(cudaMemcpy(out_host.data(), - (const void*)out_device.data(), - num_elts * sizeof(int), - cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpyAsync(out_host.data(), + (const void*)out_device.data(), + num_elts * sizeof(int), + cudaMemcpyDeviceToHost, + s)); // Check that count is correct and that each thread index is contained in the // array exactly once. 
- ASSERT_EQ(num_elts, counter.value(s)); + ASSERT_EQ(num_elts, counter.value(s)); // NB: accessing the counter synchronizes `s` std::sort(out_host.begin(), out_host.end()); for (int i = 0; i < num_elts; ++i) { ASSERT_EQ(i, out_host[i]); diff --git a/dependencies.yaml b/dependencies.yaml index 97d5731881..e4666fd7cc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8"] + cuda: ["11.8", "12.0"] arch: [x86_64] includes: - build @@ -109,10 +109,10 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cmake>=3.23.1,!=3.25.0 + - &cmake_ver cmake>=3.23.1,!=3.25.0 - cython>=0.29,<0.30 - ninja - - scikit-build>=0.13.1,<0.17.2 + - scikit-build>=0.13.1 - output_types: [conda] packages: - c-compiler @@ -135,8 +135,17 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cuda_python cuda-python>=11.7.1,<12.0 - - &rmm rmm==23.6.* + - &rmm rmm==23.8.* + specific: + - output_types: [conda, requirements, pyproject] + matrices: + - matrix: + cuda: "12.0" + packages: + - &cuda_python12 cuda-python>=12.0,<13.0a0 + - matrix: # All CUDA 11 versions + packages: + - &cuda_python11 cuda-python>=11.7.1,<12.0a0 checks: common: - output_types: [conda, requirements] @@ -160,15 +169,27 @@ dependencies: - h5py>=3.8.0 - libfaiss>=1.7.1 - faiss-proc=*=cuda + - matplotlib cudatoolkit: specific: - output_types: conda matrices: + - matrix: + cuda: "12.0" + packages: + - cuda-version=12.0 + - cuda-cudart-dev + - cuda-profiler-api + - libcublas-dev + - libcurand-dev + - libcusolver-dev + - libcusparse-dev - matrix: cuda: "11.8" packages: - - cudatoolkit=11.8 + - cuda-version=11.8 + - cudatoolkit - cuda-profiler-api=11.8.86 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 @@ -181,7 +202,8 @@ dependencies: - matrix: cuda: "11.5" packages: - - cudatoolkit=11.5 + - cuda-version=11.5 + - cudatoolkit - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg 
is missing several CUDA/arch packages - libcublas-dev>=11.7.3.1,<=11.7.4.6 - libcublas>=11.7.3.1,<=11.7.4.6 @@ -194,7 +216,8 @@ dependencies: - matrix: cuda: "11.4" packages: - - cudatoolkit=11.4 + - cuda-version=11.4 + - cudatoolkit - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages - &libcublas_dev114 libcublas-dev>=11.5.2.43,<=11.6.5.2 - &libcublas114 libcublas>=11.5.2.43,<=11.6.5.2 @@ -207,7 +230,8 @@ dependencies: - matrix: cuda: "11.2" packages: - - cudatoolkit=11.2 + - cuda-version=11.2 + - cudatoolkit - cuda-profiler-api>=11.4.240,<=11.8.86 # use any `11.x` version since pkg is missing several CUDA/arch packages # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs, # so 11.2 uses 11.4 packages (the oldest available). @@ -223,6 +247,7 @@ dependencies: common: - output_types: [conda] packages: + - *cmake_ver - gtest>=1.13.0 - gmock>=1.13.0 docs: @@ -264,27 +289,36 @@ dependencies: - output_types: [conda, pyproject] packages: - &numpy numpy>=1.21 - - *cuda_python - *rmm + specific: + - output_types: [conda, requirements, pyproject] + matrices: + - matrix: + cuda: "12.0" + packages: + - *cuda_python12 + - matrix: # All CUDA 11 versions + packages: + - *cuda_python11 run_raft_dask: common: - output_types: [conda, pyproject] packages: - - dask==2023.3.2 - - dask-cuda==23.6.* - - distributed==2023.3.2.1 + - dask==2023.7.1 + - dask-cuda==23.8.* + - distributed==2023.7.1 - joblib>=0.11 - numba>=0.57 - *numpy - - ucx-py=0.32.* + - ucx-py==0.33.* - output_types: conda packages: - - dask-core==2023.3.2 + - dask-core==2023.7.1 - ucx>=1.13.0 - ucx-proc=*=gpu - output_types: pyproject packages: - - pylibraft==23.6.* + - pylibraft==23.8.* test_python_common: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/source/ann_benchmarks_build.md b/docs/source/ann_benchmarks_build.md new file mode 100644 index 0000000000..80730c5d68 --- /dev/null +++ 
b/docs/source/ann_benchmarks_build.md @@ -0,0 +1,48 @@ +### Dependencies + +CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. + +Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. + +In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: +1. FAISS GPU >= 1.7.1 +2. Google Logging (GLog) +3. H5Py +4. HNSWLib +5. nlohmann_json +6. GGNN + +[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. + +The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: + +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks +``` + +The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. + +### Compiling the Benchmarks + +After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. 
The following will build the executables for all the supported algorithms: +```bash +./build.sh bench-ann +``` + +You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): +```bash +./build.sh bench-ann -n --limit-bench-ann="HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH" +``` + +Available targets to use with `--limit-bench-ann` are: +- FAISS_IVF_FLAT_ANN_BENCH +- FAISS_IVF_PQ_ANN_BENCH +- FAISS_BFKNN_ANN_BENCH +- GGNN_ANN_BENCH +- HNSWLIB_ANN_BENCH +- RAFT_CAGRA_ANN_BENCH +- RAFT_IVF_PQ_ANN_BENCH +- RAFT_IVF_FLAT_ANN_BENCH + +By default, the `*_ANN_BENCH` executables infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, `int8_t`, and `uint8_t` are supported. \ No newline at end of file diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md new file mode 100644 index 0000000000..99a6bfbd3a --- /dev/null +++ b/docs/source/ann_benchmarks_dataset.md @@ -0,0 +1,47 @@ +# ANN Benchmarks Datasets + +A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. + +The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. 
+These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. + +Some implementations can take `float16` database and query vectors as inputs and will have better performance. Use `cpp/bench/ann/scripts/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. + +Commonly used datasets can be downloaded from two websites: +1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). + + However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: + ```bash + pip3 install numpy h5py + ``` + The usage of this script is: + ```bash + $ cpp/bench/ann/scripts/hdf5_to_fbin.py + usage: scripts/hdf5_to_fbin.py [-n] .hdf5 + -n: normalize base/query set + outputs: .base.fbin + .query.fbin + .groundtruth.neighbors.ibin + .groundtruth.distances.fbin + ``` + So for an input `.hdf5` file, four output binary files will be produced. See previous section for an example of preprocessing the GloVe dataset. + + Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. + +2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. 
A script is provided for this: + ```bash + $ cpp/bench/ann/scripts/split_groundtruth.pl + usage: script/split_groundtruth.pl input output_prefix + ``` + Take Deep-1B dataset as an example: + ```bash + pushd + cd cpp/bench/ann + mkdir -p data/deep-1B && cd data/deep-1B + # download manually "Ground Truth" file of "Yandex DEEP" + # suppose the file name is deep_new_groundtruth.public.10K.bin + ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth + # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + popd + ``` + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This means we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. \ No newline at end of file diff --git a/docs/source/ann_benchmarks_low_level.md b/docs/source/ann_benchmarks_low_level.md new file mode 100644 index 0000000000..f95d01f66f --- /dev/null +++ b/docs/source/ann_benchmarks_low_level.md @@ -0,0 +1,146 @@ +### Low-level Scripts and Executables +#### End-to-end Example +An end-to-end example (run from the RAFT source code root directory): +```bash +# (1) prepare a dataset +pushd + +cd cpp/bench/ann +mkdir data && cd data +wget http://ann-benchmarks.com/glove-100-angular.hdf5 + +# option -n is used here to normalize vectors so cosine distance is converted +# to inner product; don't use -n for l2 distance +python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 + +mkdir glove-100-inner +mv glove-100-angular.base.fbin glove-100-inner/base.fbin +mv glove-100-angular.query.fbin glove-100-inner/query.fbin +mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin +mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin +popd + 
+# (2) build index +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (3) search +./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json + +# (4) evaluate result +pushd +cd cpp/bench/ann +./scripts/eval.pl \ + -o result.csv \ + data/glove-100-inner/groundtruth.neighbors.ibin \ + result/glove-100-inner/faiss_ivf_flat +popd + +# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool +``` + +##### Step 1: Prepare Dataset +[Instructions](ann_benchmarks_dataset.md) + + +##### Step 2: Build Index +An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. + +To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: +* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. + - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. +* `search_basic_param` section specifies basic parameters for searching: + - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. + - `run_count` means how many times we run the searching. A single run of searching will search neighbors for all vectors in `test` set. The total time used for a run is recorded, and the final searching time is the smallest one among these runs. 
+* `index` section specifies an array of configurations for index building and searching: + - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. + - `file` is the file name of index. Building will save built index to this file, while searching will load this file. + - `search_result_file` is the file name prefix of searching results. Searching will save results to these files, and plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in `search_params` array. That is, for one searching parameter, there will be some corresponding search result files. + - if `multigpu` is specified, multiple GPUs will be used for index build and search. + - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. 
+ + +The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: +```bash +$ ./cpp/build/*_ANN_BENCH -h +usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json + -b: build mode, will build index + -s: search mode, will search using built index + one and only one of -b and -s should be specified + -f: force overwriting existing output files + -i: by default will build/search all the indices found in conf.json + '-i' can be used to select a subset of indices + 'index_names' is a list of comma-separated index names + '*' is allowed as the last character of a name to select all matched indices + for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" +``` +* `-b`: build index. +* `-s`: do the searching with built index. +* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. +* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. + +It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: +```json + "index" : [ + { + "name" : "hnsw1", + ... + }, + { + "name" : "hnsw2", + ... + }, + { + "name" : "faiss", + ... 
+ } + ] +``` +Then, +```bash +# build all indices: hnsw1, hnsw2 and faiss +./cpp/build/HNSWLIB_ANN_BENCH -b a.json + +# build only hnsw1 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json + +# build hnsw1 and hnsw2 +./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json + +# build faiss +./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json +``` +In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. + + +##### Step 3: Searching +Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. + + +##### Step 4: Evaluating Results +Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: +```bash +$ cpp/bench/ann/scripts/eval.pl +usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... + result_paths... are paths to the search result files. + Can specify multiple paths. + For each of them, if it's a directory, all the .txt files found under + it recursively will be regarded as inputs. + + -f: force to recompute recall and update it in result file if needed + -o: also write result to a csv file +``` +Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
+An example: +```bash +cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ + result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ + result/glove-100-angular/10/faiss/ +``` +The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively. + +This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. + +It saves recall value in result txt file, so avoids to recompute recall if the same command is run again. To force to recompute recall, option `-f` can be used. If option `-o ` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. diff --git a/docs/source/build.md b/docs/source/build.md index bd2afe6638..cb7ca6f4e6 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -8,9 +8,15 @@ The easiest way to install RAFT is through conda and several packages are provid - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. -Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. +Use the following command, depending on your CUDA version, to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. 
```bash -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft +# for CUDA 11.8 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=11.8 +``` + +```bash +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 ``` You can also install the conda packages individually using the `mamba` command above. @@ -258,7 +264,7 @@ While not a highly suggested method for building against RAFT, when all of the n set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") ExternalProject_Add(raft GIT_REPOSITORY git@github.com:rapidsai/raft.git - GIT_TAG branch-23.06 + GIT_TAG branch-23.08 PREFIX ${RAFT_GIT_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -288,7 +294,7 @@ The following `cmake` snippet enables a flexible configuration of RAFT: ```cmake -set(RAFT_VERSION "23.06") +set(RAFT_VERSION "23.08") set(RAFT_FORK "rapidsai") set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") diff --git a/docs/source/conf.py b/docs/source/conf.py index 62fb2b2148..551049f3e1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,15 +67,9 @@ # built documents. # # The short X.Y version. -version = '23.06' +version = '23.08' # The full version, including alpha/beta/rc tags. -<<<<<<< HEAD -release = '23.06.02' -||||||| 994e6c8b -release = '23.06.02' -======= -release = '23.06.02' ->>>>>>> upstream/branch-23.06 +release = '23.08.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index c4728337a0..7e69f92948 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -19,4 +19,5 @@ expose in public APIs. 
core_kvp.rst core_nvtx.rst core_interruptible.rst - core_operators.rst \ No newline at end of file + core_operators.rst + core_math.rst \ No newline at end of file diff --git a/docs/source/cpp_api/core_math.rst b/docs/source/cpp_api/core_math.rst new file mode 100644 index 0000000000..681bf02e66 --- /dev/null +++ b/docs/source/cpp_api/core_math.rst @@ -0,0 +1,18 @@ +Mathematical Functions +====================== + +.. role:: py(code) + :language: c++ + :class: highlight + + +The math functions APIs guarantee both CUDA and CPU compatibility, making it more straightforward to write `__host__ __device__` functions without being concerned whether the underlying intrinsics will build and work. + +``#include `` + +namespace *raft::core* + +.. doxygengroup:: math_functions + :project: RAFT + :members: + :content-only: diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst index 4f1dd4e5a4..e3d402d6af 100644 --- a/docs/source/cpp_api/core_resources.rst +++ b/docs/source/cpp_api/core_resources.rst @@ -143,7 +143,7 @@ Device Memory Resource namespace *raft::resource* - .. doxygengroup:: resource_memory_resource + .. doxygengroup:: device_memory_resource :project: RAFT :members: :content-only: diff --git a/docs/source/cpp_api/mdspan_mdarray.rst b/docs/source/cpp_api/mdspan_mdarray.rst index e14fe5a9e3..bcc2254204 100644 --- a/docs/source/cpp_api/mdspan_mdarray.rst +++ b/docs/source/cpp_api/mdspan_mdarray.rst @@ -7,7 +7,7 @@ mdarray: Multi-dimensional Owning Container ``#include `` -.. doxygengroup:: mdarray +.. doxygengroup:: mdarray_apis :project: RAFT :members: :content-only: diff --git a/docs/source/cpp_api/mdspan_mdspan.rst b/docs/source/cpp_api/mdspan_mdspan.rst index 619150f538..6011a9f103 100644 --- a/docs/source/cpp_api/mdspan_mdspan.rst +++ b/docs/source/cpp_api/mdspan_mdspan.rst @@ -19,11 +19,15 @@ mdspan: Multi-dimensional Non-owning View .. 
doxygenfunction:: raft::make_strided_layout(Extents extents, Strides strides) :project: RAFT -.. doxygenfunction:: raft::unravel_index +.. doxygengroup:: mdspan_unravel :project: RAFT + :members: + :content-only: -.. doxygenfunction:: raft::make_const_mdspan(mdspan_type mds) +.. doxygengroup:: mdspan_make_const :project: RAFT + :members: + :content-only: Device Vocabulary diff --git a/docs/source/cpp_api/mdspan_representation.rst b/docs/source/cpp_api/mdspan_representation.rst index f514cf38e0..386e6f14e9 100644 --- a/docs/source/cpp_api/mdspan_representation.rst +++ b/docs/source/cpp_api/mdspan_representation.rst @@ -8,14 +8,12 @@ Multi-dimensional Representation Data Layouts ------------- -``#include `` - -.. doxygentypedef:: raft::row_major - :project: RAFT +``#include `` -.. doxygentypedef:: raft::col_major +.. doxygengroup:: mdspan_layout :project: RAFT - + :members: + :content-only: Shapes ------ diff --git a/docs/source/cpp_api/mdspan_temporary_device_buffer.rst b/docs/source/cpp_api/mdspan_temporary_device_buffer.rst index 90d08ac5bb..8c6fdd2a9d 100644 --- a/docs/source/cpp_api/mdspan_temporary_device_buffer.rst +++ b/docs/source/cpp_api/mdspan_temporary_device_buffer.rst @@ -7,17 +7,15 @@ temporary_device_buffer: Temporary raft::device_mdspan Producing Object ``#include `` -.. doxygenclass:: raft::temporary_device_buffer +.. doxygengroup:: temporary_device_buffer :project: RAFT :members: + :content-only: Factories --------- -.. doxygenfunction:: raft::make_temporary_device_buffer - :project: RAFT - -.. doxygenfunction:: raft::make_readonly_temporary_device_buffer - :project: RAFT -.. doxygenfunction:: raft::make_writeback_temporary_device_buffer +.. 
doxygengroup:: temporary_device_buffer_factories :project: RAFT + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_cagra.rst b/docs/source/cpp_api/neighbors_cagra.rst index 68372bbb71..6613b0b06d 100644 --- a/docs/source/cpp_api/neighbors_cagra.rst +++ b/docs/source/cpp_api/neighbors_cagra.rst @@ -11,7 +11,7 @@ Please note that the CAGRA implementation is currently experimental and the API ``#include `` -namespace *raft::neighbors::experimental::cagra* +namespace *raft::neighbors::cagra* .. doxygengroup:: cagra :project: RAFT diff --git a/docs/source/cpp_api/sparse_types.rst b/docs/source/cpp_api/sparse_types.rst index e69de29bb2..4ddf2cc0d5 100644 --- a/docs/source/cpp_api/sparse_types.rst +++ b/docs/source/cpp_api/sparse_types.rst @@ -0,0 +1,22 @@ +Sparse Types +============ + +.. role:: py(code) + :language: c++ + :class: highlight + + +``#include `` + +.. doxygengroup:: sparse_types + :project: RAFT + :members: + :content-only: + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + sparse_types_coo_matrix.rst + sparse_types_csr_matrix.rst diff --git a/docs/source/cpp_api/sparse_types_coo_matrix.rst b/docs/source/cpp_api/sparse_types_coo_matrix.rst new file mode 100644 index 0000000000..855d89fdea --- /dev/null +++ b/docs/source/cpp_api/sparse_types_coo_matrix.rst @@ -0,0 +1,39 @@ +COO Matrix +========== + +.. role:: py(code) + :language: c++ + :class: highlight + + +Basic Vocabulary +---------------- + +``#include `` + +.. doxygengroup:: coo_matrix + :project: RAFT + :members: + :content-only: + + +Device COO Matrix +----------------- + +``#include `` + +.. doxygengroup:: device_coo_matrix + :project: RAFT + :members: + :content-only: + +Host COO Matrix +----------------- + +``#include `` + +.. 
doxygengroup:: host_coo_matrix + :project: RAFT + :members: + :content-only: + diff --git a/docs/source/cpp_api/sparse_types_csr_matrix.rst b/docs/source/cpp_api/sparse_types_csr_matrix.rst new file mode 100644 index 0000000000..b704846c4e --- /dev/null +++ b/docs/source/cpp_api/sparse_types_csr_matrix.rst @@ -0,0 +1,39 @@ +CSR Matrix +========== + +.. role:: py(code) + :language: c++ + :class: highlight + + +Basic Vocabulary +---------------- + +``#include `` + +.. doxygengroup:: csr_matrix + :project: RAFT + :members: + :content-only: + + +Device CSR Matrix +----------------- + +``#include `` + +.. doxygengroup:: device_csr_matrix + :project: RAFT + :members: + :content-only: + +Host CSR Matrix +----------------- + +``#include `` + +.. doxygengroup:: host_csr_matrix + :project: RAFT + :members: + :content-only: + diff --git a/docs/source/cuda_ann_benchmarks.md b/docs/source/cuda_ann_benchmarks.md deleted file mode 100644 index 708f5f7dba..0000000000 --- a/docs/source/cuda_ann_benchmarks.md +++ /dev/null @@ -1,322 +0,0 @@ -# CUDA ANN Benchmarks - -This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. - -## Benchmark - -### Dependencies - -CUDA 11 and a GPU with Pascal architecture or later are required to run the benchmarks. - -Please refer to the [installation docs](https://docs.rapids.ai/api/raft/stable/build.html#cuda-gpu-requirements) for the base requirements to build RAFT. - -In addition to the base requirements for building RAFT, additional dependencies needed to build the ANN benchmarks include: -1. FAISS GPU >= 1.7.1 -2. Google Logging (GLog) -3. H5Py -4. HNSWLib -5. nlohmann_json -6. GGNN - -[rapids-cmake](https://github.com/rapidsai/rapids-cmake) is used to build the ANN benchmarks so the code for dependencies not already supplied in the CUDA toolkit will be downloaded and built automatically. 
- -The easiest (and most reproducible) way to install the dependencies needed to build the ANN benchmarks is to use the conda environment file located in the `conda/environments` directory of the RAFT repository. The following command will use `mamba` (which is preferred over `conda`) to build and activate a new environment for compiling the benchmarks: - -```bash -mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml -conda activate raft_ann_benchmarks -``` - -The above conda environment will also reduce the compile times as dependencies like FAISS will already be installed and not need to be compiled with `rapids-cmake`. - -### Compiling the Benchmarks - -After the needed dependencies are satisfied, the easiest way to compile ANN benchmarks is through the `build.sh` script in the root of the RAFT source code repository. The following will build the executables for all the support algorithms: -```bash -./build.sh bench-ann -``` - -You can limit the algorithms that are built by providing a semicolon-delimited list of executable names (each algorithm is suffixed with `_ANN_BENCH`): -```bash -./build.sh bench-ann --limit-bench-ann=HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH -``` - -Available targets to use with `--limit-bench-ann` are: -- FAISS_IVF_FLAT_ANN_BENCH -- FAISS_IVF_PQ_ANN_BENCH -- FAISS_BFKNN_ANN_BENCH -- GGNN_ANN_BENCH -- HNSWLIB_ANN_BENCH -- RAFT_IVF_PQ_ANN_BENCH -- RAFT_IVF_FLAT_ANN_BENCH -- RAFT_BFKNN_ANN_BENCH - -By default, the `*_ANN_BENCH` executables program infer the dataset's datatype from the filename's extension. For example, an extension of `fbin` uses a `float` datatype, `f16bin` uses a `float16` datatype, extension of `i8bin` uses `int8_t` datatype, and `u8bin` uses `uint8_t` type. Currently, only `float`, `float16`, int8_t`, and `unit8_t` are supported. - -### Usage -There are 4 general steps to running the benchmarks: -1. Prepare Dataset -2. Build Index -3. Search Using Built Index -4. 
Evaluate Result - -#### End-to-end Example -An end-to-end example (run from the RAFT source code root directory): -```bash -# (1) prepare a dataset -pushd - -cd cpp/bench/ann -mkdir data && cd data -wget http://ann-benchmarks.com/glove-100-angular.hdf5 - -# option -n is used here to normalize vectors so cosine distance is converted -# to inner product; don't use -n for l2 distance -python scripts/hdf5_to_fbin.py -n glove-100-angular.hdf5 - -mkdir glove-100-inner -mv glove-100-angular.base.fbin glove-100-inner/base.fbin -mv glove-100-angular.query.fbin glove-100-inner/query.fbin -mv glove-100-angular.groundtruth.neighbors.ibin glove-100-inner/groundtruth.neighbors.ibin -mv glove-100-angular.groundtruth.distances.fbin glove-100-inner/groundtruth.distances.fbin -popd - -# (2) build index -./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -b -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json - -# (3) search -./cpp/build/RAFT_IVF_FLAT_ANN_BENCH -s -i raft_ivf_flat.nlist1024 conf/glove-100-inner.json - -# (4) evaluate result -pushd -cd cpp/bench/ann -./scripts/eval.pl \ - -o result.csv \ - data/glove-100-inner/groundtruth.neighbors.ibin \ - result/glove-100-inner/faiss_ivf_flat -popd - -# optional step: plot QPS-Recall figure using data in result.csv with your favorite tool -``` - -##### Step 1: Prepare Dataset -A dataset usually has 4 binary files containing database vectors, query vectors, ground truth neighbors and their corresponding distances. For example, Glove-100 dataset has files `base.fbin` (database vectors), `query.fbin` (query vectors), `groundtruth.neighbors.ibin` (ground truth neighbors), and `groundtruth.distances.fbin` (ground truth distances). The first two files are for index building and searching, while the other two are associated with a particular distance and are used for evaluation. 
- -The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively. -These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order. - -Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `script/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type. - -Commonly used datasets can be downloaded from two websites: -1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks). - - However, these datasets are in HDF5 format. Use `cpp/bench/ann/scripts/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it: - ```bash - pip3 install numpy h5py - ``` - The usage of this script is: - ```bash - $ cpp/bench/ann/scripts/hdf5_to_fbin.py - usage: scripts/hdf5_to_fbin.py [-n] .hdf5 - -n: normalize base/query set - outputs: .base.fbin - .query.fbin - .groundtruth.neighbors.ibin - .groundtruth.distances.fbin - ``` - So for an input `.hdf5` file, four output binary files will be produced. See previous section for an example of prepossessing GloVe dataset. - - Most datasets provided by `ann-benchmarks` use `Angular` or `Euclidean` distance. `Angular` denotes cosine distance. However, computing cosine distance reduces to computing inner product by normalizing vectors beforehand. In practice, we can always do the normalization to decrease computation cost, so it's better to measure the performance of inner product rather than cosine distance. The `-n` option of `hdf5_to_fbin.py` can be used to normalize the dataset. - -2. 
Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this: - ```bash - $ cpp/bench/ann/scripts/split_groundtruth.pl - usage: script/split_groundtruth.pl input output_prefix - ``` - Take Deep-1B dataset as an example: - ```bash - pushd - cd cpp/bench/ann - mkdir -p data/deep-1B && cd data/deep-1B - # download manually "Ground Truth" file of "Yandex DEEP" - # suppose the file name is deep_new_groundtruth.public.10K.bin - ../../scripts/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth - # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced - popd - ``` - Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. - - -##### Step 2: Build Index -An index is a data structure to facilitate searching. Different algorithms may use different data structures for their index. We can use `RAFT_IVF_FLAT_ANN_BENCH -b` to build an index and save it to disk. - -To run a benchmark executable, like `RAFT_IVF_FLAT_ANN_BENCH`, a JSON configuration file is required. Refer to [`cpp/bench/ann/conf/glove-100-inner.json`](../../cpp/cpp/bench/ann/conf/glove-100-inner.json) as an example. Configuration file has 3 sections: -* `dataset` section specifies the name and files of a dataset, and also the distance in use. Since the `*_ANN_BENCH` programs are for index building and searching, only `base_file` for database vectors and `query_file` for query vectors are needed. Ground truth files are for evaluation thus not needed. 
- - To use only a subset of the base dataset, an optional parameter `subset_size` can be specified. It means using only the first `subset_size` vectors of `base_file` as the base dataset. -* `search_basic_param` section specifies basic parameters for searching: - - `k` is the "k" in "k-nn", that is, the number of neighbors (or results) we want from the searching. - - `run_count` means how many times we run the searching. A single run of searching will search neighbors for all vectors in `test` set. The total time used for a run is recorded, and the final searching time is the smallest one among these runs. -* `index` section specifies an array of configurations for index building and searching: - - `build_param` and `search_params` are parameters for building and searching, respectively. `search_params` is an array since we will search with different parameters to get different recall values. - - `file` is the file name of index. Building will save built index to this file, while searching will load this file. - - `search_result_file` is the file name prefix of searching results. Searching will save results to these files, and plotting script will read these files to plot results. Note this is a prefix rather than a whole file name. Suppose its value is `${prefix}`, then the real file names are like `${prefix}.0.{ibin|txt}`, `${prefix}.1.{ibin|txt}`, etc. Each of them corresponds to an item in `search_params` array. That is, for one searching parameter, there will be some corresponding search result files. - - if `multigpu` is specified, multiple GPUs will be used for index build and search. - - if `refine_ratio` is specified, refinement, as a post-processing step of search, will be done. It's for algorithms that compress vectors. For example, if `"refine_ratio" : 2` is set, 2`k` results are first computed, then exact distances of them are computed using original uncompressed vectors, and finally top `k` results among them are kept. 
- - -The usage of `*_ANN_BENCH` can be found by running `*_ANN_BENCH -h` on one of the executables: -```bash -$ ./cpp/build/*_ANN_BENCH -h -usage: ./cpp/build/*_ANN_BENCH -b|s [-f] [-i index_names] conf.json - -b: build mode, will build index - -s: search mode, will search using built index - one and only one of -b and -s should be specified - -f: force overwriting existing output files - -i: by default will build/search all the indices found in conf.json - '-i' can be used to select a subset of indices - 'index_names' is a list of comma-separated index names - '*' is allowed as the last character of a name to select all matched indices - for example, -i "hnsw1,hnsw2,faiss" or -i "hnsw*,faiss" -``` -* `-b`: build index. -* `-s`: do the searching with built index. -* `-f`: before doing the real task, the program checks that needed input files exist and output files don't exist. If these conditions are not met, it quits so no file would be overwritten accidentally. To ignore existing output files and force overwrite them, use the `-f` option. -* `-i`: by default, the `-b` flag will build all indices found in the configuration file, and `-s` will search using all the indices. To select a subset of indices to build or search, we can use the `-i` option. - -It's easier to describe the usage of `-i` option with an example. Suppose we have a configuration file `a.json`, and it contains: -```json - "index" : [ - { - "name" : "hnsw1", - ... - }, - { - "name" : "hnsw1", - ... - }, - { - "name" : "faiss", - ... 
- } - ] -``` -Then, -```bash -# build all indices: hnsw1, hnsw2 and faiss -./cpp/build/HNSWLIB_ANN_BENCH -b a.json - -# build only hnsw1 -./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1 a.json - -# build hnsw1 and hnsw2 -./cpp/build/HNSWLIB_ANN_BENCH -b -i hnsw1,hnsw2 a.json - -# build hnsw1 and hnsw2 -./cpp/build/HNSWLIB_ANN_BENCH -b -i 'hnsw*' a.json - -# build faiss -./cpp/build/FAISS_IVF_FLAT_ANN_BENCH -b -i 'faiss' a.json -``` -In the last two commands, we use wildcard "`*`" to match both `hnsw1` and `hnsw2`. Note the use of "`*`" is quite limited. It can occur only at the end of a pattern, so both "`*nsw1`" and "`h*sw1`" are interpreted literally and will not match anything. Also note that quotation marks must be used to prevent "`*`" from being interpreted by the shell. - - -##### Step 3: Searching -Use the `-s` flag on any of the `*_ANN_BENCH` executables. Other options are the same as in step 2. - - -##### Step 4: Evaluating Results -Use `cpp/bench/ann/scripts/eval.pl` to evaluate benchmark results. The usage is: -```bash -$ cpp/bench/ann/scripts/eval.pl -usage: [-f] [-o output.csv] groundtruth.neighbors.ibin result_paths... - result_paths... are paths to the search result files. - Can specify multiple paths. - For each of them, if it's a directory, all the .txt files found under - it recursively will be regarded as inputs. - - -f: force to recompute recall and update it in result file if needed - -o: also write result to a csv file -``` -Note that there can be multiple arguments for paths of result files. Each argument can be either a file name or a path. If it's a directory, all files found under it recursively will be used as input files. 
-An example: -```bash -cpp/bench/ann/scripts/eval.pl groundtruth.neighbors.ibin \ - result/glove-100-angular/10/hnsw/angular_M_24_*.txt \ - result/glove-100-angular/10/faiss/ -``` -The search result files used by this command are files matching `result/glove-100-angular/10/hnsw/angular_M_24_*.txt`, and all `.txt` files under directory `result/glove-100-angular/10/faiss/` recursively. - -This script prints recall and QPS for every result file. Also, it outputs estimated "recall at QPS=2000" and "QPS at recall=0.9", which can be used to compare performance quantitatively. - -It saves recall value in result txt file, so avoids to recompute recall if the same command is run again. To force to recompute recall, option `-f` can be used. If option `-o ` is specified, a csv output file will be produced. This file can be used to plot Throughput-Recall curves. - -## Adding a new ANN algorithm -Implementation of a new algorithm should be a class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. - -In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Take `class HnswLib` as an example, its definition is: -```c++ -template -class HnswLib : public ANN { -public: - struct BuildParam { - int M; - int ef_construction; - int num_threads; - }; - - using typename ANN::AnnSearchParam; - struct SearchParam : public AnnSearchParam { - int ef; - int num_threads; - }; - - // ... -}; -``` - -The benchmark program uses JSON configuration file. To add the new algorithm to the benchmark, need be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in configuration file. 
Still take the configuration for `HnswLib` as an example: -```json -{ - "name" : "...", - "algo" : "hnswlib", - "build_param": {"M":12, "efConstruction":500, "numThreads":32}, - "file" : "/path/to/file", - "search_params" : [ - {"ef":10, "numThreads":1}, - {"ef":20, "numThreads":1}, - {"ef":40, "numThreads":1}, - ], - "search_result_file" : "/path/to/file" -}, -``` - -How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: -1. First, add two functions for parsing JSON object to `struct BuildParam` and `struct SearchParam`, respectively: - ```c++ - template - void parse_build_param(const nlohmann::json& conf, - typename cuann::HnswLib::BuildParam& param) { - param.ef_construction = conf.at("efConstruction"); - param.M = conf.at("M"); - if (conf.contains("numThreads")) { - param.num_threads = conf.at("numThreads"); - } - } - - template - void parse_search_param(const nlohmann::json& conf, - typename cuann::HnswLib::SearchParam& param) { - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { - param.num_threads = conf.at("numThreads"); - } - } - ``` - -2. Next, add corresponding `if` case to functions `create_algo()` and `create_search_param()` by calling parsing functions. The string literal in `if` condition statement must be the same as the value of `algo` in configuration file. For example, - ```c++ - // JSON configuration file contains a line like: "algo" : "hnswlib" - if (algo == "hnswlib") { - // ... - } - ``` diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 7664744145..3b90570028 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. 
- The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-23.08/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-23.08/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -215,7 +215,7 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... 
list ``` ### Copyright header -[copyright.py](https://github.com/rapidsai/raft/blob/branch-23.06/ci/checks/copyright.py) checks the Copyright header for all git-modified files +[copyright.py](https://github.com/rapidsai/raft/blob/branch-23.08/ci/checks/copyright.py) checks the Copyright header for all git-modified files Manually, you can run the following to bulk-fix the header if only the years need to be updated: ```bash @@ -229,7 +229,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-23.08/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp @@ -255,7 +255,7 @@ There are 7 logging levels with each successive level becoming quieter: 7. 
RAFT_LEVEL_OFF Pass one of these as per your needs into the `set_level()` method as follows: ```cpp -raft::logger::get.set_level(RAFT_LEVEL_WARN); +raft::logger::get().set_level(RAFT_LEVEL_WARN); // From now onwards, this will print only WARN and above kind of messages ``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 23e346c872..37235c2f25 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,25 @@ -RAPIDS RAFT: Reusable Accelerated Functions and Tools -===================================================== +RAPIDS RAFT: Reusable Accelerated Functions and Tools for Vector Search and More +================================================================================ -RAFT contains fundamental widely-used algorithms and primitives for scientific computing, data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. +.. image:: ../../img/raft-tech-stack-vss.png + :width: 800 + :alt: RAFT Tech Stack + +Resources +######### + +.. _raft_reference: https://docs.rapids.ai/api/raft/stable/ + +- `Example Notebooks `_: Example Jupyter notebooks +- `RAPIDS Community `_: Get help, contribute, and collaborate. +- `GitHub repository `_: Download the RAFT source code. +- `Issue tracker `_: Report issues or request features. + + +Overview +######## + +RAFT contains fundamental widely-used algorithms and primitives for machine learning and information retrieval. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. By taking a primitives-based approach to algorithm development, RAFT @@ -9,7 +27,6 @@ By taking a primitives-based approach to algorithm development, RAFT - reduces the maintenance burden by maximizing reuse across projects, and - centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them.
- While not exhaustive, the following general categories help summarize the accelerated building blocks that RAFT contains: .. list-table:: @@ -25,7 +42,7 @@ While not exhaustive, the following general categories help summarize the accele * - Sparse Operations - linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling * - Spatial - - pairwise distances, nearest neighbors, neighborhood graph construction + - pairwise distances, nearest neighbors and vector search, neighborhood graph construction * - Basic Clustering - spectral clustering, hierarchical clustering, k-means * - Solvers @@ -36,18 +53,18 @@ While not exhaustive, the following general categories help summarize the accele - common utilities for developing CUDA applications, multi-node multi-gpu infrastructure .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Contents: quick_start.md build.md - developer_guide.md cpp_api.rst pylibraft_api.rst - cuda_ann_benchmarks.md + using_libraft.md + raft_ann_benchmarks.md raft_dask_api.rst using_comms.rst - using_libraft.md + developer_guide.md contributing.md diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst index c314f1c84d..ca89c25ed4 100644 --- a/docs/source/pylibraft_api/neighbors.rst +++ b/docs/source/pylibraft_api/neighbors.rst @@ -14,6 +14,20 @@ Brute Force .. autofunction:: pylibraft.neighbors.brute_force.knn +CAGRA +##### + +.. autoclass:: pylibraft.neighbors.cagra.IndexParams + :members: + +.. autofunction:: pylibraft.neighbors.cagra.build + +.. autoclass:: pylibraft.neighbors.cagra.SearchParams + :members: + +.. 
autofunction:: pylibraft.neighbors.cagra.search + + IVF-Flat ######## diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index e955706dc4..3909b40f20 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -118,7 +118,7 @@ auto metric = raft::distance::DistanceType::L2SqrtExpanded; raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); ``` -### Python Example +## Python Example The `pylibraft` package contains a Python API for RAFT algorithms and primitives. `pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The number of RAFT algorithms exposed in this package is continuing to grow from release to release. diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md new file mode 100644 index 0000000000..91958c0bcd --- /dev/null +++ b/docs/source/raft_ann_benchmarks.md @@ -0,0 +1,277 @@ +# RAFT ANN Benchmarks + +This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. + +## Installing the benchmarks + +The easiest way to install these benchmarks is through conda. We suggest using mamba as it generally leads to a faster install time: +```bash +mamba env create --name raft_ann_benchmarks -f conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +conda activate raft_ann_benchmarks + +mamba install -c rapidsai libraft-ann-bench +``` +The channel `rapidsai` can easily be substituted with `rapidsai-nightly` if nightly benchmarks are desired. + +Please see the [build instructions](ann_benchmarks_build.md) to build the benchmarks from source.
+ +## Running the benchmarks + +### Usage +There are 4 general steps to running the benchmarks and visualizing the results: +1. Prepare Dataset +2. Build Index and Search Index +3. Evaluate Results +4. Plot Results + +We provide a collection of lightweight Python scripts that are wrappers over +lower level scripts and executables to run our benchmarks. Either Python scripts or +[low-level scripts and executables](ann_benchmarks_low_level.md) are valid methods to run benchmarks; +however, plots are only provided through our Python scripts. An environment variable `RAFT_HOME` is +expected to be defined to run these scripts; this variable holds the directory where RAFT is cloned. +### End-to-end example: Million-scale +```bash +export RAFT_HOME=$(pwd) +# All scripts are present in directory raft/scripts/ann-benchmarks + +# (1) prepare dataset +python scripts/ann-benchmarks/get_dataset.py --name glove-100-angular --normalize + +# (2) build and search index +python scripts/ann-benchmarks/run.py --configuration conf/glove-100-inner.json + +# (3) evaluate results +python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/glove-100-inner/groundtruth.neighbors.ibin result/glove-100-inner/ + +# (4) plot results +python scripts/ann-benchmarks/plot.py --result_csv out.csv +``` + +### End-to-end example: Billion-scale +`scripts/ann-benchmarks/get_dataset.py` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.html#billion-scale) +because they are so large. You should instead use our billion-scale datasets guide to download and prepare them. +All other Python scripts mentioned below work as intended once the +billion-scale dataset has been downloaded.
+To download billion-scale datasets, visit [big-ann-benchmarks](http://big-ann-benchmarks.com/neurips21.html). + +```bash +export RAFT_HOME=$(pwd) +# All scripts are present in directory raft/scripts/ann-benchmarks + +mkdir -p data/deep-1B +# (1) prepare dataset +# download manually "Ground Truth" file of "Yandex DEEP" +# suppose the file name is deep_new_groundtruth.public.10K.bin +python scripts/ann-benchmarks/split_groundtruth.py --groundtruth data/deep-1B/deep_new_groundtruth.public.10K.bin +# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced + +# (2) build and search index +python scripts/ann-benchmarks/run.py --configuration conf/deep-1B.json + +# (3) evaluate results +python scripts/ann-benchmarks/data_export.py --output out.csv --groundtruth data/deep-1B/groundtruth.neighbors.ibin result/deep-1B/ + +# (4) plot results +python scripts/ann-benchmarks/plot.py --result_csv out.csv +``` + +The usage of `scripts/ann-benchmarks/split_groundtruth.py` is: +```bash +usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH + +options: + -h, --help show this help message and exit + --groundtruth GROUNDTRUTH + Path to billion-scale dataset groundtruth file (default: None) +``` + +#### Step 1: Prepare Dataset +The script `scripts/ann-benchmarks/get_dataset.py` will download and unpack the dataset in the directory +that the user provides. As of now, only million-scale datasets are supported by this +script. For more information, see [datasets and formats](ann_benchmarks_dataset.md).
+ +The usage of this script is: +```bash +usage: get_dataset.py [-h] [--name NAME] [--path PATH] [--normalize] + +options: + -h, --help show this help message and exit + --name NAME dataset to download (default: glove-100-angular) + --path PATH path to download dataset (default: {os.getcwd()}/data) + --normalize normalize cosine distance to inner product (default: False) +``` + +When option `normalize` is provided to the script, any dataset that has cosine distances +will be normalized to inner product. So, for example, the dataset `glove-100-angular` +will be written at location `data/glove-100-inner/`. + +#### Step 2: Build and Search Index +The script `scripts/ann-benchmarks/run.py` will build and search indices for a given dataset and its +specified configuration. +To configure which algorithms are available, we use `algos.yaml`. +To configure building/searching indices for a dataset, look at [index configuration](#json-index-config). +An entry in `algos.yaml` looks like: +```yaml +raft_ivf_pq: + executable: RAFT_IVF_PQ_ANN_BENCH + disabled: false +``` +`executable` : specifies the binary that will build/search the index. It is assumed to be +available in `raft/cpp/build/`. +`disabled` : denotes whether an algorithm should be excluded from benchmark runs. + +The usage of the script `scripts/ann-benchmarks/run.py` is: +```bash +usage: run.py [-h] --configuration CONFIGURATION [--build] [--search] [--algorithms ALGORITHMS] [--indices INDICES] [--force] + +options: + -h, --help show this help message and exit + --configuration CONFIGURATION + path to configuration file for a dataset (default: None) + --build + --search + --algorithms ALGORITHMS + run only comma separated list of named algorithms (default: None) + --indices INDICES run only comma separated list of named indices.
parameter `algorithms` is ignored (default: None) + --force re-run algorithms even if their results already exist (default: False) +``` + +`build` and `search` : if neither parameter is supplied to the script, then +both are assumed to be `True`. + +`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index +is available in `algos.yaml` and not disabled, as well as having an associated executable. + +#### Step 3: Evaluating Results +The script `scripts/ann-benchmarks/data_export.py` will evaluate results for a dataset whose index has been built +and searched with at least one algorithm. For every result file that is supplied to the script, the output +will be combined and written to a CSV file. + +The usage of this script is: +```bash +usage: data_export.py [-h] --output OUTPUT [--recompute] --groundtruth GROUNDTRUTH + +options: + -h, --help show this help message and exit + --output OUTPUT Path to the CSV output file (default: None) + --recompute Recompute metrics (default: False) + --groundtruth GROUNDTRUTH + Path to groundtruth.neighbors.ibin file for a dataset (default: None) +``` + +`result_filepaths` : whitespace delimited list of result files/directories that can be captured via pattern match. For more details, see [information and examples](ann_benchmarks_low_level.html#result-filepath-example). + +#### Step 4: Plot Results +The script `scripts/ann-benchmarks/plot.py` will plot all results evaluated to a CSV file for a given dataset. + +The usage of this script is: +```bash +usage: plot.py [-h] --result_csv RESULT_CSV [--output OUTPUT] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw] + +options: + -h, --help show this help message and exit + --result_csv RESULT_CSV + Path to CSV Results (default: None) + --output OUTPUT Path to the PNG output file (default: /home/nfs/dgala/raft/out.png) + --x-scale X_SCALE Scale to use when drawing the X-axis.
Typically linear, logit or a2 (default: linear) + --y-scale {linear,log,symlog,logit} + Scale to use when drawing the Y-axis (default: linear) + --raw Show raw results (not just Pareto frontier) in faded colours (default: False) +``` + +All algorithms present in the CSV file supplied to this script with parameter `result_csv` +will appear in the plot. + +## Adding a new ANN algorithm +### Implementation and Configuration +Implementation of a new algorithm should be a C++ class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. + +In addition, it should define two `struct`s for building and searching parameters. The searching parameter class should inherit `struct ANN::AnnSearchParam`. Taking `class HnswLib` as an example, its definition is: +```c++ +template<typename T> +class HnswLib : public ANN<T> { +public: + struct BuildParam { + int M; + int ef_construction; + int num_threads; + }; + + using typename ANN<T>::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int ef; + int num_threads; + }; + + // ... +}; +``` + +The benchmark program uses a JSON configuration file. To add the new algorithm to the benchmark, we need to be able to specify `build_param`, whose value is a JSON object, and `search_params`, whose value is an array of JSON objects, for this algorithm in the configuration file. Again taking the configuration for `HnswLib` as an example: +```json +{ + "name" : "...", + "algo" : "hnswlib", + "build_param": {"M":12, "efConstruction":500, "numThreads":32}, + "file" : "/path/to/file", + "search_params" : [ + {"ef":10, "numThreads":1}, + {"ef":20, "numThreads":1}, + {"ef":40, "numThreads":1} + ], + "search_result_file" : "/path/to/file" +}, +``` + +How to interpret these JSON objects is totally left to the implementation and should be specified in `cpp/bench/ann/src/factory.cuh`: +1.
First, add two functions for parsing a JSON object into `struct BuildParam` and `struct SearchParam`, respectively: + ```c++ + template<typename T> + void parse_build_param(const nlohmann::json& conf, + typename cuann::HnswLib<T>::BuildParam& param) { + param.ef_construction = conf.at("efConstruction"); + param.M = conf.at("M"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + + template<typename T> + void parse_search_param(const nlohmann::json& conf, + typename cuann::HnswLib<T>::SearchParam& param) { + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { + param.num_threads = conf.at("numThreads"); + } + } + ``` + +2. Next, add a corresponding `if` case to functions `create_algo()` and `create_search_param()` that calls these parsing functions. The string literal in the `if` condition must be the same as the value of `algo` in the configuration file. For example, + ```c++ + // JSON configuration file contains a line like: "algo" : "hnswlib" + if (algo == "hnswlib") { + // ... + } + ``` + +### Adding a CMake Target +In `raft/cpp/bench/ann/CMakeLists.txt`, we provide a `CMake` function to configure a new Benchmark target with the following signature: +``` +ConfigureAnnBench( + NAME <algo_name> + PATH </path/to/algo/benchmark/source/file> + INCLUDES <additional_include_directories> + CXXFLAGS <additional_cxx_flags> + LINKS <additional_link_library_targets> +) +``` + +To add a target for `HNSWLIB`, we would call the function as: +``` +ConfigureAnnBench( + NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib CXXFLAGS "${HNSW_CXX_FLAGS}" +) +``` + +This will create an executable called `HNSWLIB_ANN_BENCH`, which can then be used to run `HNSWLIB` benchmarks. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index baead41cca..a5d5fd7c4a 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License.
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake ) endif() diff --git a/img/raft-tech-stack-vss.png b/img/raft-tech-stack-vss.png new file mode 100644 index 0000000000..cb24f002ab Binary files /dev/null and b/img/raft-tech-stack-vss.png differ diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb new file mode 100644 index 0000000000..b3a15d3a08 --- /dev/null +++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb @@ -0,0 +1,628 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f5499b54", + "metadata": {}, + "source": [ + "\n", + "# Similar Questions Retrieval\n", + "\n", + "This notebook is inspired by the [similar search example of Sentence-Transformers](https://www.sbert.net/examples/applications/semantic-search/README.html#similar-questions-retrieval), and adapted to support [RAFT ANN](https://github.com/rapidsai/raft) algorithm.\n", + "\n", + "The model was pre-trained on the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions). It consists of about 100k real Google search queries, together with an annotated passage from Wikipedia that provides the answer. It is an example of an asymmetric search task. As corpus, we use the smaller [Simple English Wikipedia](http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz) so that it fits easily into memory.\n", + "\n", + "The steps to install the latest stable `pylibraft` package are available in the [documentation](https://docs.rapids.ai/api/raft/stable/build)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e8d55ede", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: sentence_transformers in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (2.2.2)\n", + "Requirement already satisfied: torch in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (2.0.1)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (4.31.0)\n", + "Requirement already satisfied: tqdm in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (4.65.0)\n", + "Requirement already satisfied: torchvision in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (0.15.2)\n", + "Requirement already satisfied: numpy in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (1.24.4)\n", + "Requirement already satisfied: scikit-learn in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (1.3.0)\n", + "Requirement already satisfied: scipy in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (1.11.1)\n", + "Requirement already satisfied: nltk in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (0.1.99)\n", + "Requirement already satisfied: huggingface-hub>=0.4.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sentence_transformers) (0.16.4)\n", + "Requirement already satisfied: filelock in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from 
torch) (3.12.2)\n", + "Requirement already satisfied: typing-extensions in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (4.7.1)\n", + "Requirement already satisfied: sympy in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (1.12)\n", + "Requirement already satisfied: networkx in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (3.1)\n", + "Requirement already satisfied: jinja2 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.7.99)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.7.99)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.7.101)\n", + "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (8.5.0.96)\n", + "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.10.3.66)\n", + "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (10.9.0.58)\n", + "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (10.2.10.91)\n", + "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.4.0.1)\n", + "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in 
/raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.7.4.91)\n", + "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (2.14.3)\n", + "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (11.7.91)\n", + "Requirement already satisfied: triton==2.0.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: setuptools in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (68.0.0)\n", + "Requirement already satisfied: wheel in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (0.41.0)\n", + "Requirement already satisfied: cmake in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from triton==2.0.0->torch) (3.27.0)\n", + "Requirement already satisfied: lit in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from triton==2.0.0->torch) (16.0.6)\n", + "Requirement already satisfied: fsspec in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n", + "Requirement already satisfied: requests in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.31.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n", + "Requirement already satisfied: packaging>=20.9 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (23.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in 
/raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.13.3)\n", + "Requirement already satisfied: safetensors>=0.3.1 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.3.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from jinja2->torch) (2.1.3)\n", + "Requirement already satisfied: click in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from nltk->sentence_transformers) (8.1.6)\n", + "Requirement already satisfied: joblib in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from nltk->sentence_transformers) (1.3.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (3.2.0)\n", + "Requirement already satisfied: mpmath>=0.19 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from sympy->torch) (1.3.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from torchvision->sentence_transformers) (10.0.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in 
/raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /raid/danteg/miniconda3/envs/raft-ann/lib/python3.10/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2023.7.22)\n" + ] + } + ], + "source": [ + "!pip install sentence_transformers torch\n", + "\n", + "# Note: if you have a Hopper based GPU, like an H100, use these to install:\n", + "# pip install torch --index-url https://download.pytorch.org/whl/cu118\n", + "# pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "eb1e81c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mon Jul 31 14:35:31 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 NVIDIA H100 80G... On | 00000000:1B:00.0 Off | 0 |\n", + "| N/A 30C P0 75W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 1 NVIDIA H100 80G... On | 00000000:43:00.0 Off | 0 |\n", + "| N/A 31C P0 72W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 2 NVIDIA H100 80G... 
On | 00000000:52:00.0 Off | 0 |\n", + "| N/A 34C P0 70W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 3 NVIDIA H100 80G... On | 00000000:61:00.0 Off | 0 |\n", + "| N/A 33C P0 70W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 4 NVIDIA H100 80G... On | 00000000:9D:00.0 Off | 0 |\n", + "| N/A 32C P0 74W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 5 NVIDIA H100 80G... On | 00000000:C3:00.0 Off | 0 |\n", + "| N/A 30C P0 73W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 6 NVIDIA H100 80G... On | 00000000:D1:00.0 Off | 0 |\n", + "| N/A 33C P0 73W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + "| 7 NVIDIA H100 80G... 
On | 00000000:DF:00.0 Off | 0 |\n", + "| N/A 35C P0 73W / 700W | 0MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ee4c5cc0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/raid/danteg/miniconda3/envs/raftann/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json\n", + "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n", + "import time\n", + "import gzip\n", + "import os\n", + "import torch\n", + "import pylibraft\n", + "from pylibraft.neighbors import ivf_flat, ivf_pq\n", + "pylibraft.config.set_output_as(lambda device_ndarray: device_ndarray.copy_to_host())\n", + "\n", + "if not torch.cuda.is_available():\n", + " print(\"Warning: No GPU found. 
Please add GPU to your notebook\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0a1a6307", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Passages: 509663\n" + ] + } + ], + "source": [ + "# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n", + "model_name = 'nq-distilbert-base-v1'\n", + "bi_encoder = SentenceTransformer(model_name)\n", + "\n", + "# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only\n", + "# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder\n", + "\n", + "wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'\n", + "\n", + "if not os.path.exists(wikipedia_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n", + "\n", + "passages = []\n", + "with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:\n", + " for line in fIn:\n", + " data = json.loads(line.strip())\n", + " for paragraph in data['paragraphs']:\n", + " # We encode the passages as [title, text]\n", + " passages.append([data['title'], paragraph])\n", + "\n", + "# If you like, you can also limit the number of passages you want to use\n", + "print(\"Passages:\", len(passages))\n", + "\n", + "# To speed things up, pre-computed embeddings are downloaded.\n", + "# The provided file encoded the passages with the model 'nq-distilbert-base-v1'\n", + "if model_name == 'nq-distilbert-base-v1':\n", + " embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'\n", + " if not os.path.exists(embeddings_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt', embeddings_filepath)\n", + "\n", + " corpus_embeddings = torch.load(embeddings_filepath)\n", + " corpus_embeddings = corpus_embeddings.float() # Convert embedding file to float\n", + " if 
torch.cuda.is_available():\n", + " corpus_embeddings = corpus_embeddings.to('cuda')\n", + "else: # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)\n", + " corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1f4e9b9d", + "metadata": {}, + "source": [ + "# Vector Search using RAPIDS RAFT\n", + "Now that our embeddings are ready to be indexed and the model has been loaded, we can use RAPIDS RAFT to do our vector search.\n", + "\n", + "This is done in two steps: first we build the index, then we search it.\n", + "With `pylibraft` all you need is these four Python lines:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ad90b4be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[W] [14:35:48.810785] [raft::ivf_pq::build] the default cuda resource is used for the raft workspace allocations. This may lead to a significant slowdown for this algorithm. Consider using the default pool resource (`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly (`raft::resource::set_workspace_resource`).\n", + "[W] [14:35:53.831753] [raft::ivf_pq::extend] the default cuda resource is used for the raft workspace allocations. This may lead to a significant slowdown for this algorithm. 
Consider using the default pool resource (`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly (`raft::resource::set_workspace_resource`).\n", + "CPU times: user 2.21 s, sys: 2.49 s, total: 4.7 s\n", + "Wall time: 5.13 s\n" + ] + } + ], + "source": [ + "%%time\n", + "params = ivf_pq.IndexParams(n_lists=150, pq_dim=96)\n", + "pq_index = ivf_pq.build(params, corpus_embeddings)\n", + "search_params = ivf_pq.SearchParams()\n", + "\n", + "def search_raft_pq(query, top_k = 5):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " hits = ivf_pq.search(search_params, pq_index, question_embedding[None], top_k)\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][0, k], passages[hits[1][0, k]]))" + ] + }, + { + "cell_type": "markdown", + "id": "07935bca", + "metadata": {}, + "source": [ + "For IVF-PQ we want to reduce the memory footprint while keeping a good accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "724dcacb", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IVF-PQ memory footprint: 373.3 MB\n", + "Original dataset: 1493.2 MB\n", + "Memory saved: 75.0%\n" + ] + } + ], + "source": [ + "pq_index_mem = pq_index.pq_dim * pq_index.size * pq_index.pq_bits\n", + "print(\"IVF-PQ memory footprint: {:.1f} MB\".format(pq_index_mem / 2**20))\n", + "\n", + "original_mem = corpus_embeddings.shape[0] * corpus_embeddings.shape[1] * 4\n", + "print(\"Original dataset: {:.1f} MB\".format(original_mem / 2**20))\n", + "\n", + "print(\"Memory saved: {:.1f}%\".format(100 * (1 - pq_index_mem / original_mem)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c27d4715", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[W] [14:36:07.640223] [raft::ivf_pq::search] the default cuda resource is used for the raft workspace allocations. This may lead to a significant slowdown for this algorithm. Consider using the default pool resource (`raft::resource::set_workspace_to_pool_resource`) or set your own resource explicitly (`raft::resource::set_workspace_resource`).\n", + "Input question: Who was Grace Hopper?\n", + "\t190.855\t['Leona Helmsley', 'Leona Helmsley (July 4, 1920 – August 20, 2007) was an American businesswoman. She was known for having a flamboyant personality. She had a reputation for tyrannical behavior; she was nicknamed the Queen of Mean.']\n", + "\t195.364\t['Grace Hopper', 'Hopper was born in New York, USA. Hopper graduated from Vassar College in 1928 and Yale University in 1934 with a Ph.D degree in mathematics. She joined the US Navy during the World War II in 1943. She worked on computers in the Navy for 43 years. She then worked in other private industry companies after 1949. 
She retired from the Navy in 1986 and died on January 1, 1992.']\n", + "\t202.536\t['Anita Borg', 'Anita Borg (January 17, 1949 – April 6, 2003) was an American computer scientist. She founded the Institute for Women and Technology and the Grace Hopper Celebration of Women in Computing.']\n", + "\t203.717\t['Brett Butler', 'Brett Butler (born January 30, 1958) is an American actress and stand-up comedian. She is best known for playing Grace in the sitcom \"Grace Under Fire\". She has also done other television programs and comedy acts.']\n", + "\t203.991\t['Nellie Bly', 'Elizabeth Cochrane Seaman (born Elizabeth Jane Cochran; May 5, 1864 – January 27, 1922), better known by her pen name Nellie Bly, was an American journalist, novelist and inventor. She was a newspaper reporter, who worked at various jobs for exposing poor working conditions. Nellie Bly, also, fought for women\\'s right and was known for investigative reporting. She best known for her record-breaking trip around the world in 72 days, inspired by the adventure novel \"Around the World in Eighty Days\" by Jules Verne. In the 1880s, she went undercover as a mentally ill patient in a psychiatric hospital for ten days, with the report being made public in a book called \"\"Ten Days in a Mad-House\"\". She was added to the National Women\\'s Hall of Fame in 1998.']\n", + "CPU times: user 98.3 ms, sys: 81.2 ms, total: 180 ms\n", + "Wall time: 120 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_pq(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bc375518", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input question: Who was Alan Turing?\n", + "\t139.827\t['Alan Turing', 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. 
He was born in Maida Vale, London.']\n", + "\t169.849\t['William Kahan', 'William Morton Kahan (born June 5, 1933) is a Canadian mathematician and computer scientist. He received the Turing Award in 1989 for \"\"his fundamental contributions to numerical analysis\".\" He was named an ACM Fellow in 1994, and added to the National Academy of Engineering in 2005.']\n", + "\t177.520\t['Rolf Noskwith', 'Rolf Noskwith (19 June 1919 – 3 January 2017) was a British businessman. During the Second World War, he worked under Alan Turing as a cryptographer at the British military base Bletchley Park in Milton Keynes, Buckinghamshire.']\n", + "\t179.202\t['Marvin Minsky', \"Marvin Lee Minsky (August 9, 1927 – January 24, 2016) was an American cognitive scientist in the field of artificial intelligence (AI). He was the co-founder of the Massachusetts Institute of Technology's AI laboratory, and author of several texts on AI and philosophy. He won the Turing Award in 1969.\"]\n", + "\t179.819\t['Edsger W. Dijkstra', 'Edsger Wybe Dijkstra (May 11, 1930 – August 6, 2002; ) was a Dutch computer scientist. He received the 1972 Turing Award for fundamental contributions to developing programming languages, and was the Schlumberger Centennial Chair of Computer Sciences at The University of Texas at Austin from 1984 until 2000.']\n", + "CPU times: user 4.89 ms, sys: 7.52 ms, total: 12.4 ms\n", + "Wall time: 12 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_pq(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ab154181", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input question: What is creating tides?\n", + "\t125.037\t['Tide', \"A tide is the periodic rising and falling of Earth's ocean surface caused mainly by the gravitational pull of the Moon acting on the oceans. Tides cause changes in the depth of marine and estuarine (river mouth) waters. 
Tides also make oscillating currents known as tidal streams (~'rip tides'). This means that being able to predict the tide is important for coastal navigation. The strip of seashore that is under water at high tide and exposed at low tide, called the intertidal zone, is an important ecological product of ocean tides.\"]\n", + "\t163.835\t['Tidal energy', \"Many things affect tides. The pull of the Moon is the largest effect, and most of the energy comes from the slowing of the Earth's spin.\"]\n", + "\t167.368\t['Storm surge', 'A storm surge is a sudden rise of water hitting areas close to the coast. Storm surges are usually created by a hurricane or other tropical cyclone. The surge happens because a storm has fast winds and low atmospheric pressure. Water is pushed on shore, and the water level rises. Strong storm surges can flood coastal towns and destroy homes. A storm surge is considered the deadliest part of a hurricane. They kill many people each year.']\n", + "\t177.143\t['Tidal force', 'Tidal force is caused by gravity and makes tides happen. This is because the gravitational field changes across the middle of a body (the diameter).']\n", + "\t186.108\t['Tsunami', \"A tsunami is a natural disaster which is a series of fast-moving waves in the ocean caused by powerful earthquakes, volcanic eruptions, landslides, or simply an asteroid or a meteor crash inside the ocean. A tsunami has a very long wavelength. It can be hundreds of kilometers long. Usually, a tsunami starts suddenly. The waves travel at a great speed across an ocean with little energy loss. They can remove sand from beaches, destroy trees, toss and drag vehicles, houses and even destroy whole towns. Tsunamis can even be caused when a meteorite strikes the earth's surface, though it is very rare. 
A tsunami normally occurs in the Pacific Ocean, especially in what is called the ring of fire, but can occur in any large body of water.\"]\n", + "CPU times: user 4.44 ms, sys: 4.65 ms, total: 9.09 ms\n", + "Wall time: 12.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_pq(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2d6017ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 208 ms, sys: 63.8 ms, total: 271 ms\n", + "Wall time: 286 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "params = ivf_flat.IndexParams(n_lists=150)\n", + "flat_index = ivf_flat.build(params, corpus_embeddings)\n", + "search_params = ivf_flat.SearchParams()\n", + "\n", + "def search_raft_flat(query, top_k = 5):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + " \n", + " start_time = time.time()\n", + " hits = ivf_flat.search(search_params, flat_index, question_embedding[None], top_k)\n", + " end_time = time.time()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} seconds):\".format(end_time - start_time))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][0, k], passages[hits[1][0, k]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f5cfb644", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input question: Who was Grace Hopper?\n", + "Results (after 0.002 seconds):\n", + "\t181.650\t['Grace Hopper', 'Hopper was born in New York, USA. Hopper graduated from Vassar College in 1928 and Yale University in 1934 with a Ph.D degree in mathematics. She joined the US Navy during the World War II in 1943. She worked on computers in the Navy for 43 years. 
She then worked in other private industry companies after 1949. She retired from the Navy in 1986 and died on January 1, 1992.']\n", + "\t192.946\t['Leona Helmsley', 'Leona Helmsley (July 4, 1920 – August 20, 2007) was an American businesswoman. She was known for having a flamboyant personality. She had a reputation for tyrannical behavior; she was nicknamed the Queen of Mean.']\n", + "\t194.951\t['Grace Hopper', 'Grace Murray Hopper (December 9 1906 – January 1 1992) was an American computer scientist and United States Navy officer.']\n", + "\t202.192\t['Nellie Bly', 'Elizabeth Cochrane Seaman (born Elizabeth Jane Cochran; May 5, 1864 – January 27, 1922), better known by her pen name Nellie Bly, was an American journalist, novelist and inventor. She was a newspaper reporter, who worked at various jobs for exposing poor working conditions. Nellie Bly, also, fought for women\\'s right and was known for investigative reporting. She best known for her record-breaking trip around the world in 72 days, inspired by the adventure novel \"Around the World in Eighty Days\" by Jules Verne. In the 1880s, she went undercover as a mentally ill patient in a psychiatric hospital for ten days, with the report being made public in a book called \"\"Ten Days in a Mad-House\"\". 
She was added to the National Women\\'s Hall of Fame in 1998.']\n", + "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n", + "CPU times: user 6.48 ms, sys: 0 ns, total: 6.48 ms\n", + "Wall time: 6.22 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_flat(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b5694d00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input question: Who was Alan Turing?\n", + "Results (after 0.002 seconds):\n", + "\t106.131\t['Alan Turing', 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.']\n", + "\t158.646\t['William Kahan', 'William Morton Kahan (born June 5, 1933) is a Canadian mathematician and computer scientist. He received the Turing Award in 1989 for \"\"his fundamental contributions to numerical analysis\".\" He was named an ACM Fellow in 1994, and added to the National Academy of Engineering in 2005.']\n", + "\t165.094\t['Alan Turing', 'A brilliant mathematician and cryptographer Alan was to become the founder of modern-day computer science and artificial intelligence; designing a machine at Bletchley Park to break secret Enigma encrypted messages used by the Nazi German war machine to protect sensitive commercial, diplomatic and military communications during World War 2. Thus, Turing made the single biggest contribution to the Allied victory in the war against Nazi Germany, possibly saving the lives of an estimated 2 million people, through his effort in shortening World War II.']\n", + "\t167.321\t['Rolf Noskwith', 'Rolf Noskwith (19 June 1919 – 3 January 2017) was a British businessman. 
During the Second World War, he worked under Alan Turing as a cryptographer at the British military base Bletchley Park in Milton Keynes, Buckinghamshire.']\n", + "\t176.480\t['Marvin Minsky', \"Marvin Lee Minsky (August 9, 1927 – January 24, 2016) was an American cognitive scientist in the field of artificial intelligence (AI). He was the co-founder of the Massachusetts Institute of Technology's AI laboratory, and author of several texts on AI and philosophy. He won the Turing Award in 1969.\"]\n", + "CPU times: user 4.81 ms, sys: 1.19 ms, total: 6 ms\n", + "Wall time: 6.06 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_flat(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fcfc3c5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input question: What is creating tides?\n", + "Results (after 0.002 seconds):\n", + "\t94.909\t['Tide', \"A tide is the periodic rising and falling of Earth's ocean surface caused mainly by the gravitational pull of the Moon acting on the oceans. Tides cause changes in the depth of marine and estuarine (river mouth) waters. Tides also make oscillating currents known as tidal streams (~'rip tides'). This means that being able to predict the tide is important for coastal navigation. The strip of seashore that is under water at high tide and exposed at low tide, called the intertidal zone, is an important ecological product of ocean tides.\"]\n", + "\t159.539\t['Tidal energy', \"Many things affect tides. The pull of the Moon is the largest effect, and most of the energy comes from the slowing of the Earth's spin.\"]\n", + "\t159.740\t['Storm surge', 'A storm surge is a sudden rise of water hitting areas close to the coast. Storm surges are usually created by a hurricane or other tropical cyclone. The surge happens because a storm has fast winds and low atmospheric pressure. 
Water is pushed on shore, and the water level rises. Strong storm surges can flood coastal towns and destroy homes. A storm surge is considered the deadliest part of a hurricane. They kill many people each year.']\n", + "\t178.283\t['Sea', 'Wind blowing over the surface of a body of water forms waves. The friction between air and water caused by a gentle breeze on a pond causes ripples to form. A strong blow over the ocean causes larger waves as the moving air pushes against the raised ridges of water. The waves reach their greatest height when the rate at which they travel nearly matches the speed of the wind. The waves form at right angles to the direction from which the wind blows. In open water, if the wind continues to blow, as happens in the Roaring Forties in the southern hemisphere, long, organized masses of water called swell roll across the ocean. If the wind dies down, the wave formation is reduced but waves already formed continue to travel in their original direction until they meet land. Small waves form in small areas of water with islands and other landmasses but large waves form in open stretches of sea where the wind blows steadily and strongly. When waves meet other waves coming from different directions, interference between the two can produce broken, irregular seas.']\n", + "\t181.498\t['Tidal force', 'Tidal force is caused by gravity and makes tides happen. 
This is because the gravitational field changes across the middle of a body (the diameter).']\n", + "CPU times: user 5.91 ms, sys: 0 ns, total: 5.91 ms\n", + "Wall time: 5.65 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "search_raft_flat(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "markdown", + "id": "a59d7b32-0832-4c3a-864e-aeb2e6e7fe1f", + "metadata": {}, + "source": [ + "## Using CAGRA: GPU graph-based Vector Search\n", + "\n", + "CAGRA is a graph-based nearest neighbors implementation with state-of-the-art query performance for both small- and large-batch vector searches. \n", + "\n", + "CAGRA follows the same two-step APIs as IVF-FLAT and IVF-PQ in RAFT. First we build the index:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "50df1f43-c580-4019-949a-06bdc7185536", + "metadata": {}, + "outputs": [], + "source": [ + "from pylibraft.neighbors import cagra" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "091cde52-4652-4230-af2b-75c35357f833", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 23s, sys: 2min 7s, total: 3min 31s\n", + "Wall time: 4min 43s\n" + ] + } + ], + "source": [ + "%%time\n", + "params = cagra.IndexParams(intermediate_graph_degree=128, graph_degree=64)\n", + "cagra_index = cagra.build(params, corpus_embeddings)\n", + "search_params = cagra.SearchParams()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "df229e21-f6b6-4d6c-ad54-2724f8738934", + "metadata": {}, + "outputs": [], + "source": [ + "def search_raft_cagra(query, top_k = 5):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " hits = cagra.search(search_params, cagra_index, question_embedding[None], top_k)\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + "
for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][0, k], passages[hits[1][0, k]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b5e862fd-b7e5-4423-8fbf-36918f02c8f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 16 µs, sys: 25 µs, total: 41 µs\n", + "Wall time: 83.7 µs\n", + "Input question: Who was Grace Hopper?\n", + "\t181.649\t['Grace Hopper', 'Hopper was born in New York, USA. Hopper graduated from Vassar College in 1928 and Yale University in 1934 with a Ph.D degree in mathematics. She joined the US Navy during the World War II in 1943. She worked on computers in the Navy for 43 years. She then worked in other private industry companies after 1949. She retired from the Navy in 1986 and died on January 1, 1992.']\n", + "\t192.946\t['Leona Helmsley', 'Leona Helmsley (July 4, 1920 – August 20, 2007) was an American businesswoman. She was known for having a flamboyant personality. She had a reputation for tyrannical behavior; she was nicknamed the Queen of Mean.']\n", + "\t194.951\t['Grace Hopper', 'Grace Murray Hopper (December 9 1906 – January 1 1992) was an American computer scientist and United States Navy officer.']\n", + "\t202.192\t['Nellie Bly', 'Elizabeth Cochrane Seaman (born Elizabeth Jane Cochran; May 5, 1864 – January 27, 1922), better known by her pen name Nellie Bly, was an American journalist, novelist and inventor. She was a newspaper reporter, who worked at various jobs for exposing poor working conditions. Nellie Bly, also, fought for women\\'s right and was known for investigative reporting. She best known for her record-breaking trip around the world in 72 days, inspired by the adventure novel \"Around the World in Eighty Days\" by Jules Verne. In the 1880s, she went undercover as a mentally ill patient in a psychiatric hospital for ten days, with the report being made public in a book called \"\"Ten Days in a Mad-House\"\". 
She was added to the National Women\\'s Hall of Fame in 1998.']\n", + "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n" + ] + } + ], + "source": [ + "%time \n", + "search_raft_cagra(query=\"Who was Grace Hopper?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/tutorial_ivf_pq.ipynb b/notebooks/tutorial_ivf_pq.ipynb new file mode 100644 index 0000000000..6aa8cd6495 --- /dev/null +++ b/notebooks/tutorial_ivf_pq.ipynb @@ -0,0 +1,1385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAFT IVF-PQ tutorial\n", + "In this tutorial you will learn to build an IVF-PQ index and use it to search for approximate nearest neighbors (ANN).\n", + "We will start with a brief overview of the functionality, but then dive into details to gain an understanding of the model parameters.\n", + "Along the way, we will benchmark the model and give some practical recommendations on how to maximize its performance for various use cases.\n", + "\n", + "This tutorial uses data from the [ANN benchmarks website](https://ann-benchmarks.com)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting adjustText\n", + " Downloading adjustText-0.8-py3-none-any.whl (9.1 kB)\n", + "Collecting h5py\n", + " Downloading h5py-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m46.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting matplotlib\n", + " Downloading matplotlib-3.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m97.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from adjustText) (1.24.4)\n", + "Collecting contourpy>=1.0.1 (from matplotlib)\n", + " Downloading contourpy-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m300.4/300.4 kB\u001b[0m \u001b[31m86.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting cycler>=0.10 (from matplotlib)\n", + " Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)\n", + "Collecting fonttools>=4.22.0 (from matplotlib)\n", + " Downloading fonttools-4.41.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m115.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting kiwisolver>=1.0.1 (from matplotlib)\n", 
+ " Downloading kiwisolver-1.4.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m119.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from matplotlib) (10.0.0)\n", + "Collecting pyparsing<3.1,>=2.3.1 (from matplotlib)\n", + " Downloading pyparsing-3.0.9-py3-none-any.whl (98 kB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.3/98.3 kB\u001b[0m \u001b[31m43.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n", + "Collecting importlib-resources>=3.2.0 (from matplotlib)\n", + " Downloading importlib_resources-6.0.0-py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: zipp>=3.1.0 in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.15.0)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/envs/cuml_dev/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "Installing collected packages: pyparsing, kiwisolver, importlib-resources, h5py, fonttools, cycler, contourpy, matplotlib, adjustText\n", + "Successfully installed adjustText-0.8 contourpy-1.1.0 cycler-0.11.0 fonttools-4.41.1 h5py-3.9.0 importlib-resources-6.0.0 kiwisolver-1.4.4 matplotlib-3.7.2 pyparsing-3.0.9\n" + ] + } + ], + "source": [ + "!pip install adjustText h5py matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import 
tempfile\n", + "import cupy as cp\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import rmm\n", + "import urllib.request\n", + "import h5py\n", + "\n", + "from rmm.allocators.cupy import rmm_cupy_allocator\n", + "from pylibraft.common import DeviceResources\n", + "from pylibraft.neighbors import ivf_pq, refine\n", + "from adjustText import adjust_text\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# A clumsy helper for inspecting properties of an object\n", + "def show_properties(obj):\n", + " return {\n", + " attr: getattr(obj, attr)\n", + " for attr in dir(obj)\n", + " if type(getattr(type(obj), attr)).__name__ == 'getset_descriptor'\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The index and data will be saved in /tmp/raft_ivf_pq_tutorial\n" + ] + } + ], + "source": [ + "# We'll need to load store some data in this tutorial\n", + "WORK_FOLDER = os.path.join(tempfile.gettempdir(), 'raft_ivf_pq_tutorial')\n", + "\n", + "if not os.path.exists(WORK_FOLDER):\n", + " os.makedirs(WORK_FOLDER)\n", + "print(\"The index and data will be saved in\", WORK_FOLDER)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fri Jul 28 08:21:25 2023 \n", + "+---------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 535.49 Driver Version: 535.49 CUDA Version: 12.2 |\n", + "|-----------------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. 
|\n", + "|=========================================+======================+======================|\n", + "| 0 NVIDIA H100 PCIe On | 00000000:41:00.0 Off | 0 |\n", + "| N/A 34C P0 46W / 350W | 4MiB / 81559MiB | 0% Default |\n", + "| | | Disabled |\n", + "+-----------------------------------------+----------------------+----------------------+\n", + " \n", + "+---------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=======================================================================================|\n", + "| No running processes found |\n", + "+---------------------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "# Report the GPU in use to put the measurements into perspective\n", + "!nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use the pool memory resource\n", + "RAFT uses RMM allocator widely across its algorithms, including the performance-sensitive parts like IVF-PQ search.\n", + "It's strongly advised to set up the RMM pool memory resource to minimize the overheads of repeated CUDA allocations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "pool = rmm.mr.PoolMemoryResource(\n", + " rmm.mr.CudaMemoryResource(),\n", + " initial_pool_size=2**30\n", + ")\n", + "rmm.mr.set_current_device_resource(pool)\n", + "cp.cuda.set_allocator(rmm_cupy_allocator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data\n", + "The [ANN benchmarks website](https://ann-benchmarks.com) provides the datasets in [HDF5 format](https://www.hdfgroup.org/solutions/hdf5/).\n", + "\n", + "The list of prepared datasets can be found at https://github.com/erikbern/ann-benchmarks/#data-sets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + 
"metadata": {}, + "outputs": [], + "source": [ + "DATASET_URL = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n", + "DATASET_FILENAME = DATASET_URL.split('/')[-1]\n", + "\n", + "## download the dataset\n", + "dataset_path = os.path.join(WORK_FOLDER, DATASET_FILENAME)\n", + "if not os.path.exists(dataset_path):\n", + " urllib.request.urlretrieve(DATASET_URL, dataset_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded dataset of size (1000000, 128); metric: 'euclidean'.\n", + "Number of test queries: 10000\n" + ] + } + ], + "source": [ + "f = h5py.File(dataset_path, \"r\")\n", + "\n", + "metric = f.attrs['distance']\n", + "\n", + "dataset = cp.array(f['train'])\n", + "queries = cp.array(f['test'])\n", + "gt_neighbors = cp.array(f['neighbors'])\n", + "gt_distances = cp.array(f['distances'])\n", + "\n", + "print(f\"Loaded dataset of size {dataset.shape}; metric: '{metric}'.\")\n", + "print(f\"Number of test queries: {queries.shape[0]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the index\n", + "Construction of the index generally consists of two phases: training (building the clusters) and filling-in (extending the index with data).\n", + "In the first phase, a balanced hierarchical k-means algorithm clusters the training data.\n", + "In the second phase, the new data is classified and added into the appropriate clusters in the index.\n", + "Hence, a user should call `ivf_pq.build` once and then possibly `ivf_pq.extend` several times.\n", + "Though for user convenience `ivf_pq.build` by default adds the whole training set into the index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# RAFT's DeviceResources controls the GPU, cuda stream, memory policies etc.\n", + "# For now, we just create a default instance.\n", + "resources = DeviceResources()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'add_data_on_build': True,\n", + " 'codebook_kind': 0,\n", + " 'conservative_memory_allocation': False,\n", + " 'force_random_rotation': False,\n", + " 'kmeans_n_iters': 20,\n", + " 'kmeans_trainset_fraction': 0.5,\n", + " 'metric': 1,\n", + " 'n_lists': 1024,\n", + " 'pq_bits': 8,\n", + " 'pq_dim': 64}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First, we need to initialize the build/indexing parameters.\n", + "# One of the more important parameters is the product quantisation (PQ) dim.\n", + "# Effectively, this parameter says\n", + "# \"shrink the dataset to this dimensionality to reduce the index size\".\n", + "# It must be not bigger than the dataset dim,\n", + "# and it should be divisible by 32 for better GPU performance.\n", + "pq_dim = 1\n", + "while pq_dim * 2 < dataset.shape[1]:\n", + " pq_dim = pq_dim * 2\n", + "# We'll use the ANN-benchmarks-provided metric and sensible defaults for the rest of parameters.\n", + "index_params = ivf_pq.IndexParams(n_lists=1024, metric=metric, pq_dim=pq_dim)\n", + "\n", + "show_properties(index_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.71 s, sys: 16.2 ms, total: 1.72 s\n", + "Wall time: 1.71 s\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(type=IVF-PQ, metric=euclidean, codebook=subspace, size=1000000, dim=128, pq_dim=64, pq_bits=8, n_lists=1024, rot_dim=128)" + ] + }, + "execution_count": 11, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "## Build the index\n", + "# This function takes a row-major either numpy or cupy (GPU) array.\n", + "# Generally, it's a bit faster with GPU inputs, but the CPU version may come in handy\n", + "# if the whole dataset cannot fit into GPU memory.\n", + "index = ivf_pq.build(index_params, dataset, handle=resources)\n", + "# This function is asynchronous so we need to explicitly synchronize the GPU before we can measure the execution time\n", + "resources.sync()\n", + "index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index serialization\n", + "For bigger datasets, building an index can take some time. To avoid building the index from scratch every time you need it, you can save it to a file. Here is how this works:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 89.7 ms, sys: 56 ms, total: 146 ms\n", + "Wall time: 145 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(type=IVF-PQ, metric=euclidean, codebook=subspace, size=1000000, dim=128, pq_dim=64, pq_bits=8, n_lists=1024, rot_dim=128)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "index_filepath = os.path.join(WORK_FOLDER, \"ivf_pq.bin\")\n", + "ivf_pq.save(index_filepath, index) \n", + "loaded_index = ivf_pq.load(index_filepath)\n", + "resources.sync()\n", + "index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search\n", + "The search function returns the requested number `k` of (approximate) nearest neighbor in no particular order.\n", + "Besides the queries and `k`, the function can take a few more parameters to tweak the performance of the algorithm.\n", + "Again, these are passed via the struct with some sensible defaults." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'internal_distance_dtype': 0, 'lut_dtype': 0, 'n_probes': 20}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "k = 10\n", + "search_params = ivf_pq.SearchParams()\n", + "show_properties(search_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 19.9 ms, sys: 12.3 ms, total: 32.2 ms\n", + "Wall time: 31.5 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "distances, neighbors = ivf_pq.search(search_params, index, queries, k, handle=resources)\n", + "# Sync the GPU to make sure we've got the timing right\n", + "resources.sync()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Measuring the quality of the predictions\n", + "We use [recall](https://en.wikipedia.org/wiki/Precision_and_recall) to measure the quality of the prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got recall = 0.85409 with the default parameters (k = 10).\n" + ] + } + ], + "source": [ + "## Check the quality of the prediction (recall)\n", + "def calc_recall(found_indices, ground_truth):\n", + " found_indices = cp.asarray(found_indices)\n", + " bs, k = found_indices.shape\n", + " if bs != ground_truth.shape[0]:\n", + " raise RuntimeError(\n", + " \"Batch sizes do not match {} vs {}\".format(\n", + " bs, ground_truth.shape[0])\n", + " )\n", + " if k > ground_truth.shape[1]:\n", + " raise RuntimeError(\n", + " \"Not enough indices in the ground truth ({} > {})\".format(\n", + " k, ground_truth.shape[1])\n", + " )\n", + " n = 0\n", + " # Go over the batch\n", + " for i in range(bs):\n", + " # Note, ivf-pq does not guarantee the ordered input, hence the use of intersect1d\n", + " n += cp.intersect1d(found_indices[i, :k], ground_truth[i, :k]).size\n", + " recall = n / found_indices.size\n", + " return recall\n", + "\n", + "recall_first_try = calc_recall(neighbors, gt_neighbors)\n", + "print(f\"Got recall = {recall_first_try} with the default parameters (k = {k}).\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Refine\n", + "Let's improve our results a little bit!\n", + "The refinement operation follows an approximate NN search.\n", + "It recomputes the exact distances for the already selected candidates and selects a subset of them thus improving the recall." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 193 ms, sys: 142 µs, total: 193 ms\n", + "Wall time: 191 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "candidates = ivf_pq.search(search_params, index, queries, k * 2, handle=resources)[1]\n", + "distances, neighbors = refine(dataset, queries, candidates, k, handle=resources)\n", + "resources.sync()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got recall = 0.94953 with 2x refinement (k = 10).\n" + ] + } + ], + "source": [ + "recall_refine2x = calc_recall(neighbors, gt_neighbors)\n", + "print(f\"Got recall = {recall_refine2x} with 2x refinement (k = {k}).\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tweaking search parameters\n", + "Before diving deep into tweaking the model, let's quickly define the performance metrics.\n", + "As we've mentioned earlier, we use the recall to measure the quality of prediction.\n", + "The other important metric is the speed of the search.\n", + "We measure the speed in terms of queries per second (QPS).\n", + "\n", + "Most of the time, by changing the model parameters we balance the trade-off between the QPS and the recall." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Number of neighbors\n", + "Let's see how QPS depends on `k`. " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16.5 ms ± 13.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "17 ms ± 2.12 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "17.5 ms ± 2.92 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "18 ms ± 3.05 µs per loop (mean ± std. dev.
of 7 runs, 100 loops each)\n", + "18.7 ms ± 4.25 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "23.4 ms ± 45.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "25.9 ms ± 5.49 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "40.2 ms ± 12.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "23.6 ms ± 26.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "28.7 ms ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bench_k = np.exp2(np.arange(10)).astype(np.int32)\n", + "bench_avg = np.zeros_like(bench_k, dtype=np.float32)\n", + "bench_std = np.zeros_like(bench_k, dtype=np.float32)\n", + "for i, k in enumerate(bench_k):\n", + " r = %timeit -o ivf_pq.search(search_params, index, queries, k, handle=resources); resources.sync()\n", + " bench_avg[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", + " bench_std[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).std()\n", + "\n", + "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", + "ax.errorbar(bench_k, bench_avg, bench_std)\n", + "ax.set_xscale('log')\n", + "ax.set_xticks(bench_k, bench_k)\n", + "ax.set_xlabel('k')\n", + "ax.grid()\n", + "ax.set_ylabel('QPS');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Number of probes\n", + "IVF-PQ search runs in two phases; first it looks for nearest clusters,\n", + "then it searches for the neighbors in every selected cluster.\n", + "\n", + "We can set how many clusters we want to inspect.\n", + "For this, `ivf_pq.SearchParams` has a parameter `n_probes`.\n", + "This is the core parameter to control the QPS/recall trade-off." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.67 ms ± 3.91 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "4.78 ms ± 1.74 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "6.65 ms ± 3.72 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "10.2 ms ± 4.86 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "17.2 ms ± 14.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "60.2 ms ± 16.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "115 ms ± 41.2 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "222 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "430 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "829 ms ± 162 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "1.6 s ± 354 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "bench_probes = np.exp2(np.arange(11)).astype(np.int32)\n", + "bench_qps = np.zeros_like(bench_probes, dtype=np.float32)\n", + "bench_recall = np.zeros_like(bench_probes, dtype=np.float32)\n", + "k = 100\n", + "for i, n_probes in enumerate(bench_probes):\n", + " sp = ivf_pq.SearchParams(n_probes=n_probes)\n", + " r = %timeit -o ivf_pq.search(sp, index, queries, k, handle=resources); resources.sync()\n", + " bench_qps[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", + " bench_recall[i] = calc_recall(ivf_pq.search(sp, index, queries, k, handle=resources)[1], gt_neighbors)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's clear that the search time scales almost linearly with the number of probes.\n", + "This is due to the algorithm spending most of the time in the second phase scanning through individual clusters.\n", + "Thanks to the balanced nature of the clustering k-means algorithm, the sizes of the clusters are roughly similar;\n", + "hence the linear relation `n_probes` ~ query time.\n", + "\n", + "Let's draw some plots to illustrate how the number of probes affects QPS and recall." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 3, figsize=plt.figaspect(1/4))\n", + "\n", + "ax[0].plot(bench_probes, bench_recall)\n", + "ax[0].set_xscale('log')\n", + "ax[0].set_xticks(bench_probes, bench_probes)\n", + "ax[0].set_xlabel('n_probes')\n", + "ax[0].set_ylabel('recall')\n", + "ax[0].grid()\n", + "\n", + "ax[1].plot(bench_probes, bench_qps)\n", + "ax[1].set_xscale('log')\n", + "ax[1].set_xticks(bench_probes, bench_probes)\n", + "ax[1].set_xlabel('n_probes')\n", + "ax[1].set_ylabel('QPS')\n", + "ax[1].set_yscale('log')\n", + "ax[1].grid()\n", + "\n", + "ax[2].plot(bench_recall, bench_qps)\n", + "ax[2].set_xlabel('recall')\n", + "ax[2].set_ylabel('QPS')\n", + "ax[2].set_yscale('log')\n", + "ax[2].grid();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Internal search types\n", + "Besides `n_probes`, `ivf_pq.SearchParams` contains a couple more parameters, which affect the internal workings of the algorithm.\n", + "\n", + "`internal_distance_dtype` controls the representation of the distance/similarity during the search.\n", + "By default, it's `np.float32`, but you can change it to `np.float16` when appropriate to save the memory bandwidth.\n", + "This can be a good idea when the dataset type is low precision anyway (e.g. 
`np.uint8`),\n", + "yet it may help with 32-bit float datasets too.\n", + "\n", + "`lut_dtype` is the Look-Up Table Data Type.\n", + "The specifics of the PQ algorithm is that it stores the data in the Product Quantizer (PQ) encoded format,\n", + "which needs to be decoded during the second-phase (in-cluster) search.\n", + "Thus, the algorithm constructs a lookup table for each cluster.\n", + "This is a costly operation, and the table itself can be rather large.\n", + "By default, the individual elements in the table are stored as 32-bit floats,\n", + "but you can change this to `np.float16` or `np.uint8` to reduce the table size.\n", + "\n", + "The exact size of the table is as follows:\n", + "\n", + "$ \\mathtt{lut\\_size} = \\mathtt{pq\\_dim} \\cdot \\mathtt{sizeof(lut\\_dtype) \\cdot 2^{\\mathtt{pq\\_bits}}} $\n", + "\n", + "Ideally, the lookup table should fit in the shared memory of a GPU's multiprocessor,\n", + "but it's not the case for wider datasets.\n", + "The logic of deciding whether this table should stay in the shared or the global memory of the GPU is somewhat complicated.\n", + "Yet, you can see the outcome when you gradually change `pq_dim` and observe a sudden drop in QPS after a certain threshold.\n", + "The shared-memory kernel version is typically 2-5x faster than the global-memory version.\n", + "\n", + "However `pq_dim` strongly affects the recall and requires the index to be rebuilt on change.\n", + "This is where `lut_dtype` comes in handy: you can halve or quarter the lookup table size by changing it.\n", + "Though it does affect the recall too.\n", + "\n", + "Also note, it does not make sense to set the `lut_dtype` to a more precise type than `internal_distance_dtype`,\n", + "as the former is converted to the latter internally.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "209 ms ± 151 µs per loop (mean ± std. dev.
of 7 runs, 1 loop each)\n", + "178 ms ± 485 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "182 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "176 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "181 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "bench_qps_s1 = np.zeros((5,), dtype=np.float32)\n", + "bench_recall_s1 = np.zeros((5,), dtype=np.float32)\n", + "k = 10\n", + "n_probes = 256\n", + "search_params_32_32 = ivf_pq.SearchParams(n_probes=n_probes, internal_distance_dtype=np.float32, lut_dtype=np.float32)\n", + "search_params_32_16 = ivf_pq.SearchParams(n_probes=n_probes, internal_distance_dtype=np.float32, lut_dtype=np.float16)\n", + "search_params_32_08 = ivf_pq.SearchParams(n_probes=n_probes, internal_distance_dtype=np.float32, lut_dtype=np.uint8)\n", + "search_params_16_16 = ivf_pq.SearchParams(n_probes=n_probes, internal_distance_dtype=np.float16, lut_dtype=np.float16)\n", + "search_params_16_08 = ivf_pq.SearchParams(n_probes=n_probes, internal_distance_dtype=np.float16, lut_dtype=np.uint8)\n", + "search_ps = [search_params_32_32, search_params_32_16, search_params_32_08, search_params_16_16, search_params_16_08]\n", + "bench_names = ['32/32', '32/16', '32/8', '16/16', '16/8']\n", + "\n", + "for i, sp in enumerate(search_ps):\n", + " r = %timeit -o ivf_pq.search(sp, index, queries, k, handle=resources); resources.sync()\n", + " bench_qps_s1[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", + " bench_recall_s1[i] = calc_recall(ivf_pq.search(sp, index, queries, k, handle=resources)[1], gt_neighbors)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", + "fig.suptitle(\n", + " f'Effects of search parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'k = {k}, n_probes = {n_probes}, pq_dim = {pq_dim}')\n", + "ax.plot(bench_recall_s1, bench_qps_s1, 'o')\n", + "ax.set_xlabel('recall')\n", + "ax.set_ylabel('QPS')\n", + "ax.grid()\n", + "annotations = []\n", + "for i, label in enumerate(bench_names):\n", + " annotations.append(ax.text(\n", + " bench_recall_s1[i], bench_qps_s1[i],\n", + " f\" {label} \",\n", + " ha='center', va='center'))\n", + "clutter = [\n", + " ax.text(\n", + " 0.02, 0.08,\n", + " 'Labels denote the bitsize of: internal_distance_dtype/lut_dtype',\n", + " verticalalignment='top',\n", + " bbox={'facecolor': 'white', 'edgecolor': 'grey'},\n", + " transform = ax.transAxes)\n", + "]\n", + "adjust_text(annotations, objects=clutter);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This figure represents the trade-offs one makes by choosing different combinations of the internal search types (the bit sizes of the data types are shown as point labels).\n", + "Depending on the GPU and the selected dataset, you may see different pictures.\n", + "With SIFT-128 (`pq_dim = 64`), reducing the `internal_distance_dtype` comes at a huge cost to recall,\n", + "whereas `lut_dtype` doesn't cost too much while significantly improving QPS.\n", + "\n", + "Also, often you may see `16/16` version being faster than `16/8`.\n", + "This indicates that ALU is the bottleneck in this configuration, and a few extra ALU operations for converting between fp8 and fp16 do more harm than the saved L1 bandwidth does good for the performance.\n", + "\n", + "\n", + "Let's try the same experiment, but with refinement.\n", + "We'll try ratio 2 and 4 and see how it affects recall and QPS."
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "210 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "181 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "184 ms ± 536 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "179 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "182 ms ± 329 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "410 ms ± 203 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "344 ms ± 304 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "338 ms ± 632 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "320 ms ± 269 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "323 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "425 ms ± 743 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "389 ms ± 688 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "381 ms ± 519 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "325 ms ± 552 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "340 ms ± 876 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "def search_refine(ps, ratio):\n", + " k_search = k * ratio\n", + " candidates = ivf_pq.search(ps, index, queries, k_search, handle=resources)[1]\n", + " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, handle=resources)[1]\n", + "\n", + "ratios = [1, 2, 4]\n", + "bench_qps_sr = np.zeros((len(ratios), len(search_ps)), dtype=np.float32)\n", + "bench_recall_sr = np.zeros((len(ratios), len(search_ps)), dtype=np.float32)\n", + "\n", + "for j, ratio in enumerate(ratios): \n", + " for i, ps in enumerate(search_ps):\n", + " r = %timeit -o search_refine(ps, ratio); resources.sync()\n", + " bench_qps_sr[j, i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", + " bench_recall_sr[j, i] = calc_recall(search_refine(ps, ratio), gt_neighbors)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", + "fig.suptitle(\n", + " f'Effects of search parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'k = {k}, n_probes = {n_probes}, pq_dim = {pq_dim}')\n", + "labels = []\n", + "for j, ratio in enumerate(ratios):\n", + " ax.plot(bench_recall_sr[j, :], bench_qps_sr[j, :], 'o')\n", + " labels.append(f\"refine ratio = {ratio}\")\n", + "ax.legend(labels)\n", + "ax.set_xlabel('recall')\n", + "ax.set_ylabel('QPS')\n", + "ax.grid()\n", + "colors = plt.rcParams[\"axes.prop_cycle\"].by_key()[\"color\"]\n", + "annotations = []\n", + "for j, ratio in enumerate(ratios):\n", + " for i, label in enumerate(bench_names):\n", + " annotations.append(ax.text(\n", + " bench_recall_sr[j, i], bench_qps_sr[j, i],\n", + " f\" {label} \",\n", + " color=colors[j],\n", + " ha='center', va='center'))\n", + "clutter = [\n", + " ax.text(\n", + " 0.02, 0.08,\n", + " 'Labels denote the bitsize of: internal_distance_dtype/lut_dtype',\n", + " verticalalignment='top',\n", + " bbox={'facecolor': 'white', 'edgecolor': 'grey'},\n", + " transform = ax.transAxes)\n", + "]\n", + "adjust_text(annotations, objects=clutter);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Depending on the dataset, you may see very different pictures here. For SIFT-128, we pick three interesting candidates featuring compromises between the QPS and the recall:\n", + " - `internal_distance_dtype = 16, lut_dtype = 16`\n", + " - `internal_distance_dtype = 32, lut_dtype = 8`\n", + " - `internal_distance_dtype = 32, lut_dtype = 8, refine_ratio = 2`\n", + "\n", + "This is all for the search parameters, but we will come back to the look-up table question in the next section."
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def search_refine(internal_distance_dtype, lut_dtype, ratio, n_probes):\n", + " k_search = k * ratio\n", + " ps = ivf_pq.SearchParams(\n", + " n_probes=n_probes,\n", + " internal_distance_dtype=internal_distance_dtype,\n", + " lut_dtype=lut_dtype)\n", + " candidates = ivf_pq.search(ps, index, queries, k_search, handle=resources)[1]\n", + " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, handle=resources)[1]\n", + "\n", + "search_configs = [\n", + " lambda n_probes: search_refine(np.float16, np.float16, 1, n_probes),\n", + " lambda n_probes: search_refine(np.float32, np.uint8, 1, n_probes),\n", + " lambda n_probes: search_refine(np.float32, np.uint8, 2, n_probes)\n", + "]\n", + "search_config_names = [\n", + " '16/16', '32/8', '32/8/r2'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tweaking indexing parameters\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deciding on the indexing parameters is a bit more involved than on the search parameters. This is obviously because `ivf_pq.IndexParams` has more members than `ivf_pq.SearchParams`, but also because the try-test loop takes longer when it includes training.\n", + "Since RAFT's IVF-PQ algorithm uses balanced-hierarchical k-means clustering and efficient logic for encoding, we find significantly improved index build times.\n", + "\n", + "First of all, let's pick the parameters we __don't need__ to tweak:\n", + "\n", + " - `metric` - the distance metric often depends on the problem and is thus fixed (currently RAFT supports variations of Euclidean and inner product distances).\n", + " - `conservative_memory_allocation` only affects how data is allocated - does not affect the search performance.\n", + " - `add_data_on_build` is a convenience flag. 
When activated, it automatically adds the training data to the index during `ivf_pq.build`. Otherwise, no data is added during `ivf_pq.build` and vectors need to be explicitly added to the index using `ivf_pq.extend`.\n", + " - `force_random_rotation` may slightly affect performance when the data dimensionality is a power of two (see the module docs), but normally you don't need to change the defaults. \n", + "\n", + "The rest of the parameters can be divided into two categories: those influencing the coarse search (`kmeans_n_iters`, `kmeans_trainset_fraction`, `n_lists`) and those influencing the fine search / product quantization (`codebook_kind`, `pq_dim`, `pq_bits`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing parameters affecting the coarse search\n", + "\n", + "#### n_lists\n", + "\n", + "`n_lists` is the first parameter to look at. It has a profound impact on overall performance during both training and search.\n", + "`n_lists` defines the number of clusters into which the index data is partitioned; you should keep this in mind when selecting the `n_probes` search parameter.\n", + "\n", + "The ratio `n_probes/n_lists` tells how large a fraction of the dataset is compared to each query. If `n_lists == n_probes`, that is like a brute force search: we compare all dataset vectors to all query vectors. One would expect the recall is equal to `1` in such a case, but that does not take into account the PQ compression, which is lossy; in reality the recall is always lower unless you refine the search results.\n", + "\n", + "As `n_probes` approaches `n_lists`, IVF-PQ becomes slower than brute force because of all the extra work the algorithm does: dimension padding / transform, two-step search, extra PQ compute, etc. In practice searching around 0.1-1% of lists is enough for many datasets. But this depends on how well the input can be clustered. (e.g. 
for uniform random numbers as inputs, IVF methods don't work well).\n", + "\n", + "`n_lists = sqrt(n_samples)` is a good starting point for the balance of coarse/fine search time. To make sure the GPU resources are utilized efficiently, keep in mind:\n", + " - The average cluster size (i.e. `n_samples / n_lists`) should be at least ~2k records to keep individual SMs busy\n", + " - Total amount of search work (`n_queries * n_probes`) should be a good multiple of the number of SMs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.36 ms ± 2.38 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "4.36 ms ± 1.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "4.37 ms ± 2.47 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "7.74 ms ± 19.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "33.8 ms ± 733 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "44.1 ms ± 714 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "1.83 ms ± 1.66 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", + "3.1 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "6.43 ms ± 16.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "11.9 ms ± 33 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "45.2 ms ± 622 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "87.3 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "2.55 ms ± 452 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "5.1 ms ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "9.32 ms ± 15.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "16.1 ms ± 34.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "74 ms ± 254 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "145 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "3.92 ms ± 5.94 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "8.12 ms ± 6.62 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "14.7 ms ± 23.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "27.8 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "132 ms ± 289 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "259 ms ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "7.49 ms ± 4.68 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "17.2 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "32.4 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "63 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "303 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "603 ms ± 1.78 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "n_list_variants = [100, 500, 1000, 2000, 5000]\n", + "pl_ratio_variants = [500, 200, 100, 50, 10, 5]\n", + "selected_search_variant = 1\n", + "search_fun = search_configs[selected_search_variant]\n", + "search_label = search_config_names[selected_search_variant]\n", + "\n", + "bench_qps_nl = np.zeros((len(n_list_variants), len(pl_ratio_variants)), dtype=np.float32)\n", + "bench_recall_nl = np.zeros_like(bench_qps_nl, dtype=np.float32)\n", + "\n", + "for i, n_lists in enumerate(n_list_variants):\n", + " index_params = ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=pq_dim)\n", + " index = ivf_pq.build(index_params, dataset, handle=resources)\n", + " for j, pl_ratio in enumerate(pl_ratio_variants):\n", + " n_probes = max(1, n_lists // pl_ratio)\n", + " r = %timeit -o search_fun(n_probes); resources.sync()\n", + " bench_qps_nl[i, j] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", + " bench_recall_nl[i, j] = calc_recall(search_fun(n_probes), gt_neighbors)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", + "fig.suptitle(\n", + " f'Effects of n_list on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'k = {k}, pq_dim = {pq_dim}, search = {search_label}')\n", + "labels = []\n", + "for i, n_lists in enumerate(n_list_variants):\n", + " ax.plot(bench_recall_nl[i, :], bench_qps_nl[i, :])\n", + " labels.append(f\"n_lists = {n_lists}\")\n", + "\n", + "ax.legend(labels)\n", + "ax.set_xlabel('recall')\n", + "ax.set_ylabel('QPS')\n", + "ax.set_yscale('log')\n", + "ax.grid()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This chart demonstrates that for the given data set (SIFT-128) and the selected parameters, the QPS/recall curves are rather close to each other.\n", + "Yet, two lines, which correspond to 100- and 5000-cluster indices, lag below the others.\n", + "This suggests that 5000 clusters is probably too many and 100 clusters is probably too few for this dataset. In the range of 500-2000, though, the algorithm performs very similarly.\n", + "Hence, you shouldn't worry about finding the exact single best value of `n_lists`, but rather make sure it's within a reasonable range.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### kmeans_trainset_fraction\n", + "\n", + "This parameter defines how much of the original data should be fed into training.\n", + "This is useful in conjunction with `add_data_on_build = True`.\n", + "For example, having a 100M-record dataset, it's reasonable to set `kmeans_trainset_fraction = 0.1` to train the index (i.e. 
run the k-means clustering) using 10M records only (10% of data), and then add the whole dataset to the index.\n", + "Hence, this parameter directly affects the training speed, but can indirectly affect the search performance (depending on how well the training set represents the full dataset).\n", + "\n", + "Note, if `add_data_on_build = False`, setting the trainset fraction less than one is identical to passing a smaller dataset to the `ivf_pq.build`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### kmeans_n_iters\n", + "\n", + "This parameter is passed directly to the k-means algorithm during training. It's set to a reasonable default of 20, which works for most datasets. However, once in a while you may see a warning complaining that the trained clusters are imbalanced. You can try to fix that by increasing the number of iterations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing parameters affecting the fine search / product quantization\n", + "\n", + "In the IVF-PQ index, a database vector y is approximated with two level quantization:\n", + "\n", + "$ y = Q_1(y) + Q_2(y - Q_1(y)) $\n", + "\n", + "The first level quantizer ($Q_1$), maps the vector y to the nearest cluster center. The number of\n", + "clusters is `n_lists`.\n", + "\n", + "The second quantizer encodes the residual, and it is defined as a product quantizer\n", + "(see [_\"Product quantization for nearest neighbor search\" by Herve Jegou, Matthijs Douze, Cordelia Schmid_](https://www.researchgate.net/publication/47815472_Product_Quantization_for_Nearest_Neighbor_Search)).\n", + "\n", + "A product quantizer encodes a `dim` dimensional vector with a `pq_dim` dimensional vector.\n", + "First we split the input vector into `pq_dim` subvectors (denoted by u), where each u vector\n", + "contains `pq_len` distinct components of y\n", + "```\n", + "y_1, y_2, ... y_{pq_len}, y_{pq_len+1}, ... y_{2*pq_len}, ... y_{dim-pq_len+1} ... 
y_{dim}\n", + " \\___________________/ \\____________________________/ \\______________________/\n", + " u_1 u_2 u_{pq_dim}\n", + "```\n", + "Then each subvector is encoded with a separate quantizer $q_i$, and the results are concatenated\n", + "\n", + "$ Q_2(y) = q_1(u_1),q_2(u_2),...,q_\\mathtt{pq\\_dim}(u_\\mathtt{pq\\_dim}) $\n", + "\n", + "Each quantizer $q_i$ outputs a code with `pq_bits` bits. The second level quantizers are also defined\n", + "by k-means clustering in the corresponding sub-space: the reproduction values are the centroids,\n", + "and the set of reproduction values is the codebook.\n", + "\n", + "During the search, for every query and probed list, a look-up table (LUT) is constructed using appropriate codebooks and the query coordinates.\n", + "The size of the LUT has a profound effect on the performance; here it is one more time:\n", + "\n", + "$ \\mathtt{lut\\_size} = \\mathtt{pq\\_dim} \\cdot \\mathtt{sizeof(lut\\_dtype)} \\cdot 2^{\\mathtt{pq\\_bits}} $\n", + "\n", + "If possible, the LUT is stored fully in GPU L1 (shared) memory during search;\n", + "otherwise, a slower version of the kernel is used, which stores the LUT in the global memory.\n", + "\n", + "\n", + "#### codebook_kind\n", + "\n", + "The second-level quantizers are trained either for each subspace or for each cluster, controlled by parameter `codebook_kind`:\n", + "\n", + " 1. \"subspace\" (C++ api: `codebook_gen::PER_SUBSPACE`): \\\n", + " creates `pq_dim` second-level quantizers - one for each slice of the data along features;\n", + " 2. 
\"cluster\" (C++ api: `codebook_gen::PER_CLUSTER`): \\\n", + " creates `n_lists` second-level quantizers - one for each first-level cluster.\n", + "\n", + "In either case, the centroids are found using k-means clustering interpreting the data as having `pq_len` dimensions.\n", + "\n", + "There's no definitive way to tell in advance which of the two options yields better performance for a particular use case.\n", + "A few observations, however, may help:\n", + "\n", + " - A per-cluster codebook tends to take more time to train, since `n_lists` is usually much higher than `pq_dim` - more codebooks to train.\n", + " - Search with a per-cluster codebook usually utilizes L1 cache of the GPU better than with a per-subspace codebook; this may result in a faster search when the LUT is big and occupies a large part of the GPU L1 memory.\n", + " - However, in practice, the recall is slightly higher with a per-subspace codebook.\n", + "\n", + "\n", + "#### pq_dim, pq_bits\n", + "\n", + "The `pq_dim` parameter is the main way to control the compression in the database.\n", + "You should choose it depending on your expectations about the sparsity of the information in the data.\n", + "As an experiment, you could start with `pq_dim` in the range of the data dimensionality `[dim / 2, dim]`.\n", + "\n", + "`pq_bits` is the number of bits in a single PQ code.\n", + "Hence, it controls the codebook size - $2^{\\mathtt{pq\\_bits}}$ - the number of possible values a code can take.\n", + "IVF-PQ supports codebook sizes from 16 to 256, i.e. 
`pq_bits` in the range of `[4, 8]`.\n", + "\n", + "`pq_bits` affects the compression: a database with `pq_bits = 4` is half the size of one with `pq_bits = 8`.\n", + "However, `pq_bits` affects the LUT size much more strongly, as the LUT size is proportional to $2^{\\mathtt{pq\\_bits}}$ (see the formula above).\n", + "This also means a drastic effect on the recall.\n", + "\n", + "A few observations:\n", + "\n", + " - It's required that `(pq_dim * 
pq_bits) % 8 == 0`; in general, keeping `pq_dim` in powers of two improves the search performance due to better data alignment.\n", + " - Keeping `pq_dim * pq_bits >= 128` and `(pq_dim * pq_bits) % 32 == 0` maximizes the GPU memory bandwidth utilization.\n", + " - Generally `pq_bits = 8` is a good starting point.\n", + " - The recall loss due to smaller `pq_bits` can be compensated by enabling refinement.\n", + " - For high-dimensional data and large `pq_dims`, lowering `pq_bits` can yield a drastic search speedup due to enabling the faster kernel that keeps the LUT in L1.\n", + " - Alternatively, setting the search parameter `lut_dtype` to `uint8` may be enough to keep the LUT in L1.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.25 ms ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "15.5 ms ± 24.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "36.7 ms ± 468 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "71.8 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "9.4 ms ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "16.2 ms ± 32.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "38.2 ms ± 520 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "74.4 ms ± 291 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "160 ms ± 48.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "168 ms ± 393 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "191 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "228 ms ± 590 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "12.2 ms ± 24.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "25.2 ms ± 73.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "59.8 ms ± 167 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "117 ms ± 84.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "14.3 ms ± 19.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "25.2 ms ± 2.93 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "59.6 ms ± 29.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "116 ms ± 17.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "165 ms ± 757 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "176 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "212 ms ± 245 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "270 ms ± 283 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "6.47 ms ± 20.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "11 ms ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "24.5 ms ± 285 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "46.2 ms ± 460 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "8.25 ms ± 19.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "13.2 ms ± 3.08 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "28.7 ms ± 3.21 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "53.4 ms ± 6.59 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "158 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "164 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "180 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "206 ms ± 322 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "6.29 ms ± 3.05 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "10.7 ms ± 10.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "23.8 ms ± 5.83 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "44.6 ms ± 126 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "8.17 ms ± 6.97 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "13 ms ± 35.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "28.4 ms ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "52.6 ms ± 69.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "159 ms ± 205 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "164 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "181 ms ± 256 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "207 ms ± 2.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "# Let's try a few build configurations.\n", + "# Warning: this will take some time\n", + "\n", + "k = 10\n", + "n_probes_variants = [10, 20, 50, 100]\n", + "n_lists = 1000\n", + "\n", + "build_configs = {\n", + " '64-8-subspace': ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=64, pq_bits=8, codebook_kind=\"subspace\"),\n", + " '128-8-subspace': ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=128, pq_bits=8, codebook_kind=\"subspace\"),\n", + " '128-6-subspace': ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=128, pq_bits=6, codebook_kind=\"subspace\"),\n", + " '128-6-cluster': ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=128, pq_bits=6, codebook_kind=\"cluster\"),\n", + "}\n", + "\n", + "bench_qps_ip = np.zeros((len(build_configs), len(search_configs), len(n_probes_variants)), dtype=np.float32)\n", + "bench_recall_ip = np.zeros_like(bench_qps_ip, dtype=np.float32)\n", + "\n", + "for i, index_params in enumerate(build_configs.values()):\n", + " index = ivf_pq.build(index_params, dataset, handle=resources)\n", + " for l, search_fun in enumerate(search_configs):\n", + " for j, n_probes in enumerate(n_probes_variants):\n", + " r = %timeit -o search_fun(n_probes); resources.sync()\n", + " bench_qps_ip[i, l, j] = (queries.shape[0] * r.loops / 
np.array(r.all_runs)).mean()\n", + " bench_recall_ip[i, l, j] = calc_recall(search_fun(n_probes), gt_neighbors)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(len(search_config_names), 1, figsize=(16, len(search_config_names)*8))\n", + "fig.suptitle(\n", + " f'Effects of index parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'k = {k}, n_lists = {n_lists}')\n", + "\n", + "for j, search_label in enumerate(search_config_names):\n", + " labels = []\n", + " for i, index_label in enumerate(build_configs.keys()):\n", + " ax[j].plot(bench_recall_ip[i, j, :], bench_qps_ip[i, j, :])\n", + " labels.append(index_label)\n", + "\n", + " ax[j].set_title(f\"search: {search_label}\")\n", + " ax[j].legend(labels)\n", + " ax[j].set_xlabel('recall')\n", + " ax[j].set_ylabel('QPS')\n", + " ax[j].set_yscale('log')\n", + " ax[j].grid()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like `pq_dim = 128`, `pq_bits = 6` is the best parameter set for the `SIFT-128` dataset." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index c498ede26e..29405e43c0 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -14,10 +14,15 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -set(pylibraft_version 23.06.02) - include(../../fetch_rapids.cmake) +set(pylibraft_version 23.08.00) + +# We always need CUDA for pylibraft because the raft dependency brings in a header-only cuco +# 
dependency that enables CUDA unconditionally. +include(rapids-cuda) +rapids_cuda_init_architectures(pylibraft) + project( pylibraft VERSION ${pylibraft_version} @@ -25,7 +30,7 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C CXX + C CXX CUDA ) option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files" @@ -51,15 +56,6 @@ endif() include(rapids-cython) if(NOT raft_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required - # languages for the C++ project even if this project does not require those languages. - include(rapids-cuda) - rapids_cuda_init_architectures(pylibraft) - enable_language(CUDA) - # Since pylibraft only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_pylibraft_INCLUDE}") - set(BUILD_TESTS OFF) set(BUILD_PRIMS_BENCH OFF) set(BUILD_ANN_BENCH OFF) diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py index ed57e3d7fa..52e0cc05ea 100644 --- a/python/pylibraft/pylibraft/__init__.py +++ b/python/pylibraft/pylibraft/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. 
# -__version__ = "23.06.02" +__version__ = "23.08.00" diff --git a/python/pylibraft/pylibraft/common/ai_wrapper.py b/python/pylibraft/pylibraft/common/ai_wrapper.py index b6b1f02187..b2b5935ede 100644 --- a/python/pylibraft/pylibraft/common/ai_wrapper.py +++ b/python/pylibraft/pylibraft/common/ai_wrapper.py @@ -34,6 +34,7 @@ def __init__(self, ai_arr): ai_arr : array interface array """ self.ai_ = ai_arr.__array_interface__ + self.from_cai = False @property def dtype(self): diff --git a/python/pylibraft/pylibraft/common/cai_wrapper.py b/python/pylibraft/pylibraft/common/cai_wrapper.py index cf11ea29ce..8a77a9b1b6 100644 --- a/python/pylibraft/pylibraft/common/cai_wrapper.py +++ b/python/pylibraft/pylibraft/common/cai_wrapper.py @@ -37,6 +37,7 @@ def __init__(self, cai_arr): __array_interface__=cai_arr.__cuda_array_interface__ ) super().__init__(helper) + self.from_cai = True def wrap_array(array): diff --git a/python/pylibraft/pylibraft/common/mdspan.pxd b/python/pylibraft/pylibraft/common/mdspan.pxd index 3be8d5e1a6..6b202c2b69 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pxd +++ b/python/pylibraft/pylibraft/common/mdspan.pxd @@ -19,10 +19,14 @@ # cython: embedsignature = True # cython: language_level = 3 -from libc.stdint cimport int8_t, int64_t, uint8_t +from libc.stdint cimport int8_t, int64_t, uint8_t, uint32_t from libcpp.string cimport string -from pylibraft.common.cpp.mdspan cimport device_matrix_view, row_major +from pylibraft.common.cpp.mdspan cimport ( + device_matrix_view, + host_matrix_view, + row_major, +) from pylibraft.common.handle cimport device_resources from pylibraft.common.optional cimport make_optional, optional @@ -41,3 +45,21 @@ cdef device_matrix_view[int64_t, int64_t, row_major] get_dmv_int64( cdef optional[device_matrix_view[int64_t, int64_t, row_major]] make_optional_view_int64( # noqa: E501 device_matrix_view[int64_t, int64_t, row_major]& dmv) except * + +cdef device_matrix_view[uint32_t, int64_t, row_major] get_dmv_uint32( + 
array, check_shape) except * + +cdef host_matrix_view[float, int64_t, row_major] get_hmv_float( + array, check_shape) except * + +cdef host_matrix_view[uint8_t, int64_t, row_major] get_hmv_uint8( + array, check_shape) except * + +cdef host_matrix_view[int8_t, int64_t, row_major] get_hmv_int8( + array, check_shape) except * + +cdef host_matrix_view[int64_t, int64_t, row_major] get_hmv_int64( + array, check_shape) except * + +cdef host_matrix_view[uint32_t, int64_t, row_major] get_hmv_uint32( + array, check_shape) except * diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx index f35a94bb9c..1219b1612d 100644 --- a/python/pylibraft/pylibraft/common/mdspan.pyx +++ b/python/pylibraft/pylibraft/common/mdspan.pyx @@ -30,6 +30,7 @@ from libc.stdint cimport int8_t, int32_t, int64_t, uint8_t, uint32_t, uintptr_t from pylibraft.common.cpp.mdspan cimport ( col_major, device_matrix_view, + host_matrix_view, host_mdspan, make_device_matrix_view, make_host_matrix_view, @@ -195,3 +196,72 @@ cdef device_matrix_view[int64_t, int64_t, row_major] \ cdef optional[device_matrix_view[int64_t, int64_t, row_major]] \ make_optional_view_int64(device_matrix_view[int64_t, int64_t, row_major]& dmv) except *: # noqa: E501 return make_optional[device_matrix_view[int64_t, int64_t, row_major]](dmv) + + +# todo(dantegd): we can unify and simplify this functions a little bit +# defining extra functions as-is is the quickest way to get what we need for +# cagra.pyx +cdef device_matrix_view[uint32_t, int64_t, row_major] \ + get_dmv_uint32(cai, check_shape) except *: + if cai.dtype != np.uint32: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_device_matrix_view[uint32_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef 
host_matrix_view[float, int64_t, row_major] \ + get_hmv_float(cai, check_shape) except *: + if cai.dtype != np.float32: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[float, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[uint8_t, int64_t, row_major] \ + get_hmv_uint8(cai, check_shape) except *: + if cai.dtype != np.uint8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[uint8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[int8_t, int64_t, row_major] \ + get_hmv_int8(cai, check_shape) except *: + if cai.dtype != np.int8: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[int8_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[int64_t, int64_t, row_major] \ + get_hmv_int64(cai, check_shape) except *: + if cai.dtype != np.int64: + raise TypeError("dtype %s not supported" % cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[int64_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) + + +cdef host_matrix_view[uint32_t, int64_t, row_major] \ + get_hmv_uint32(cai, check_shape) except *: + if cai.dtype != np.uint32:  # dtype must match the uint32_t element type of the returned view + raise TypeError("dtype %s not supported" % 
cai.dtype) + if check_shape and len(cai.shape) != 2: + raise ValueError("Expected a 2D array, got %d D" % len(cai.shape)) + shape = (cai.shape[0], cai.shape[1] if len(cai.shape) == 2 else 1) + return make_host_matrix_view[uint32_t, int64_t, row_major]( + cai.data, shape[0], shape[1]) diff --git a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt index 7b9c1591c1..45cd9f74e6 100644 --- a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt +++ b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt @@ -23,5 +23,6 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ ) +add_subdirectory(cagra) add_subdirectory(ivf_flat) add_subdirectory(ivf_pq) diff --git a/python/pylibraft/pylibraft/neighbors/__init__.py b/python/pylibraft/pylibraft/neighbors/__init__.py index a50b6f21a7..325ea5842e 100644 --- a/python/pylibraft/pylibraft/neighbors/__init__.py +++ b/python/pylibraft/pylibraft/neighbors/__init__.py @@ -13,8 +13,8 @@ # limitations under the License. # -from pylibraft.neighbors import brute_force +from pylibraft.neighbors import brute_force, cagra, ivf_flat, ivf_pq from .refine import refine -__all__ = ["common", "refine", "brute_force"] +__all__ = ["common", "refine", "brute_force", "ivf_flat", "ivf_pq", "cagra"] diff --git a/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt new file mode 100644 index 0000000000..441bb0b311 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Set the list of Cython files to build +set(cython_sources cagra.pyx) +set(linked_libraries raft::raft raft::compiled) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_cagra_ +) diff --git a/python/pylibraft/pylibraft/neighbors/cagra/__init__.pxd b/python/pylibraft/pylibraft/neighbors/cagra/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/neighbors/cagra/__init__.py b/python/pylibraft/pylibraft/neighbors/cagra/__init__.py new file mode 100644 index 0000000000..b2a872fc89 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cagra/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from .cagra import Index, IndexParams, SearchParams, build, load, save, search + +__all__ = [ + "Index", + "IndexParams", + "SearchParams", + "build", + "load", + "save", + "search", +] diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx new file mode 100644 index 0000000000..7d758a32ef --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx @@ -0,0 +1,841 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import warnings + +import numpy as np + +from cython.operator cimport dereference as deref +from libc.stdint cimport ( + int8_t, + int32_t, + int64_t, + uint8_t, + uint32_t, + uint64_t, + uintptr_t, +) +from libcpp cimport bool, nullptr +from libcpp.string cimport string + +from pylibraft.distance.distance_type cimport DistanceType + +from pylibraft.common import ( + DeviceResources, + ai_wrapper, + auto_convert_output, + cai_wrapper, + device_ndarray, +) +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible + +from pylibraft.common.handle cimport device_resources + +from pylibraft.common.handle import auto_sync_handle +from pylibraft.common.input_validation import is_c_contiguous + +from rmm._lib.memory_resource cimport ( + DeviceMemoryResource, + device_memory_resource, +) + +cimport pylibraft.neighbors.cagra.cpp.c_cagra as c_cagra +from pylibraft.common.optional cimport make_optional, optional + +from pylibraft.neighbors.common import _check_input_array, _get_metric + +from pylibraft.common.cpp.mdspan cimport ( + device_matrix_view, + device_vector_view, + make_device_vector_view, + row_major, +) +from pylibraft.common.mdspan cimport ( + get_dmv_float, + get_dmv_int8, + get_dmv_int64, + get_dmv_uint8, + get_dmv_uint32, + get_hmv_float, + get_hmv_int8, + get_hmv_int64, + get_hmv_uint8, + get_hmv_uint32, + make_optional_view_int64, +) +from pylibraft.neighbors.common cimport _get_metric_string + + +cdef class IndexParams: + cdef c_cagra.index_params params + + def __init__(self, *, + metric="sqeuclidean", + intermediate_graph_degree=128, + graph_degree=64, + add_data_on_build=True): + """ + Parameters to build index for CAGRA nearest neighbor search + + Parameters + ---------- + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: 
["sqeuclidean", "inner_product", + "euclidean"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - euclidean is the euclidean distance + - inner product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + intermediate_graph_degree : int, default = 128 + Node degree of the intermediate knn-graph. + graph_degree : int, default = 64 + Node degree of the final optimized graph. + add_data_on_build : bool, default = True + After training the coarse and fine quantizers, we will populate + the index with the dataset if add_data_on_build == True, otherwise + the index is left empty, and the extend method can be used + to add new vectors to the index. + """ + self.params.metric = _get_metric(metric) + self.params.metric_arg = 0 + self.params.intermediate_graph_degree = intermediate_graph_degree + self.params.graph_degree = graph_degree + self.params.add_data_on_build = add_data_on_build + + @property + def metric(self): + return self.params.metric + + @property + def intermediate_graph_degree(self): + return self.params.intermediate_graph_degree + + @property + def graph_degree(self): + return self.params.graph_degree + + @property + def add_data_on_build(self): + return self.params.add_data_on_build + + +cdef class Index: + cdef readonly bool trained + cdef str active_index_type + + def __cinit__(self): + self.trained = False + self.active_index_type = None + + +cdef class IndexFloat(Index): + cdef c_cagra.index[float, uint32_t] * index + + def __cinit__(self, handle=None): + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + self.index = new c_cagra.index[float, uint32_t]( + deref(handle_)) + + def __repr__(self): + m_str = "metric=" + _get_metric_string(self.index.metric()) + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["dim", "graph_degree"]] + attr_str = [m_str] + attr_str + return "Index(type=CAGRA, " + (", 
".join(attr_str)) + ")" + + @property + def metric(self): + return 
self.index[0].metric() + + @property + def size(self): + return self.index[0].size() + + @property + def dim(self): + return self.index[0].dim() + + @property + def graph_degree(self): + return self.index[0].graph_degree() + + def __dealloc__(self): + if self.index is not NULL: + del self.index + + +cdef class IndexInt8(Index): + cdef c_cagra.index[int8_t, uint32_t] * index + + def __cinit__(self, handle=None): + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + self.index = new c_cagra.index[int8_t, uint32_t]( + deref(handle_)) + + def __repr__(self): + m_str = "metric=" + _get_metric_string(self.index.metric()) + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["dim", "graph_degree"]] + attr_str = [m_str] + attr_str + return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" + + @property + def metric(self): + return self.index[0].metric() + + @property + def size(self): + return self.index[0].size() + + @property + def dim(self): + return self.index[0].dim() + + @property + def graph_degree(self): + return self.index[0].graph_degree() + + def __dealloc__(self): + if self.index is not NULL: + del self.index + + +cdef class IndexUint8(Index): + cdef c_cagra.index[uint8_t, uint32_t] * index + + def __cinit__(self, handle=None): + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + self.index = new c_cagra.index[uint8_t, uint32_t]( + deref(handle_)) + + def __repr__(self): + m_str = "metric=" + _get_metric_string(self.index.metric()) + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["dim", "graph_degree"]] + attr_str = [m_str] + attr_str + return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" + + @property + def metric(self): + return self.index[0].metric() + + @property + def size(self): + return self.index[0].size() + + @property + def dim(self): + return self.index[0].dim() + + @property + def 
graph_degree(self): + return self.index[0].graph_degree() + + def __dealloc__(self): + if self.index is not NULL: + del self.index + + +@auto_sync_handle +@auto_convert_output +def build(IndexParams index_params, dataset, handle=None): + """ + Build the CAGRA index from the dataset for efficient search. + + The build performs two different steps: first an intermediate knn-graph is + constructed, then it is optimized to create the final graph. The + index_params object controls the node degree of these graphs. + + It is required that both the dataset and the optimized graph fit the + GPU memory. + + The following distance metrics are supported: + - L2 + + Parameters + ---------- + index_params : IndexParams object + dataset : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [float, int8, uint8] + {handle_docstring} + + Returns + ------- + index: cagra.Index + + Examples + -------- + + >>> import cupy as cp + + >>> from pylibraft.common import DeviceResources + >>> from pylibraft.neighbors import cagra + + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> k = 10 + + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + + >>> handle = DeviceResources() + >>> build_params = cagra.IndexParams(metric="sqeuclidean") + + >>> index = cagra.build(build_params, dataset, handle=handle) + + >>> distances, neighbors = cagra.search(cagra.SearchParams(), + ... index, dataset, + ... 
k, handle=handle) + + >>> # pylibraft functions are often asynchronous so the + >>> # handle needs to be explicitly synchronized + >>> handle.sync() + + >>> distances = cp.asarray(distances) + >>> neighbors = cp.asarray(neighbors) + """ + dataset_ai = wrap_array(dataset) + dataset_dt = dataset_ai.dtype + _check_input_array(dataset_ai, [np.dtype('float32'), np.dtype('byte'), + np.dtype('ubyte')]) + + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + cdef IndexFloat idx_float + cdef IndexInt8 idx_int8 + cdef IndexUint8 idx_uint8 + + if dataset_ai.from_cai: + if dataset_dt == np.float32: + idx_float = IndexFloat(handle) + idx_float.active_index_type = "float32" + with cuda_interruptible(): + c_cagra.build_device( + deref(handle_), + index_params.params, + get_dmv_float(dataset_ai, check_shape=True), + deref(idx_float.index)) + idx_float.trained = True + return idx_float + elif dataset_dt == np.byte: + idx_int8 = IndexInt8(handle) + idx_int8.active_index_type = "byte" + with cuda_interruptible(): + c_cagra.build_device( + deref(handle_), + index_params.params, + get_dmv_int8(dataset_ai, check_shape=True), + deref(idx_int8.index)) + idx_int8.trained = True + return idx_int8 + elif dataset_dt == np.ubyte: + idx_uint8 = IndexUint8(handle) + idx_uint8.active_index_type = "ubyte" + with cuda_interruptible(): + c_cagra.build_device( + deref(handle_), + index_params.params, + get_dmv_uint8(dataset_ai, check_shape=True), + deref(idx_uint8.index)) + idx_uint8.trained = True + return idx_uint8 + else: + raise TypeError("dtype %s not supported" % dataset_dt) + else: + if dataset_dt == np.float32: + idx_float = IndexFloat(handle) + idx_float.active_index_type = "float32" + with cuda_interruptible(): + c_cagra.build_host( + deref(handle_), + index_params.params, + get_hmv_float(dataset_ai, check_shape=True), + deref(idx_float.index)) + idx_float.trained = True + return idx_float + elif dataset_dt == np.byte: + idx_int8 = 
IndexInt8(handle) + idx_int8.active_index_type = "byte" + with cuda_interruptible(): + c_cagra.build_host( + deref(handle_), + index_params.params, + get_hmv_int8(dataset_ai, check_shape=True), + deref(idx_int8.index)) + idx_int8.trained = True + return idx_int8 + elif dataset_dt == np.ubyte: + idx_uint8 = IndexUint8(handle) + idx_uint8.active_index_type = "ubyte" + with cuda_interruptible(): + c_cagra.build_host( + deref(handle_), + index_params.params, + get_hmv_uint8(dataset_ai, check_shape=True), + deref(idx_uint8.index)) + idx_uint8.trained = True + return idx_uint8 + else: + raise TypeError("dtype %s not supported" % dataset_dt) + + +cdef class SearchParams: + cdef c_cagra.search_params params + + def __init__(self, *, + max_queries=0, + itopk_size=64, + max_iterations=0, + algo="auto", + team_size=0, + search_width=1, + min_iterations=0, + thread_block_size=0, + hashmap_mode="auto", + hashmap_min_bitlen=0, + hashmap_max_fill_rate=0.5, + num_random_samplings=1, + rand_xor_mask=0x128394): + """ + CAGRA search parameters + + Parameters + ---------- + max_queries: int, default = 0 + Maximum number of queries to search at the same time (batch size). + Auto select when 0. + itopk_size: int, default = 64 + Number of intermediate search results retained during the search. + This is the main knob to adjust trade off between accuracy and + search speed. Higher values improve the search accuracy. + max_iterations: int, default = 0 + Upper limit of search iterations. Auto select when 0. + algo: string denoting the search algorithm to use, default = "auto" + Valid values for algo: ["auto", "single_cta", "multi_cta"], where + - auto will automatically select the best value based on query size + - single_cta is better when query contains larger number of + vectors (e.g >10) + - multi_cta is better when query contains only a few vectors + team_size: int, default = 0 + Number of threads used to calculate a single distance. 4, 8, 16, + or 32. 
+ search_width: int, default = 1 + Number of graph nodes to select as the starting point for the + search in each iteration. + min_iterations: int, default = 0 + Lower limit of search iterations. + thread_block_size: int, default = 0 + Thread block size. 0, 64, 128, 256, 512, 1024. + Auto selection when 0. + hashmap_mode: string denoting the type of hash map to use. It's + usually better to allow the algorithm to select this value, + default = "auto" + Valid values for hashmap_mode: ["auto", "small", "hash"], where + - auto will automatically select the best value based on algo + - small will use the small shared memory hash table with resetting. + - hash will use a single hash table in global memory. + hashmap_min_bitlen: int, default = 0 + Lower limit of hashmap bit length. More than 8. + hashmap_max_fill_rate: float, default = 0.5 + Upper limit of hashmap fill rate. More than 0.1, less than 0.9. + num_random_samplings: int, default = 1 + Number of iterations of initial random seed node selection. 1 or + more. + rand_xor_mask: int, default = 0x128394 + Bit mask used for initial random seed node selection. 
+ + + """ + self.params.max_queries = max_queries + self.params.itopk_size = itopk_size + self.params.max_iterations = max_iterations + if algo == "single_cta": + self.params.algo = c_cagra.search_algo.SINGLE_CTA + elif algo == "multi_cta": + self.params.algo = c_cagra.search_algo.MULTI_CTA + elif algo == "multi_kernel": + self.params.algo = c_cagra.search_algo.MULTI_KERNEL + elif algo == "auto": + self.params.algo = c_cagra.search_algo.AUTO + else: + raise ValueError("`algo` value not supported.") + + self.params.team_size = team_size + self.params.search_width = search_width + self.params.min_iterations = min_iterations + self.params.thread_block_size = thread_block_size + if hashmap_mode == "hash": + self.params.hashmap_mode = c_cagra.hash_mode.HASH + elif hashmap_mode == "small": + self.params.hashmap_mode = c_cagra.hash_mode.SMALL + elif hashmap_mode == "auto": + self.params.hashmap_mode = c_cagra.hash_mode.AUTO + else: + raise ValueError("`hashmap_mode` value not supported.") + + self.params.hashmap_min_bitlen = hashmap_min_bitlen + self.params.hashmap_max_fill_rate = hashmap_max_fill_rate + self.params.num_random_samplings = num_random_samplings + self.params.rand_xor_mask = rand_xor_mask + + def __repr__(self): + # todo(dantegd): add all relevant attrs + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in ["max_queries"]] + return "SearchParams(type=CAGRA, " + (", ".join(attr_str)) + ")" + + @property + def max_queries(self): + return self.params.max_queries + + @property + def itopk_size(self): + return self.params.itopk_size + + @property + def max_iterations(self): + return self.params.max_iterations + + @property + def algo(self): + return self.params.algo + + @property + def team_size(self): + return self.params.team_size + + @property + def search_width(self): + return self.params.search_width + + @property + def min_iterations(self): + return self.params.min_iterations + + @property + def thread_block_size(self): + return 
self.params.thread_block_size + + @property + def hashmap_mode(self): + return self.params.hashmap_mode + + @property + def hashmap_min_bitlen(self): + return self.params.hashmap_min_bitlen + + @property + def hashmap_max_fill_rate(self): + return self.params.hashmap_max_fill_rate + + @property + def num_random_samplings(self): + return self.params.num_random_samplings + + @property + def rand_xor_mask(self): + return self.params.rand_xor_mask + + +@auto_sync_handle +@auto_convert_output +def search(SearchParams search_params, + Index index, + queries, + k, + neighbors=None, + distances=None, + handle=None): + """ + Find the k nearest neighbors for each query. + + Parameters + ---------- + search_params : SearchParams + index : Index + Trained CAGRA index. + queries : CUDA array interface compliant matrix shape (n_queries, dim) + Supported dtype [float, int8, uint8] + k : int + The number of neighbors. + neighbors : Optional CUDA array interface compliant matrix shape + (n_queries, k), dtype uint32. If supplied, neighbor + indices will be written here in-place. (default None) + distances : Optional CUDA array interface compliant matrix shape + (n_queries, k) If supplied, the distances to the + neighbors will be written here in-place. (default None) + {handle_docstring} + + Examples + -------- + >>> import cupy as cp + + >>> from pylibraft.common import DeviceResources + >>> from pylibraft.neighbors import cagra + + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + + >>> # Build index + >>> handle = DeviceResources() + >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) + + >>> # Search using the built index + >>> queries = cp.random.random_sample((n_queries, n_features), + ... dtype=cp.float32) + >>> k = 10 + >>> search_params = cagra.SearchParams( + ... max_queries=100, + ... itopk_size=64 + ... 
) + + >>> # Using a pooling allocator reduces overhead of temporary array + >>> # creation during search. This is useful if multiple searches + >>> # are performed with the same query size. + >>> distances, neighbors = cagra.search(search_params, index, queries, + ... k, handle=handle) + + >>> # pylibraft functions are often asynchronous so the + >>> # handle needs to be explicitly synchronized + >>> handle.sync() + + >>> neighbors = cp.asarray(neighbors) + >>> distances = cp.asarray(distances) + """ + + if not index.trained: + raise ValueError("Index need to be built before calling search.") + + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + queries_cai = cai_wrapper(queries) + queries_dt = queries_cai.dtype + cdef uint32_t n_queries = queries_cai.shape[0] + + _check_input_array(queries_cai, [np.dtype('float32'), np.dtype('byte'), + np.dtype('ubyte')], + exp_cols=index.dim) + + if neighbors is None: + neighbors = device_ndarray.empty((n_queries, k), dtype='uint32') + + neighbors_cai = cai_wrapper(neighbors) + _check_input_array(neighbors_cai, [np.dtype('uint32')], + exp_rows=n_queries, exp_cols=k) + + if distances is None: + distances = device_ndarray.empty((n_queries, k), dtype='float32') + + distances_cai = cai_wrapper(distances) + _check_input_array(distances_cai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + cdef c_cagra.search_params params = search_params.params + cdef IndexFloat idx_float + cdef IndexInt8 idx_int8 + cdef IndexUint8 idx_uint8 + + if queries_dt == np.float32: + idx_float = index + with cuda_interruptible(): + c_cagra.search(deref(handle_), + params, + deref(idx_float.index), + get_dmv_float(queries_cai, check_shape=True), + get_dmv_uint32(neighbors_cai, check_shape=True), + get_dmv_float(distances_cai, check_shape=True)) + elif queries_dt == np.byte: + idx_int8 = index + with cuda_interruptible(): + c_cagra.search(deref(handle_), + params, + deref(idx_int8.index), + 
get_dmv_int8(queries_cai, check_shape=True), + get_dmv_uint32(neighbors_cai, check_shape=True), + get_dmv_float(distances_cai, check_shape=True)) + elif queries_dt == np.ubyte: + idx_uint8 = index + with cuda_interruptible(): + c_cagra.search(deref(handle_), + params, + deref(idx_uint8.index), + get_dmv_uint8(queries_cai, check_shape=True), + get_dmv_uint32(neighbors_cai, check_shape=True), + get_dmv_float(distances_cai, check_shape=True)) + else: + raise ValueError("query dtype %s not supported" % queries_dt) + + return (distances, neighbors) + + +@auto_sync_handle +def save(filename, Index index, handle=None): + """ + Saves the index to file. + + Saving / loading the index is experimental. The serialization format is + subject to change. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained CAGRA index. + {handle_docstring} + + Examples + -------- + >>> import cupy as cp + + >>> from pylibraft.common import DeviceResources + >>> from pylibraft.neighbors import cagra + + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... 
dtype=cp.float32) + + >>> # Build index + >>> handle = DeviceResources() + >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) + >>> cagra.save("my_index.bin", index, handle=handle) + """ + if not index.trained: + raise ValueError("Index need to be built before saving it.") + + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + cdef string c_filename = filename.encode('utf-8') + + cdef IndexFloat idx_float + cdef IndexInt8 idx_int8 + cdef IndexUint8 idx_uint8 + + if index.active_index_type == "float32": + idx_float = index + c_cagra.serialize_file( + deref(handle_), c_filename, deref(idx_float.index)) + elif index.active_index_type == "byte": + idx_int8 = index + c_cagra.serialize_file( + deref(handle_), c_filename, deref(idx_int8.index)) + elif index.active_index_type == "ubyte": + idx_uint8 = index + c_cagra.serialize_file( + deref(handle_), c_filename, deref(idx_uint8.index)) + else: + raise ValueError( + "Index dtype %s not supported" % index.active_index_type) + + +@auto_sync_handle +def load(filename, handle=None): + """ + Loads index from file. + + Saving / loading the index is experimental. The serialization format is + subject to change, therefore loading an index saved with a previous + version of raft is not guaranteed to work. + + Parameters + ---------- + filename : string + Name of the file. 
+ {handle_docstring} + + Returns + ------- + index : Index + + Examples + -------- + >>> import cupy as cp + + >>> from pylibraft.common import DeviceResources + >>> from pylibraft.neighbors import cagra + + """ + if handle is None: + handle = DeviceResources() + cdef device_resources* handle_ = \ + handle.getHandle() + + cdef string c_filename = filename.encode('utf-8') + cdef IndexFloat idx_float + cdef IndexInt8 idx_int8 + cdef IndexUint8 idx_uint8 + + # we extract the dtype from the array interfaces in the file + with open(filename, 'rb') as f: + type_str = f.read(700).decode("utf-8", errors='ignore') + + dataset_dt = np.dtype(type_str[673:676]) + + if dataset_dt == np.float32: + idx_float = IndexFloat(handle) + c_cagra.deserialize_file( + deref(handle_), c_filename, idx_float.index) + idx_float.trained = True + idx_float.active_index_type = 'float32' + return idx_float + elif dataset_dt == np.byte: + idx_int8 = IndexInt8(handle) + c_cagra.deserialize_file( + deref(handle_), c_filename, idx_int8.index) + idx_int8.trained = True + idx_int8.active_index_type = 'byte' + return idx_int8 + elif dataset_dt == np.ubyte: + idx_uint8 = IndexUint8(handle) + c_cagra.deserialize_file( + deref(handle_), c_filename, idx_uint8.index) + idx_uint8.trained = True + idx_uint8.active_index_type = 'ubyte' + return idx_uint8 + else: + raise ValueError("Dataset dtype %s not supported" % dataset_dt) diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/__init__.pxd b/python/pylibraft/pylibraft/neighbors/cagra/cpp/__init__.pxd new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/__init__.py b/python/pylibraft/pylibraft/neighbors/cagra/cpp/__init__.py new file mode 100644 index 0000000000..8f2cc34855 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cagra/cpp/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd new file mode 100644 index 0000000000..284c75b771 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/cagra/cpp/c_cagra.pxd @@ -0,0 +1,202 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import numpy as np + +import pylibraft.common.handle + +from cython.operator cimport dereference as deref +from libc.stdint cimport int8_t, int64_t, uint8_t, uint32_t, uint64_t +from libcpp cimport bool, nullptr +from libcpp.string cimport string + +from rmm._lib.memory_resource cimport device_memory_resource + +from pylibraft.common.cpp.mdspan cimport ( + device_matrix_view, + device_vector_view, + host_matrix_view, + row_major, +) +from pylibraft.common.handle cimport device_resources +from pylibraft.common.optional cimport optional +from pylibraft.distance.distance_type cimport DistanceType +from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport ( + ann_index, + ann_index_params, + ann_search_params, + index_params as ivfpq_ip, + search_params as ivfpq_sp, +) + + +cdef extern from "raft/neighbors/cagra_types.hpp" \ + namespace "raft::neighbors::cagra" nogil: + + cpdef cppclass index_params(ann_index_params): + size_t intermediate_graph_degree + size_t graph_degree + + ctypedef enum search_algo: + SINGLE_CTA "raft::neighbors::cagra::search_algo::SINGLE_CTA", + MULTI_CTA "raft::neighbors::cagra::search_algo::MULTI_CTA", + MULTI_KERNEL "raft::neighbors::cagra::search_algo::MULTI_KERNEL", + AUTO "raft::neighbors::cagra::search_algo::AUTO" + + ctypedef enum hash_mode: + HASH "raft::neighbors::cagra::hash_mode::HASH", + SMALL "raft::neighbors::cagra::hash_mode::SMALL", + AUTO "raft::neighbors::cagra::hash_mode::AUTO" + + cpdef cppclass search_params(ann_search_params): + size_t max_queries + size_t itopk_size + size_t max_iterations + search_algo algo + size_t team_size + size_t search_width + size_t min_iterations + size_t thread_block_size + hash_mode hashmap_mode + size_t hashmap_min_bitlen + float hashmap_max_fill_rate + uint32_t num_random_samplings + uint64_t rand_xor_mask + + cdef cppclass index[T, IdxT](ann_index): + index(const 
device_resources&) + + DistanceType metric() + IdxT size() + uint32_t dim() + uint32_t graph_degree() + device_matrix_view[T, IdxT, row_major] dataset() + device_matrix_view[T, IdxT, row_major] graph() + +cdef extern from "raft_runtime/neighbors/cagra.hpp" \ + namespace "raft::runtime::neighbors::cagra" nogil: + + cdef void build_device( + const device_resources& handle, + const index_params& params, + device_matrix_view[float, int64_t, row_major] dataset, + index[float, uint32_t]& index) except + + + cdef void build_device( + const device_resources& handle, + const index_params& params, + device_matrix_view[int8_t, int64_t, row_major] dataset, + index[int8_t, uint32_t]& index) except + + + cdef void build_device( + const device_resources& handle, + const index_params& params, + device_matrix_view[uint8_t, int64_t, row_major] dataset, + index[uint8_t, uint32_t]& index) except + + + cdef void build_host( + const device_resources& handle, + const index_params& params, + host_matrix_view[float, int64_t, row_major] dataset, + index[float, uint32_t]& index) except + + + cdef void build_host( + const device_resources& handle, + const index_params& params, + host_matrix_view[int8_t, int64_t, row_major] dataset, + index[int8_t, uint32_t]& index) except + + + cdef void build_host( + const device_resources& handle, + const index_params& params, + host_matrix_view[uint8_t, int64_t, row_major] dataset, + index[uint8_t, uint32_t]& index) except + + + cdef void search( + const device_resources& handle, + const search_params& params, + const index[float, uint32_t]& index, + device_matrix_view[float, int64_t, row_major] queries, + device_matrix_view[uint32_t, int64_t, row_major] neighbors, + device_matrix_view[float, int64_t, row_major] distances) except + + + cdef void search( + const device_resources& handle, + const search_params& params, + const index[int8_t, uint32_t]& index, + device_matrix_view[int8_t, int64_t, row_major] queries, + device_matrix_view[uint32_t, int64_t, 
row_major] neighbors, + device_matrix_view[float, int64_t, row_major] distances) except + + + cdef void search( + const device_resources& handle, + const search_params& params, + const index[uint8_t, uint32_t]& index, + device_matrix_view[uint8_t, int64_t, row_major] queries, + device_matrix_view[uint32_t, int64_t, row_major] neighbors, + device_matrix_view[float, int64_t, row_major] distances) except + + + cdef void serialize(const device_resources& handle, + string& str, + const index[float, uint32_t]& index) except + + + cdef void deserialize(const device_resources& handle, + const string& str, + index[float, uint32_t]* index) except + + + cdef void serialize(const device_resources& handle, + string& str, + const index[uint8_t, uint32_t]& index) except + + + cdef void deserialize(const device_resources& handle, + const string& str, + index[uint8_t, uint32_t]* index) except + + + cdef void serialize(const device_resources& handle, + string& str, + const index[int8_t, uint32_t]& index) except + + + cdef void deserialize(const device_resources& handle, + const string& str, + index[int8_t, uint32_t]* index) except + + + cdef void serialize_file(const device_resources& handle, + const string& filename, + const index[float, uint32_t]& index) except + + + cdef void deserialize_file(const device_resources& handle, + const string& filename, + index[float, uint32_t]* index) except + + + cdef void serialize_file(const device_resources& handle, + const string& filename, + const index[uint8_t, uint32_t]& index) except + + + cdef void deserialize_file(const device_resources& handle, + const string& filename, + index[uint8_t, uint32_t]* index) except + + + cdef void serialize_file(const device_resources& handle, + const string& filename, + const index[int8_t, uint32_t]& index) except + + + cdef void deserialize_file(const device_resources& handle, + const string& filename, + index[int8_t, uint32_t]* index) except + diff --git 
a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx index 0e550547d3..e265bee23b 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx @@ -614,26 +614,10 @@ def search(SearchParams search_params, ... dtype=cp.float32) >>> k = 10 >>> search_params = ivf_flat.SearchParams( - ... n_probes=20, - ... lut_dtype=cp.float16, - ... internal_distance_dtype=cp.float32 - ... ) - - # TODO update example to set default pool allocator - # (instead of passing an mr) - - >>> # Using a pooling allocator reduces overhead of temporary array - >>> # creation during search. This is useful if multiple searches - >>> # are performad with same query size. - >>> import rmm - >>> mr = rmm.mr.PoolMemoryResource( - ... rmm.mr.CudaMemoryResource(), - ... initial_pool_size=2**29, - ... maximum_pool_size=2**31 + ... n_probes=20 ... ) >>> distances, neighbors = ivf_flat.search(search_params, index, queries, - ... k, memory_resource=mr, - ... handle=handle) + ... k, handle=handle) >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized @@ -817,7 +801,7 @@ def load(filename, handle=None): >>> handle = DeviceResources() >>> index = ivf_flat.load("my_index.bin", handle=handle) - >>> distances, neighbors = ivf_flat.search(ivf_pq.SearchParams(), index, + >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index, ... queries, k=10, handle=handle) """ if handle is None: diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pxd new file mode 100644 index 0000000000..1b99da1fd7 --- /dev/null +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pxd @@ -0,0 +1,25 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# distutils: language = c++ + +cimport pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq as c_ivf_pq + + +cdef class IndexParams: + cdef c_ivf_pq.index_params params + +cdef class SearchParams: + cdef c_ivf_pq.search_params params diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index b89e5dd44d..413a9a1d4b 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -95,7 +95,6 @@ cdef _get_dtype_string(dtype): cdef class IndexParams: - cdef c_ivf_pq.index_params params def __init__(self, *, n_lists=1024, @@ -521,7 +520,6 @@ def extend(Index index, new_vectors, new_indices, handle=None): cdef class SearchParams: - cdef c_ivf_pq.search_params params def __init__(self, *, n_probes=20, lut_dtype=np.float32, diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx index 20f5327226..5e57da713c 100644 --- a/python/pylibraft/pylibraft/neighbors/refine.pyx +++ b/python/pylibraft/pylibraft/neighbors/refine.pyx @@ -18,7 +18,6 @@ # cython: embedsignature = True # cython: language_level = 3 -import cupy as cp import numpy as np from cython.operator cimport dereference as deref diff --git a/python/pylibraft/pylibraft/test/test_cagra.py b/python/pylibraft/pylibraft/test/test_cagra.py new file mode 100644 index 0000000000..435b2878a2 --- 
/dev/null +++ b/python/pylibraft/pylibraft/test/test_cagra.py @@ -0,0 +1,296 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# h ttp://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from pylibraft.common import device_ndarray +from pylibraft.neighbors import cagra + + +# todo (dantegd): consolidate helper utils of ann methods +def generate_data(shape, dtype): + if dtype == np.byte: + x = np.random.randint(-127, 128, size=shape, dtype=np.byte) + elif dtype == np.ubyte: + x = np.random.randint(0, 255, size=shape, dtype=np.ubyte) + else: + x = np.random.random_sample(shape).astype(dtype) + + return x + + +def calc_recall(ann_idx, true_nn_idx): + assert ann_idx.shape == true_nn_idx.shape + n = 0 + for i in range(ann_idx.shape[0]): + n += np.intersect1d(ann_idx[i, :], true_nn_idx[i, :]).size + recall = n / ann_idx.size + return recall + + +def run_cagra_build_search_test( + n_rows=10000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="euclidean", + intermediate_graph_degree=128, + graph_degree=64, + array_type="device", + compare=True, + inplace=True, + add_data_on_build=True, + search_params={}, +): + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + dataset_device = device_ndarray(dataset) + + build_params = cagra.IndexParams( + metric=metric, + 
intermediate_graph_degree=intermediate_graph_degree, + graph_degree=graph_degree, + ) + + if array_type == "device": + index = cagra.build(build_params, dataset_device) + else: + index = cagra.build(build_params, dataset) + + assert index.trained + + if not add_data_on_build: + dataset_1 = dataset[: n_rows // 2, :] + dataset_2 = dataset[n_rows // 2 :, :] + indices_1 = np.arange(n_rows // 2, dtype=np.uint32) + indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint32) + if array_type == "device": + dataset_1_device = device_ndarray(dataset_1) + dataset_2_device = device_ndarray(dataset_2) + indices_1_device = device_ndarray(indices_1) + indices_2_device = device_ndarray(indices_2) + index = cagra.extend(index, dataset_1_device, indices_1_device) + index = cagra.extend(index, dataset_2_device, indices_2_device) + else: + index = cagra.extend(index, dataset_1, indices_1) + index = cagra.extend(index, dataset_2, indices_2) + + queries = generate_data((n_queries, n_cols), dtype) + out_idx = np.zeros((n_queries, k), dtype=np.uint32) + out_dist = np.zeros((n_queries, k), dtype=np.float32) + + queries_device = device_ndarray(queries) + out_idx_device = device_ndarray(out_idx) if inplace else None + out_dist_device = device_ndarray(out_dist) if inplace else None + + search_params = cagra.SearchParams(**search_params) + + ret_output = cagra.search( + search_params, + index, + queries_device, + k, + neighbors=out_idx_device, + distances=out_dist_device, + ) + + if not inplace: + out_dist_device, out_idx_device = ret_output + + if not compare: + return + + out_idx = out_idx_device.copy_to_host() + out_dist = out_dist_device.copy_to_host() + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_idx = nn_skl.kneighbors(queries, return_distance=False) + + 
recall = calc_recall(out_idx, skl_idx) + assert recall > 0.7 + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) +@pytest.mark.parametrize("array_type", ["device", "host"]) +def test_cagra_dataset_dtype_host_device(dtype, array_type, inplace): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. + run_cagra_build_search_test( + dtype=dtype, + inplace=inplace, + array_type=array_type, + ) + + +@pytest.mark.parametrize( + "params", + [ + { + "intermediate_graph_degree": 64, + "graph_degree": 32, + "add_data_on_build": True, + "k": 1, + "metric": "euclidean", + }, + { + "intermediate_graph_degree": 32, + "graph_degree": 16, + "add_data_on_build": False, + "k": 5, + "metric": "sqeuclidean", + }, + { + "intermediate_graph_degree": 128, + "graph_degree": 32, + "add_data_on_build": True, + "k": 10, + "metric": "inner_product", + }, + ], +) +def test_cagra_index_params(params): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. 
+ run_cagra_build_search_test( + k=params["k"], + metric=params["metric"], + graph_degree=params["graph_degree"], + intermediate_graph_degree=params["intermediate_graph_degree"], + compare=False, + ) + + +@pytest.mark.parametrize( + "params", + [ + { + "max_queries": 100, + "itopk_size": 32, + "max_iterations": 100, + "algo": "single_cta", + "team_size": 0, + "search_width": 1, + "min_iterations": 1, + "thread_block_size": 64, + "hashmap_mode": "hash", + "hashmap_min_bitlen": 0.2, + "hashmap_max_fill_rate": 0.5, + "num_random_samplings": 1, + }, + { + "max_queries": 10, + "itopk_size": 128, + "max_iterations": 0, + "algo": "multi_cta", + "team_size": 8, + "search_width": 2, + "min_iterations": 10, + "thread_block_size": 0, + "hashmap_mode": "auto", + "hashmap_min_bitlen": 0.9, + "hashmap_max_fill_rate": 0.5, + "num_random_samplings": 10, + }, + { + "max_queries": 0, + "itopk_size": 64, + "max_iterations": 0, + "algo": "multi_kernel", + "team_size": 16, + "search_width": 1, + "min_iterations": 0, + "thread_block_size": 0, + "hashmap_mode": "auto", + "hashmap_min_bitlen": 0, + "hashmap_max_fill_rate": 0.5, + "num_random_samplings": 1, + }, + { + "max_queries": 0, + "itopk_size": 64, + "max_iterations": 0, + "algo": "auto", + "team_size": 32, + "search_width": 4, + "min_iterations": 0, + "thread_block_size": 0, + "hashmap_mode": "small", + "hashmap_min_bitlen": 0, + "hashmap_max_fill_rate": 0.5, + "num_random_samplings": 1, + }, + ], +) +def test_cagra_search_params(params): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. 
+ run_cagra_build_search_test(search_params=params) + + +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.ubyte]) +def test_save_load(dtype): + n_rows = 10000 + n_cols = 50 + n_queries = 1000 + + dataset = generate_data((n_rows, n_cols), dtype) + dataset_device = device_ndarray(dataset) + + build_params = cagra.IndexParams() + index = cagra.build(build_params, dataset_device) + + assert index.trained + filename = "my_index.bin" + cagra.save(filename, index) + loaded_index = cagra.load(filename) + + queries = generate_data((n_queries, n_cols), dtype) + + queries_device = device_ndarray(queries) + search_params = cagra.SearchParams() + k = 10 + + distance_dev, neighbors_dev = cagra.search( + search_params, index, queries_device, k + ) + + neighbors = neighbors_dev.copy_to_host() + dist = distance_dev.copy_to_host() + del index + + distance_dev, neighbors_dev = cagra.search( + search_params, loaded_index, queries_device, k + ) + + neighbors2 = neighbors_dev.copy_to_host() + dist2 = distance_dev.copy_to_host() + + assert np.all(neighbors == neighbors2) + assert np.allclose(dist, dist2, rtol=1e-6) diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/test/test_doctests.py index 19e5c5c22f..c75f565236 100644 --- a/python/pylibraft/pylibraft/test/test_doctests.py +++ b/python/pylibraft/pylibraft/test/test_doctests.py @@ -97,8 +97,11 @@ def _find_doctests_in_obj(obj, finder=None, criteria=None): DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.distance)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.matrix.select_k)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors)) -DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_pq)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.brute_force)) +DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.cagra)) +DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_flat)) 
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_pq)) +DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.refine)) DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.random)) diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index ac60af89d1..cb7e7ad8c2 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -16,11 +16,11 @@ requires = [ "cmake>=3.23.1,!=3.25.0", - "cuda-python>=11.7.1,<12.0", + "cuda-python>=11.7.1,<12.0a0", "cython>=0.29,<0.30", "ninja", - "rmm==23.6.*", - "scikit-build>=0.13.1,<0.17.2", + "rmm==23.8.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -28,7 +28,7 @@ build-backend = "setuptools.build_meta" [project] name = "pylibraft" -version = "23.06.02" +version = "23.08.00" description = "RAFT: Reusable Algorithms Functions and other Tools" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -37,9 +37,9 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cuda-python>=11.7.1,<12.0", + "cuda-python>=11.7.1,<12.0a0", "numpy>=1.21", - "rmm==23.6.*", + "rmm==23.8.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index c2623383ae..9dd8e64698 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -set(raft_dask_version 23.06.02) +set(raft_dask_version 23.08.00) include(../../fetch_rapids.cmake) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 0d0919f421..a33c4fed5e 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -18,14 +18,14 @@ requires = [ "cmake>=3.23.1,!=3.25.0", "cython>=0.29,<0.30", "ninja", - "scikit-build>=0.13.1,<0.17.2", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "raft-dask" -version = "23.06.02" +version = "23.08.00" description = "Reusable Accelerated Functions & Tools Dask Infrastructure" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -34,14 +34,14 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==23.6.*", - "dask==2023.3.2", - "distributed==2023.3.2.1", + "dask-cuda==23.8.*", + "dask==2023.7.1", + "distributed==2023.7.1", "joblib>=0.11", "numba>=0.57", "numpy>=1.21", - "pylibraft==23.6.*", - "ucx-py==0.32.*", + "pylibraft==23.8.*", + "ucx-py==0.33.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py index fd509419a4..f294906058 100644 --- a/python/raft-dask/raft_dask/__init__.py +++ b/python/raft-dask/raft_dask/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. # -__version__ = "23.06.02" +__version__ = "23.08.00" diff --git a/python/raft-dask/raft_dask/common/utils.py b/python/raft-dask/raft_dask/common/utils.py index 78a899aa50..dcc53fda9a 100644 --- a/python/raft-dask/raft_dask/common/utils.py +++ b/python/raft-dask/raft_dask/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/raft-dask/raft_dask/test/conftest.py b/python/raft-dask/raft_dask/test/conftest.py index 39ee21cbaa..d1baa684d4 100644 --- a/python/raft-dask/raft_dask/test/conftest.py +++ b/python/raft-dask/raft_dask/test/conftest.py @@ -1,54 +1,71 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
import os import pytest from dask.distributed import Client -from dask_cuda import LocalCUDACluster, initialize +from dask_cuda import LocalCUDACluster os.environ["UCX_LOG_LEVEL"] = "error" -enable_tcp_over_ucx = True -enable_nvlink = False -enable_infiniband = False - - @pytest.fixture(scope="session") def cluster(): - cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) - yield cluster - cluster.close() + scheduler_file = os.environ.get("SCHEDULER_FILE") + if scheduler_file: + yield scheduler_file + else: + cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + yield cluster + cluster.close() @pytest.fixture(scope="session") def ucx_cluster(): - initialize.initialize( - create_cuda_context=True, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband, - ) - cluster = LocalCUDACluster( - protocol="ucx", - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_nvlink=enable_nvlink, - enable_infiniband=enable_infiniband, - ) - yield cluster - cluster.close() + scheduler_file = os.environ.get("SCHEDULER_FILE") + if scheduler_file: + yield scheduler_file + else: + cluster = LocalCUDACluster( + protocol="ucx", + ) + yield cluster + cluster.close() @pytest.fixture(scope="session") def client(cluster): - client = Client(cluster) + client = create_client(cluster) yield client client.close() @pytest.fixture() def ucx_client(ucx_cluster): - client = Client(cluster) + client = create_client(ucx_cluster) yield client client.close() + + +def create_client(cluster): + """ + Create a Dask distributed client for a specified cluster. + + Parameters + ---------- + cluster : LocalCUDACluster instance or str + If a LocalCUDACluster instance is provided, a client will be created + for it directly. If a string is provided, it should specify the path to + a Dask scheduler file. A client will then be created for the cluster + referenced by this scheduler file. 
+ + Returns + ------- + dask.distributed.Client + A client connected to the specified cluster. + """ + if isinstance(cluster, LocalCUDACluster): + return Client(cluster) + else: + return Client(scheduler_file=cluster) diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/test/test_comms.py index 3a430f9270..68c9fee556 100644 --- a/python/raft-dask/raft_dask/test/test_comms.py +++ b/python/raft-dask/raft_dask/test/test_comms.py @@ -18,6 +18,7 @@ import pytest from dask.distributed import Client, get_worker, wait +from dask_cuda import LocalCUDACluster try: from raft_dask.common import ( @@ -42,10 +43,31 @@ pytestmark = pytest.mark.skip -def test_comms_init_no_p2p(cluster): +def create_client(cluster): + """ + Create a Dask distributed client for a specified cluster. + + Parameters + ---------- + cluster : LocalCUDACluster instance or str + If a LocalCUDACluster instance is provided, a client will be created + for it directly. If a string is provided, it should specify the path to + a Dask scheduler file. A client will then be created for the cluster + referenced by this scheduler file. + + Returns + ------- + dask.distributed.Client + A client connected to the specified cluster. 
+ """ + if isinstance(cluster, LocalCUDACluster): + return Client(cluster) + else: + return Client(scheduler_file=cluster) - client = Client(cluster) +def test_comms_init_no_p2p(cluster): + client = create_client(cluster) try: cb = Comms(verbose=True) cb.init() @@ -121,8 +143,7 @@ def func_check_uid_on_worker(sessionId, uniqueId, dask_worker=None): def test_handles(cluster): - - client = Client(cluster) + client = create_client(cluster) def _has_handle(sessionId): return local_handle(sessionId, dask_worker=get_worker()) is not None diff --git a/scripts/ann-benchmarks/algos.yaml b/scripts/ann-benchmarks/algos.yaml new file mode 100644 index 0000000000..54fddf607b --- /dev/null +++ b/scripts/ann-benchmarks/algos.yaml @@ -0,0 +1,30 @@ +faise_gpu_ivf_flat: + executable: FAISS_IVF_FLAT_ANN_BENCH + disabled: false +faiss_gpu_flat: + executable: FAISS_IVF_FLAT_ANN_BENCH + disabled: false +faiss_gpu_ivf_pq: + executable: FAISS_IVF_PQ_ANN_BENCH + disabled: false +faiss_gpu_ivf_sq: + executable: FAISS_IVF_PQ_ANN_BENCH + disabled: false +faiss_gpu_bfknn: + executable: FAISS_BFKNN_ANN_BENCH + disabled: false +raft_ivf_flat: + executable: RAFT_IVF_FLAT_ANN_BENCH + disabled: false +raft_ivf_pq: + executable: RAFT_IVF_PQ_ANN_BENCH + disabled: false +raft_cagra: + executable: RAFT_CAGRA_ANN_BENCH + disabled: false +ggnn: + executable: GGNN_ANN_BENCH + disabled: false +hnswlib: + executable: HNSWLIB_ANN_BENCH + disabled: false \ No newline at end of file diff --git a/scripts/ann-benchmarks/data_export.py b/scripts/ann-benchmarks/data_export.py new file mode 100644 index 0000000000..5be73bef11 --- /dev/null +++ b/scripts/ann-benchmarks/data_export.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import subprocess + + +def export_results(output_filepath, recompute, groundtruth_filepath, + result_filepaths): + print(f"Writing output file to: {output_filepath}") + ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"), + "cpp/bench/ann/scripts") + ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir, + "eval.pl") + if recompute: + p = subprocess.Popen([ann_bench_scripts_path, "-f", "-o", output_filepath, + groundtruth_filepath] + result_filepaths) + else: + p = subprocess.Popen([ann_bench_scripts_path, "-o", output_filepath, + groundtruth_filepath] + result_filepaths) + p.wait() + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--output", help="Path to the CSV output file", + required=True) + parser.add_argument("--recompute", action="store_true", + help="Recompute metrics") + parser.add_argument("--groundtruth", + help="Path to groundtruth.neighbors.ibin file for a dataset", + required=True) + args, result_filepaths = parser.parse_known_args() + + # if nothing is provided + if len(result_filepaths) == 0: + raise ValueError("No filepaths to results were provided") + + groundtruth_filepath = args.groundtruth + export_results(args.output, args.recompute, groundtruth_filepath, + result_filepaths) + + +if __name__ == "__main__": + main() diff --git a/scripts/ann-benchmarks/get_dataset.py b/scripts/ann-benchmarks/get_dataset.py new file mode 100644 index 0000000000..5c21a5e2e1 --- /dev/null +++ 
b/scripts/ann-benchmarks/get_dataset.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import subprocess +from urllib.request import urlretrieve + + +def get_dataset_path(name, ann_bench_data_path): + if not os.path.exists(ann_bench_data_path): + os.mkdir(ann_bench_data_path) + return os.path.join(ann_bench_data_path, f"{name}.hdf5") + + +def download_dataset(url, path): + if not os.path.exists(path): + print(f"downloading {url} -> {path}...") + urlretrieve(url, path) + + +def convert_hdf5_to_fbin(path, normalize): + ann_bench_scripts_dir = os.path.join(os.getenv("RAFT_HOME"), + "cpp/bench/ann/scripts") + ann_bench_scripts_path = os.path.join(ann_bench_scripts_dir, + "hdf5_to_fbin.py") + if normalize and "angular" in path: + p = subprocess.Popen(["python", ann_bench_scripts_path, "-n", + "%s" % path]) + else: + p = subprocess.Popen(["python", ann_bench_scripts_path, + "%s" % path]) + p.wait() + + +def move(name, ann_bench_data_path): + if "angular" in name: + new_name = name.replace("angular", "inner") + else: + new_name = name + new_path = os.path.join(ann_bench_data_path, new_name) + if not os.path.exists(new_path): + os.mkdir(new_path) + for bin_name in ["base.fbin", "query.fbin", "groundtruth.neighbors.ibin", + "groundtruth.distances.fbin"]: + os.rename(f"{ann_bench_data_path}/{name}.{bin_name}", + f"{new_path}/{bin_name}") + + +def download(name, normalize, 
ann_bench_data_path): + path = get_dataset_path(name, ann_bench_data_path) + try: + url = f"http://ann-benchmarks.com/{name}.hdf5" + download_dataset(url, path) + + convert_hdf5_to_fbin(path, normalize) + + move(name, ann_bench_data_path) + except Exception: + print(f"Cannot download {url}") + raise + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--name", help="dataset to download", + default="glove-100-angular") + parser.add_argument("--path", help="path to download dataset", + default=os.path.join(os.getcwd(), "data")) + parser.add_argument("--normalize", + help="normalize cosine distance to inner product", + action="store_true") + args = parser.parse_args() + + download(args.name, args.normalize, args.path) + + +if __name__ == "__main__": + main() diff --git a/scripts/ann-benchmarks/plot.py b/scripts/ann-benchmarks/plot.py new file mode 100644 index 0000000000..772bdf8738 --- /dev/null +++ b/scripts/ann-benchmarks/plot.py @@ -0,0 +1,240 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This script is inspired by +# 1: https://github.com/erikbern/ann-benchmarks/blob/main/plot.py +# 2: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/utils.py +# 3: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/metrics.py +# Licence: https://github.com/erikbern/ann-benchmarks/blob/main/LICENSE + +import matplotlib as mpl + +mpl.use("Agg") # noqa +import argparse +import itertools +import matplotlib.pyplot as plt +import numpy as np +import os + + + +metrics = { + "k-nn": { + "description": "Recall", + "worst": float("-inf"), + "lim": [0.0, 1.03], + }, + "qps": { + "description": "Queries per second (1/s)", + "worst": float("-inf"), + } +} + + +def generate_n_colors(n): + vs = np.linspace(0.3, 0.9, 7) + colors = [(0.9, 0.4, 0.4, 1.0)] + + def euclidean(a, b): + return sum((x - y) ** 2 for x, y in zip(a, b)) + + while len(colors) < n: + new_color = max(itertools.product(vs, vs, vs), key=lambda a: min(euclidean(a, b) for b in colors)) + colors.append(new_color + (1.0,)) + return colors + + +def create_linestyles(unique_algorithms): + colors = dict(zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) + linestyles = dict((algo, ["--", "-.", "-", ":"][i % 4]) for i, algo in enumerate(unique_algorithms)) + markerstyles = dict((algo, ["+", "<", "o", "*", "x"][i % 5]) for i, algo in enumerate(unique_algorithms)) + faded = dict((algo, (r, g, b, 0.3)) for algo, (r, g, b, a) in colors.items()) + return dict((algo, (colors[algo], faded[algo], linestyles[algo], markerstyles[algo])) for algo in unique_algorithms) + + +def get_up_down(metric): + if metric["worst"] == float("inf"): + return "down" + return "up" + + +def get_left_right(metric): + if metric["worst"] == float("inf"): + return "left" + return "right" + + +def get_plot_label(xm, ym): + template = "%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and" " to the %(leftright)s is better" + return template % { + "xlabel": xm["description"], + "ylabel": 
ym["description"], + "updown": get_up_down(ym), + "leftright": get_left_right(xm), + } + + +def create_pointset(data, xn, yn): + xm, ym = (metrics[xn], metrics[yn]) + rev_y = -1 if ym["worst"] < 0 else 1 + rev_x = -1 if xm["worst"] < 0 else 1 + data.sort(key=lambda t: (rev_y * t[-1], rev_x * t[-2])) + + axs, ays, als = [], [], [] + # Generate Pareto frontier + xs, ys, ls = [], [], [] + last_x = xm["worst"] + comparator = (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) + for algo_name, xv, yv in data: + if not xv or not yv: + continue + axs.append(xv) + ays.append(yv) + als.append(algo_name) + if comparator(xv, last_x): + last_x = xv + xs.append(xv) + ys.append(yv) + ls.append(algo_name) + return xs, ys, ls, axs, ays, als + + +def create_plot(all_data, raw, x_scale, y_scale, fn_out, linestyles): + xn = "k-nn" + yn = "qps" + xm, ym = (metrics[xn], metrics[yn]) + # Now generate each plot + handles = [] + labels = [] + plt.figure(figsize=(12, 9)) + + # Sorting by mean y-value helps aligning plots with labels + def mean_y(algo): + xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) + return -np.log(np.array(ys)).mean() + + # Find range for logit x-scale + min_x, max_x = 1, 0 + for algo in sorted(all_data.keys(), key=mean_y): + xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) + min_x = min([min_x] + [x for x in xs if x > 0]) + max_x = max([max_x] + [x for x in xs if x < 1]) + color, faded, linestyle, marker = linestyles[algo] + (handle,) = plt.plot( + xs, ys, "-", label=algo, color=color, ms=7, mew=3, lw=3, marker=marker + ) + handles.append(handle) + if raw: + (handle2,) = plt.plot( + axs, ays, "-", label=algo, color=faded, ms=5, mew=2, lw=2, marker=marker + ) + labels.append(algo) + + ax = plt.gca() + ax.set_ylabel(ym["description"]) + ax.set_xlabel(xm["description"]) + # Custom scales of the type --x-scale a3 + if x_scale[0] == "a": + alpha = float(x_scale[1:]) + + def fun(x): + return 1 - (1 - x) ** (1 / alpha) 
+ + def inv_fun(x): + return 1 - (1 - x) ** alpha + + ax.set_xscale("function", functions=(fun, inv_fun)) + if alpha <= 3: + ticks = [inv_fun(x) for x in np.arange(0, 1.2, 0.2)] + plt.xticks(ticks) + if alpha > 3: + from matplotlib import ticker + + ax.xaxis.set_major_formatter(ticker.LogitFormatter()) + # plt.xticks(ticker.LogitLocator().tick_values(min_x, max_x)) + plt.xticks([0, 1 / 2, 1 - 1e-1, 1 - 1e-2, 1 - 1e-3, 1 - 1e-4, 1]) + # Other x-scales + else: + ax.set_xscale(x_scale) + ax.set_yscale(y_scale) + ax.set_title(get_plot_label(xm, ym)) + plt.gca().get_position() + # plt.gca().set_position([box.x0, box.y0, box.width * 0.8, box.height]) + ax.legend(handles, labels, loc="center left", bbox_to_anchor=(1, 0.5), prop={"size": 9}) + plt.grid(visible=True, which="major", color="0.65", linestyle="-") + plt.setp(ax.get_xminorticklabels(), visible=True) + + # Logit scale has to be a subset of (0,1) + if "lim" in xm and x_scale != "logit": + x0, x1 = xm["lim"] + plt.xlim(max(x0, 0), min(x1, 1)) + elif x_scale == "logit": + plt.xlim(min_x, max_x) + if "lim" in ym: + plt.ylim(ym["lim"]) + + # Workaround for bug https://github.com/matplotlib/matplotlib/issues/6789 + ax.spines["bottom"]._adjust_location() + + plt.savefig(fn_out, bbox_inches="tight") + plt.close() + + +def load_all_results(result_filepath): + results = dict() + with open(result_filepath, 'r') as f: + for line in f.readlines()[1:]: + split_lines = line.split(',') + algo_name = split_lines[0].split('.')[0] + if algo_name not in results: + results[algo_name] = [] + results[algo_name].append([algo_name, float(split_lines[1]), + float(split_lines[2])]) + return results + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--result_csv", help="Path to CSV Results", required=True) + parser.add_argument("--output", help="Path to the PNG output file", + default=f"{os.getcwd()}/out.png") + parser.add_argument( + "--x-scale", + 
help="Scale to use when drawing the X-axis. \ + Typically linear, logit or a2", + default="linear" + ) + parser.add_argument( + "--y-scale", + help="Scale to use when drawing the Y-axis", + choices=["linear", "log", "symlog", "logit"], + default="linear", + ) + parser.add_argument( + "--raw", help="Show raw results (not just Pareto frontier) in faded colours", action="store_true" + ) + args = parser.parse_args() + + print(f"writing output to {args.output}") + + results = load_all_results(args.result_csv) + linestyles = create_linestyles(sorted(results.keys())) + + create_plot(results, args.raw, args.x_scale, args.y_scale, args.output, linestyles) + + +if __name__ == "__main__": + main() diff --git a/scripts/ann-benchmarks/run.py b/scripts/ann-benchmarks/run.py new file mode 100644 index 0000000000..e2236dce81 --- /dev/null +++ b/scripts/ann-benchmarks/run.py @@ -0,0 +1,185 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import argparse
import json
import os
import subprocess
import yaml


def validate_algorithm(algos_conf, algo):
    """Tell whether ``algo`` is known and enabled.

    Parameters
    ----------
    algos_conf : dict
        Mapping of algorithm name to its settings. The ``"disabled"`` key
        of the matching entry is read.
    algo : str
        Algorithm name to look up.

    Returns
    -------
    bool
        True when ``algo`` is a key of ``algos_conf`` and its entry is not
        flagged as disabled.
    """
    return algo in algos_conf and not algos_conf[algo]["disabled"]


def find_executable(algos_conf, algo):
    """Locate the benchmark binary configured for ``algo``.

    ``$CONDA_PREFIX/bin/ann`` is tried first, then ``$RAFT_HOME/cpp/build``.
    A location whose environment variable is unset is skipped instead of
    aborting the lookup, so having only one of the two variables defined
    is enough.

    Parameters
    ----------
    algos_conf : dict
        Mapping of algorithm name to its settings. The ``"executable"`` key
        of the matching entry is read.
    algo : str
        Algorithm name; must be a key of ``algos_conf``.

    Returns
    -------
    tuple
        ``(executable, path)``: the configured executable name and the
        first candidate path that exists.

    Raises
    ------
    FileNotFoundError
        If the executable exists in none of the candidate locations.
    """
    executable = algos_conf[algo]["executable"]

    candidates = []
    conda_prefix = os.getenv("CONDA_PREFIX")
    if conda_prefix is not None:
        candidates.append(
            os.path.join(conda_prefix, "bin", "ann", executable))
    raft_home = os.getenv("RAFT_HOME")
    if raft_home is not None:
        candidates.append(
            os.path.join(raft_home, "cpp", "build", executable))

    for candidate in candidates:
        if os.path.exists(candidate):
            return (executable, candidate)
    raise FileNotFoundError(executable)


def resolve_phases(build_flag, search_flag):
    """Translate the ``--build`` / ``--search`` flags into phases to run.

    Returns
    -------
    tuple of bool
        ``(build, search)``. When neither flag was given both phases run;
        otherwise exactly the requested phases run.
    """
    if not build_flag and not search_flag:
        return True, True
    return bool(build_flag), bool(search_flag)


def collect_executables(conf_file, algos_conf, indices=None,
                        algorithms=None):
    """Group the index entries of ``conf_file`` by the binary that runs them.

    Parameters
    ----------
    conf_file : dict
        Parsed dataset configuration; its ``"index"`` list is scanned and
        the ``"algo"`` (and, when filtering by index, ``"name"``) key of
        each entry is read.
    algos_conf : dict
        Mapping of algorithm name to its settings.
    indices : str, optional
        Comma separated index names. When given, only these indices are
        kept and ``algorithms`` is ignored.
    algorithms : str, optional
        Comma separated algorithm names. Used only when ``indices`` is not
        given.

    Returns
    -------
    dict
        Maps ``(executable, path)`` to ``{"index": [entries...]}``. Entries
        whose algorithm is unknown or disabled are left out.
    """
    if indices:
        wanted = set(indices.split(","))

        def is_selected(index):
            return index["name"] in wanted
    elif algorithms:
        wanted = set(algorithms.split(","))

        def is_selected(index):
            return index["algo"] in wanted
    else:
        # default, try to run all available algorithms
        def is_selected(index):
            return True

    executables_to_run = dict()
    for index in conf_file["index"]:
        curr_algo = index["algo"]
        if is_selected(index) and validate_algorithm(algos_conf, curr_algo):
            executable_path = find_executable(algos_conf, curr_algo)
            group = executables_to_run.setdefault(executable_path,
                                                  {"index": []})
            group["index"].append(index)
    return executables_to_run


def run_build_and_search(conf_filename, conf_file, executables_to_run,
                         force, conf_filedir, build, search):
    """Run the build and/or search phase of every selected benchmark binary.

    For each binary a temporary configuration holding only that binary's
    indices is written next to the dataset configuration, the binary is
    invoked with ``-b`` and/or ``-s`` (plus ``-f`` when ``force`` is set),
    and the temporary file is removed again, even if launching the binary
    raises. The exit status of the binary is not inspected.

    Parameters
    ----------
    conf_filename : str
        File name of the dataset configuration; used to derive the name of
        the temporary file.
    conf_file : dict
        Parsed dataset configuration; its ``"dataset"`` and
        ``"search_basic_param"`` entries are copied verbatim.
    executables_to_run : dict
        Maps ``(executable, path)`` to ``{"index": [entries...]}``.
    force : bool
        Pass ``-f`` to the binary.
    conf_filedir : str
        Directory in which the temporary configuration is written.
    build, search : bool
        Which phases to run.
    """
    temp_conf_filename = f"temporary_executable_{conf_filename}"
    temp_conf_filepath = os.path.join(conf_filedir, temp_conf_filename)

    phase_flags = []
    if build:
        phase_flags.append("-b")
    if search:
        phase_flags.append("-s")
    force_flag = ["-f"] if force else []

    for (_, ann_executable_path), selection in executables_to_run.items():
        # Need to write temporary configuration
        temp_conf = {
            "dataset": conf_file["dataset"],
            "search_basic_param": conf_file["search_basic_param"],
            "index": selection["index"],
        }
        with open(temp_conf_filepath, "w") as f:
            json.dump(temp_conf, f)

        try:
            for phase_flag in phase_flags:
                subprocess.run([ann_executable_path, phase_flag,
                                *force_flag, temp_conf_filepath])
        finally:
            os.remove(temp_conf_filepath)


def main():
    """Command-line entry point.

    Parses the arguments, loads ``algos.yaml`` from the directory of this
    script and the dataset configuration given by ``--configuration``,
    checks that the dataset files exist, selects the indices to run and
    launches the requested phases.

    Raises
    ------
    FileNotFoundError
        If the configuration file, the dataset base/query file, or a
        required benchmark binary does not exist.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--configuration",
        help="path to configuration file for a dataset",
        required=True
    )
    parser.add_argument(
        "--build",
        action="store_true"
    )
    parser.add_argument(
        "--search",
        action="store_true"
    )
    parser.add_argument("--algorithms",
                        help="run only comma separated list of named "
                             "algorithms",
                        default=None)
    parser.add_argument("--indices",
                        help="run only comma separated list of named "
                             "indices. parameter `algorithms` is ignored",
                        default=None)
    parser.add_argument("--force",
                        help="re-run algorithms even if their results "
                             "already exist",
                        action="store_true")

    # Parse before touching the filesystem so that `--help` and usage
    # errors do not depend on algos.yaml being readable.
    args = parser.parse_args()

    # Read list of allowed algorithms
    scripts_path = os.path.dirname(os.path.realpath(__file__))
    with open(f"{scripts_path}/algos.yaml", "r") as f:
        algos_conf = yaml.safe_load(f)

    # If both build and search are not provided, run both. Both names are
    # always bound, including when only one of the two flags is passed.
    build, search = resolve_phases(args.build, args.search)

    # Read configuration file associated to dataset
    conf_filepath = args.configuration
    conf_filedir, conf_filename = os.path.split(conf_filepath)
    if not os.path.exists(conf_filepath):
        raise FileNotFoundError(conf_filename)

    with open(conf_filepath, "r") as f:
        conf_file = json.load(f)

    # Ensure base and query files exist for dataset
    for dataset_key in ("base_file", "query_file"):
        dataset_path = conf_file["dataset"][dataset_key]
        if not os.path.exists(dataset_path):
            raise FileNotFoundError(dataset_path)

    # Named indices take precedence over named algorithms; with neither,
    # every enabled algorithm found in the configuration is run.
    executables_to_run = collect_executables(
        conf_file, algos_conf,
        indices=args.indices, algorithms=args.algorithms)

    run_build_and_search(conf_filename, conf_file, executables_to_run,
                         args.force, conf_filedir, build, search)


if __name__ == "__main__":
    main()

# Patch residue from the original chunk, kept verbatim (one "# " prefix per
# original patch line): header and opening lines of the next file.
# diff --git a/scripts/ann-benchmarks/split_groundtruth.py b/scripts/ann-benchmarks/split_groundtruth.py
# new file mode 100644
# index 0000000000..cd67d9c8b8
# --- /dev/null
# +++ b/scripts/ann-benchmarks/split_groundtruth.py
# @@ -0,0 +1,47 @@
# +#
# +# Copyright (c) 2023, NVIDIA CORPORATION.
# +#
# +# Licensed under the Apache License, Version 2.0 (the "License");
# +# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess


def split_groundtruth(groundtruth_filepath):
    """Run ``split_groundtruth.pl`` on a groundtruth file.

    The script is looked up under ``$RAFT_HOME/cpp/bench/ann/scripts`` and
    started with the groundtruth file's directory as its working
    directory, receiving the file name and the literal ``"groundtruth"``
    as arguments. The working directory of the calling process is left
    untouched. The exit status of the script is not inspected.

    Parameters
    ----------
    groundtruth_filepath : str
        Path to the groundtruth file. A bare file name (no directory part)
        is taken relative to the current working directory.

    Raises
    ------
    FileNotFoundError
        If ``RAFT_HOME`` is not set, so the script cannot be located.
    """
    raft_home = os.getenv("RAFT_HOME")
    if raft_home is None:
        raise FileNotFoundError(
            "RAFT_HOME is not set; cannot locate split_groundtruth.pl")

    ann_bench_scripts_dir = os.path.join(raft_home,
                                         "cpp/bench/ann/scripts")
    # Absolute path: the child runs in another directory, so a relative
    # RAFT_HOME must be resolved against the caller's working directory.
    ann_bench_scripts_path = os.path.abspath(
        os.path.join(ann_bench_scripts_dir, "split_groundtruth.pl"))

    groundtruth_dir, groundtruth_filename = os.path.split(
        groundtruth_filepath)
    # An empty directory part means "current directory"; cwd=None makes the
    # child inherit it instead of failing on an empty path.
    subprocess.run([ann_bench_scripts_path, groundtruth_filename,
                    "groundtruth"],
                   cwd=groundtruth_dir or None)


def main():
    """Command-line entry point: split the file given by ``--groundtruth``."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--groundtruth",
                        help="Path to billion-scale dataset groundtruth file",
                        required=True)
    args = parser.parse_args()

    split_groundtruth(args.groundtruth)


if __name__ == "__main__":
    main()

# Patch residue from the original chunk, kept verbatim (one "# " prefix per
# original patch line): header and opening lines of the next file.
# diff --git a/thirdparty/LICENSES/LICENSE.ann-benchmark b/thirdparty/LICENSES/LICENSE.ann-benchmark
# new file mode 100644
# index 0000000000..9f8e4222f6
# --- /dev/null
# +++ b/thirdparty/LICENSES/LICENSE.ann-benchmark
# @@ -0,0 +1,21 @@
# +MIT License
# +
# +Copyright (c) 2018 Erik Bernhardsson
# +
# +Permission is hereby granted, free of charge, to any person obtaining a copy
# +of this software and associated documentation files (the "Software"), to deal
# +in the Software without restriction, including without limitation the rights
# +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# +copies of the Software, and to permit persons to whom the Software is
# +furnished to do so, subject to the following
conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file