diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 8da9b5428a..203f52f1a2 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 0b3ec79e37..080ece996e 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index f5af166b46..da8bfb4db9 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": 
"12.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json index 9f28002d38..e2bee94f8a 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 107823d5ee..2435c477ca 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,13 +22,13 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: 
build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 55117f774a..1ddd5b5cc3 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e539877851..bd37b8ac01 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -15,6 +15,7 @@ jobs: - checks - conda-cpp-build - conda-cpp-tests + - conda-cpp-checks - conda-python-build - conda-python-tests - docs-build @@ -24,41 +25,49 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12 + with: + build_type: pull-request + enable_check_symbols: true + symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -68,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: 
wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 with: build_command: | sccache -z; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a80d5ff0cf..0c46d83cf9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,9 +14,19 @@ on: type: string jobs: + conda-cpp-checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-23.12 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + enable_check_symbols: true + symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ 
-32,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66862ada5e..80ad3614bc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -105,7 +105,10 @@ repos: hooks: - id: rapids-dependency-file-generator args: ["--clean"] - + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-json default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 8edc900ccb..982ebe99e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,90 @@ +# raft 23.12.00 (6 Dec 2023) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#2037](https://github.com/rapidsai/raft/pull/2037)) [@raydouglass](https://github.com/raydouglass) +- pylibraft only depends on numpy at runtime, not build time. 
([#2013](https://github.com/rapidsai/raft/pull/2013)) [@bdice](https://github.com/bdice) +- Fixes to update-version.sh ([#1991](https://github.com/rapidsai/raft/pull/1991)) [@raydouglass](https://github.com/raydouglass) +- Adjusting end-to-end start time so it doesn't include stream creation time ([#1989](https://github.com/rapidsai/raft/pull/1989)) [@cjnolet](https://github.com/cjnolet) +- CAGRA graph optimizer: clamp rev_graph_count ([#1987](https://github.com/rapidsai/raft/pull/1987)) [@tfeher](https://github.com/tfeher) +- Catching conversion errors in data_export instead of fully failing ([#1979](https://github.com/rapidsai/raft/pull/1979)) [@cjnolet](https://github.com/cjnolet) +- Fix syncing mechanism in `raft-ann-bench` C++ search ([#1961](https://github.com/rapidsai/raft/pull/1961)) [@divyegala](https://github.com/divyegala) +- Fixing hnswlib in latency mode ([#1959](https://github.com/rapidsai/raft/pull/1959)) [@cjnolet](https://github.com/cjnolet) +- Fix `ucx-py` alpha version update for `raft-dask` ([#1953](https://github.com/rapidsai/raft/pull/1953)) [@divyegala](https://github.com/divyegala) +- Reduce NN Descent test threshold ([#1946](https://github.com/rapidsai/raft/pull/1946)) [@divyegala](https://github.com/divyegala) +- Fixes to new YAML config `raft-bench-ann` ([#1945](https://github.com/rapidsai/raft/pull/1945)) [@divyegala](https://github.com/divyegala) +- Set RNG seeds in NN Descent to diagnose flaky tests ([#1931](https://github.com/rapidsai/raft/pull/1931)) [@divyegala](https://github.com/divyegala) +- Fix FAISS CPU algorithm names in `raft-ann-bench` ([#1916](https://github.com/rapidsai/raft/pull/1916)) [@divyegala](https://github.com/divyegala) +- Increase iterations in NN Descent tests to avoid flakiness ([#1915](https://github.com/rapidsai/raft/pull/1915)) [@divyegala](https://github.com/divyegala) +- Fix filepath in `raft-ann-bench/split_groundtruth` module ([#1911](https://github.com/rapidsai/raft/pull/1911)) 
[@divyegala](https://github.com/divyegala) +- Remove dynamic entry-points from raft-ann-bench ([#1910](https://github.com/rapidsai/raft/pull/1910)) [@benfred](https://github.com/benfred) +- Remove unnecessary dataset path check in ANN bench ([#1908](https://github.com/rapidsai/raft/pull/1908)) [@tfeher](https://github.com/tfeher) +- Fixing Googletests and re-enabling in CI ([#1904](https://github.com/rapidsai/raft/pull/1904)) [@cjnolet](https://github.com/cjnolet) +- Fix NN Descent overflows ([#1875](https://github.com/rapidsai/raft/pull/1875)) [@divyegala](https://github.com/divyegala) +- Build fix for CUDA 12.2 ([#1870](https://github.com/rapidsai/raft/pull/1870)) [@benfred](https://github.com/benfred) +- [BUG] Fix a bug in NN descent ([#1869](https://github.com/rapidsai/raft/pull/1869)) [@enp1s0](https://github.com/enp1s0) + +## 📖 Documentation + +- Brute Force Index documentation fix ([#1944](https://github.com/rapidsai/raft/pull/1944)) [@lowener](https://github.com/lowener) +- Add `wiki_all` dataset config and documentation. 
([#1918](https://github.com/rapidsai/raft/pull/1918)) [@cjnolet](https://github.com/cjnolet) +- Updates to raft-ann-bench docs ([#1905](https://github.com/rapidsai/raft/pull/1905)) [@cjnolet](https://github.com/cjnolet) +- End-to-end vector search tutorial in docs ([#1776](https://github.com/rapidsai/raft/pull/1776)) [@cjnolet](https://github.com/cjnolet) + +## 🚀 New Features + +- Adding `dry-run` option to `raft-ann-bench` ([#1970](https://github.com/rapidsai/raft/pull/1970)) [@cjnolet](https://github.com/cjnolet) +- Add ANN bench scripts to generate ground truth ([#1967](https://github.com/rapidsai/raft/pull/1967)) [@tfeher](https://github.com/tfeher) +- CAGRA build + HNSW search ([#1956](https://github.com/rapidsai/raft/pull/1956)) [@divyegala](https://github.com/divyegala) +- Verify conda-cpp-post-build-checks ([#1935](https://github.com/rapidsai/raft/pull/1935)) [@robertmaynard](https://github.com/robertmaynard) +- Make all cuda kernels have hidden visibility ([#1898](https://github.com/rapidsai/raft/pull/1898)) [@robertmaynard](https://github.com/robertmaynard) +- Update rapids-cmake functions to non-deprecated signatures ([#1884](https://github.com/rapidsai/raft/pull/1884)) [@robertmaynard](https://github.com/robertmaynard) +- [FEA] Helpers for identifying contiguous layouts. 
([#1861](https://github.com/rapidsai/raft/pull/1861)) [@trivialfis](https://github.com/trivialfis) +- Add `raft::stats::neighborhood_recall` ([#1860](https://github.com/rapidsai/raft/pull/1860)) [@divyegala](https://github.com/divyegala) +- [FEA] Helpers and CodePacker for IVF-PQ ([#1826](https://github.com/rapidsai/raft/pull/1826)) [@tarang-jain](https://github.com/tarang-jain) + +## 🛠️ Improvements + +- Pinning fmt and spdlog for raft-ann-bench-cpu ([#2018](https://github.com/rapidsai/raft/pull/2018)) [@cjnolet](https://github.com/cjnolet) +- Build concurrency for nightly and merge triggers ([#2011](https://github.com/rapidsai/raft/pull/2011)) [@bdice](https://github.com/bdice) +- Using `EXPORT_SET` in `rapids_find_package_root` ([#2006](https://github.com/rapidsai/raft/pull/2006)) [@cjnolet](https://github.com/cjnolet) +- Remove static checks for serialization size ([#1997](https://github.com/rapidsai/raft/pull/1997)) [@cjnolet](https://github.com/cjnolet) +- Skipping bad json parse ([#1990](https://github.com/rapidsai/raft/pull/1990)) [@cjnolet](https://github.com/cjnolet) +- Update select-k heuristic ([#1985](https://github.com/rapidsai/raft/pull/1985)) [@benfred](https://github.com/benfred) +- ANN bench: use different offset for each thread ([#1981](https://github.com/rapidsai/raft/pull/1981)) [@tfeher](https://github.com/tfeher) +- Allow `raft-ann-bench/run` to continue after encountering bad YAML configs ([#1980](https://github.com/rapidsai/raft/pull/1980)) [@divyegala](https://github.com/divyegala) +- Add build and search params to `raft-ann-bench.data_export` CSVs ([#1971](https://github.com/rapidsai/raft/pull/1971)) [@divyegala](https://github.com/divyegala) +- Use new `rapids-dask-dependency` metapackage for managing dask versions ([#1968](https://github.com/rapidsai/raft/pull/1968)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused header ([#1960](https://github.com/rapidsai/raft/pull/1960)) [@wphicks](https://github.com/wphicks) +- 
Adding pool back in and fixing cagra benchmark params ([#1951](https://github.com/rapidsai/raft/pull/1951)) [@cjnolet](https://github.com/cjnolet) +- Add constraints to `hnswlib` in `raft-bench-ann` ([#1949](https://github.com/rapidsai/raft/pull/1949)) [@divyegala](https://github.com/divyegala) +- Add support for iterating over batches in bfknn ([#1947](https://github.com/rapidsai/raft/pull/1947)) [@benfred](https://github.com/benfred) +- Fix ANN bench latency ([#1940](https://github.com/rapidsai/raft/pull/1940)) [@tfeher](https://github.com/tfeher) +- Add YAML config files to run parameter sweeps for ANN benchmarks ([#1929](https://github.com/rapidsai/raft/pull/1929)) [@divyegala](https://github.com/divyegala) +- Relax ucx pinning ([#1927](https://github.com/rapidsai/raft/pull/1927)) [@vyasr](https://github.com/vyasr) +- Try using contiguous rank to fix cuda_visible_devices ([#1926](https://github.com/rapidsai/raft/pull/1926)) [@VibhuJawa](https://github.com/VibhuJawa) +- Unpin `dask` and `distributed` for `23.12` development ([#1925](https://github.com/rapidsai/raft/pull/1925)) [@galipremsagar](https://github.com/galipremsagar) +- Adding `throughput` and `latency` modes to `raft-ann-bench` ([#1920](https://github.com/rapidsai/raft/pull/1920)) [@cjnolet](https://github.com/cjnolet) +- Providing `aarch64` yaml environment files ([#1914](https://github.com/rapidsai/raft/pull/1914)) [@cjnolet](https://github.com/cjnolet) +- CAGRA ANN bench: parse build options for IVF-PQ build algo ([#1912](https://github.com/rapidsai/raft/pull/1912)) [@tfeher](https://github.com/tfeher) +- Fix python script location in ANN bench description ([#1906](https://github.com/rapidsai/raft/pull/1906)) [@tfeher](https://github.com/tfeher) +- Refactor install/build guide. 
([#1899](https://github.com/rapidsai/raft/pull/1899)) [@cjnolet](https://github.com/cjnolet) +- Check return values of raft-ann-bench subprocess calls ([#1897](https://github.com/rapidsai/raft/pull/1897)) [@benfred](https://github.com/benfred) +- ANN bench options to specify CAGRA graph and dataset locations ([#1896](https://github.com/rapidsai/raft/pull/1896)) [@cjnolet](https://github.com/cjnolet) +- Add check-json to pre-commit linters, and fix invalid ann-bench JSON config ([#1894](https://github.com/rapidsai/raft/pull/1894)) [@benfred](https://github.com/benfred) +- Use branch-23.12 workflows. ([#1886](https://github.com/rapidsai/raft/pull/1886)) [@bdice](https://github.com/bdice) +- Setup Consistent Nightly Versions for Pip and Conda ([#1880](https://github.com/rapidsai/raft/pull/1880)) [@divyegala](https://github.com/divyegala) +- Fix and improve one-block radix select ([#1878](https://github.com/rapidsai/raft/pull/1878)) [@yong-wang](https://github.com/yong-wang) +- [FEA] Improvements on bitset class ([#1877](https://github.com/rapidsai/raft/pull/1877)) [@lowener](https://github.com/lowener) +- Branch 23.12 merge 23.10 ([#1873](https://github.com/rapidsai/raft/pull/1873)) [@AyodeAwe](https://github.com/AyodeAwe) +- Branch 23.12 merge 23.10 ([#1868](https://github.com/rapidsai/raft/pull/1868)) [@cjnolet](https://github.com/cjnolet) +- Replace `raft::random` calls to not use deprecated API ([#1867](https://github.com/rapidsai/raft/pull/1867)) [@lowener](https://github.com/lowener) +- raft: Build CUDA 12.0 ARM conda packages. ([#1853](https://github.com/rapidsai/raft/pull/1853)) [@bdice](https://github.com/bdice) +- Documentation for raft ANN benchmark containers. 
([#1833](https://github.com/rapidsai/raft/pull/1833)) [@dantegd](https://github.com/dantegd) +- [FEA] Support vector deletion in ANN IVF ([#1831](https://github.com/rapidsai/raft/pull/1831)) [@lowener](https://github.com/lowener) +- Provide a raft::copy overload for mdspan-to-mdspan copies ([#1818](https://github.com/rapidsai/raft/pull/1818)) [@wphicks](https://github.com/wphicks) +- Adding FAISS cpu to `raft-ann-bench` ([#1814](https://github.com/rapidsai/raft/pull/1814)) [@cjnolet](https://github.com/cjnolet) + # raft 23.10.00 (11 Oct 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index 56d422b489..5b1297b63c 100755 --- a/README.md +++ b/README.md @@ -255,106 +255,54 @@ pairwise_distance(in1, in2, out=output, metric="euclidean") ## Installing -RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on installing and building RAFT and using it in downstream projects. +RAFT's C++ and Python libraries can both be installed through Conda and the Python libraries through Pip. -### Conda + +### Installing C++ and Python through Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-headers` RAFT headers -- `libraft` (optional) shared library of pre-compiled template instantiations and runtime APIs. -- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. -- `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. +- `libraft-headers` C++ headers +- `libraft` (optional) C++ shared library containing pre-compiled template instantiations and runtime API. 
+- `pylibraft` (optional) Python library +- `raft-dask` (optional) Python library for deployment of multi-node multi-GPU algorithms that use the RAFT `raft::comms` abstraction layer in Dask clusters. +- `raft-ann-bench` (optional) Benchmarking tool for easily producing benchmarks that compare RAFT's vector search algorithms against other state-of-the-art implementations. +- `raft-ann-bench-cpu` (optional) Reproducible benchmarking tool similar to above, but doesn't require CUDA to be installed on the machine. Can be used to test in environments with competitive CPUs. + +Use the following command, depending on your CUDA version, to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. +```bash +# for CUDA 11.8 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=11.8 +``` -Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. ```bash -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 ``` -You can also install the conda packages individually using the `mamba` command above. +Note that the above commands will also install `libraft-headers` and `libraft`. + +You can also install the conda packages individually using the `mamba` command above. 
For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project: +```bash +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 +``` -After installing RAFT, `find_package(raft COMPONENTS compiled distributed)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-23.12/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. -### Pip +### Installing Python through Pip -pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): +`pylibraft` and `raft-dask` both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): ```bash pip install pylibraft-cu11 --extra-index-url=https://pypi.nvidia.com pip install raft-dask-cu11 --extra-index-url=https://pypi.nvidia.com ``` -### CMake & CPM - -RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it easy to include in downstream cmake projects. RAPIDS-CMake provides a convenience layer around CPM. Please refer to [these instructions](https://github.com/rapidsai/rapids-cmake#installation) to install and use rapids-cmake in your project. - -#### Example Template Project +These packages statically build RAFT's pre-compiled instantiations and so the C++ headers and pre-compiled shared library won't be readily available to use in your code. 
-You can find an [example RAFT](cpp/template/README.md) project template in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing cmake project. +The [build instructions](https://docs.rapids.ai/api/raft/nightly/build/) contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet in the [Building RAFT C++ and Python from source](https://docs.rapids.ai/api/raft/nightly/build/#building-c-and-python-from-source) section of the build instructions. -#### CMake Targets - -Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies. - -| Component | Target | Description | Base Dependencies | -|-------------|---------------------|----------------------------------------------------------|----------------------------------------| -| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | -| compiled | `raft::compiled` | Pre-compiled template instantiations and runtime library | raft::raft | -| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | - -### Source - -The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: -1.
Create an environment with the needed dependencies: -``` -mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml -mamba activate raft_dev_env -``` -``` -./build.sh raft-dask pylibraft libraft tests bench --compile-lib -``` +You can find an example [RAFT project template](cpp/template/README.md) in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing CMake project. -The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions. - -## Folder Structure and Contents - -The folder structure mirrors other RAPIDS repos, with the following folders: - -- `bench/ann`: Python scripts for running ANN benchmarks -- `ci`: Scripts for running CI in PRs -- `conda`: Conda recipes and development conda environments -- `cpp`: Source code for C++ libraries. - - `bench`: Benchmarks source code - - `cmake`: CMake modules and templates - - `doxygen`: Doxygen configuration - - `include`: The C++ API headers are fully-contained here (deprecated directories are excluded from the listing below) - - `cluster`: Basic clustering primitives and algorithms. - - `comms`: A multi-node multi-GPU communications abstraction layer for NCCL+UCX and MPI+NCCL, which can be deployed in Dask clusters using the `raft-dask` Python package. - - `core`: Core API headers which require minimal dependencies aside from RMM and Cudatoolkit. These are safe to expose on public APIs and do not require `nvcc` to build. This is the same for any headers in RAFT which have the suffix `*_types.hpp`. 
- - `distance`: Distance primitives - - `linalg`: Dense linear algebra - - `matrix`: Dense matrix operations - - `neighbors`: Nearest neighbors and knn graph construction - - `random`: Random number generation, sampling, and data generation primitives - - `solver`: Iterative and combinatorial solvers for optimization and approximation - - `sparse`: Sparse matrix operations - - `convert`: Sparse conversion functions - - `distance`: Sparse distance computations - - `linalg`: Sparse linear algebra - - `neighbors`: Sparse nearest neighbors and knn graph construction - - `op`: Various sparse operations such as slicing and filtering (Note: this will soon be renamed to `sparse/matrix`) - - `solver`: Sparse solvers for optimization and approximation - - `stats`: Moments, summary statistics, model performance measures - - `util`: Various reusable tools and utilities for accelerated algorithm development - - `internal`: A private header-only component that hosts the code shared between benchmarks and tests. - - `scripts`: Helpful scripts for development - - `src`: Compiled APIs and template instantiations for the shared libraries - - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT. - - `test`: Googletests source code -- `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) -- `notebooks`: IPython notebooks with usage examples and tutorials -- `python`: Source code for Python libraries. 
- - `pylibraft`: Python build and source code for pylibraft library - - `raft-dask`: Python build and source code for raft-dask library -- `thirdparty`: Third-party licenses ## Contributing diff --git a/VERSION b/VERSION new file mode 100644 index 0000000000..a193fff41e --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/build.sh b/build.sh index 6200e6a2fa..51e59cc259 100755 --- a/build.sh +++ b/build.sh @@ -78,7 +78,7 @@ INSTALL_TARGET=install BUILD_REPORT_METRICS="" BUILD_REPORT_INCL_CACHE_STATS=OFF -TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;NEIGHBORS_ANN_NN_DESCENT_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" +TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;NEIGHBORS_TEST;NEIGHBORS_ANN_CAGRA_TEST;NEIGHBORS_ANN_NN_DESCENT_TEST;NEIGHBORS_ANN_IVF_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;STATS_TEST;UTILS_TEST" BENCH_TARGETS="CLUSTER_BENCH;CORE_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH" CACHE_ARGS="" @@ -324,6 +324,8 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then $CMAKE_TARGET == *"DISTANCE_TEST"* || \ $CMAKE_TARGET == *"MATRIX_TEST"* || \ $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_IVF_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_ANN_NN_DESCENT_TEST"* || \ $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \ $CMAKE_TARGET == *"SPARSE_DIST_TEST" || \ $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index a41f81152d..178ce723a5 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -9,8 +9,10 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +version=$(rapids-generate-version) + rapids-logger "Begin cpp build" -rapids-conda-retry mambabuild conda/recipes/libraft +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry 
mambabuild conda/recipes/libraft rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index d1a1e2f44f..2eb9f7da6d 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -28,7 +28,7 @@ rapids-mamba-retry install \ pylibraft \ raft-dask -export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_VERSION_NUMBER="23.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/build_python.sh b/ci/build_python.sh index c49677e78c..cf34776542 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -13,6 +13,17 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) +export RAPIDS_PACKAGE_VERSION=${version} +echo "${version}" > VERSION + +package_dir="python" +for package_name in pylibraft raft-dask; do + underscore_package_name=$(echo "${package_name}" | tr "-" "_") + sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/${underscore_package_name}/_version.py" +done + # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly rapids-conda-retry mambabuild \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 662a11ad0e..5d06e46303 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -10,9 +10,8 @@ underscore_package_name=$(echo "${package_name}" | tr "-" "_") source rapids-configure-sccache source rapids-date-string -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -22,9 +21,11 @@ PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" # Patch project metadata files to include the CUDA version suffix and version override. 
pyproject_file="${package_dir}/pyproject.toml" +version_file="${package_dir}/${underscore_package_name}/_version.py" -sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file} # For nightlies we want to ensure that we're pulling in alphas as well. The # easiest way to do so is to augment the spec with a constraint containing a @@ -37,7 +38,9 @@ fi if [[ ${package_name} == "raft-dask" ]]; then sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -i "s/ucx-py/ucx-py${PACKAGE_CUDA_SUFFIX}/g" python/raft-dask/pyproject.toml + sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index a867a71f68..d5bc17be56 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,9 +43,8 @@ sed_runner 's/'"pylibraft_version .*)"'/'"pylibraft_version ${NEXT_FULL_TAG})"'/ sed_runner 's/'"raft_dask_version .*)"'/'"raft_dask_version ${NEXT_FULL_TAG})"'/g' python/raft-dask/CMakeLists.txt sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake -# Python __init__.py updates -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pylibraft/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/raft_dask/__init__.py +# Centralized version file update +echo 
"${NEXT_FULL_TAG}" > VERSION # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh @@ -57,7 +56,12 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf. DEPENDENCIES=( dask-cuda pylibraft + pylibraft-cu11 + pylibraft-cu12 rmm + rmm-cu11 + rmm-cu12 + rapids-dask-dependency # ucx-py is handled separately below ) for FILE in dependencies.yaml conda/environments/*.yaml; do @@ -76,22 +80,21 @@ done sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml -# Wheel builds install dask-cuda from source, update its branch for FILE in .github/workflows/*.yaml; do - sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; -done - -for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md -sed_runner "/GIT_TAG.*branch-/ s|branch-.*|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md +sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md sed_runner "/rapidsai\/raft/ s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/developer_guide.md +sed_runner "s|:[0-9][0-9].[0-9][0-9]|:${NEXT_SHORT_TAG}|g" docs/source/raft_ann_benchmarks.md + +sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" README.md + # .devcontainer files find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" 
"${filename}" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 9c487be156..0f8efb171e 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -36,6 +36,7 @@ trap "EXITCODE=1" ERR set +e # Run libraft gtests from libraft-tests package +cd "$CONDA_PREFIX"/bin/gtests/libraft ctest -j8 --output-on-failure rapids-logger "Test script exiting with value: $EXITCODE" diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index fd9668e968..b70563b7a1 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -11,9 +11,6 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/raft_dask*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml new file mode 100644 index 0000000000..c28f1961e6 --- /dev/null +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -0,0 +1,61 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-python>=11.7.1,<12.0a0 +- cuda-version=11.8 +- cudatoolkit +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- dask-cuda==23.12.* +- doxygen>=1.8.20 +- gcc_linux-aarch64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- joblib>=0.11 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- nccl>=2.9.9 +- ninja +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- nvcc_linux-aarch64=11.8 +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- rapids-dask-dependency==23.12.* +- recommonmark +- rmm==23.12.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-aarch64==2.17 +- ucx-proc=*=gpu +- ucx-py==0.35.* +- ucx>=1.13.0 +name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 739e1e9785..9b7c110bc3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 - cuda-version=11.8 @@ -19,10 +20,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- dask-cuda==23.12.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -48,8 +46,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov +- rapids-dask-dependency==23.12.* - recommonmark -- rmm==23.10.* +- rmm==23.12.* - scikit-build>=0.13.1 - 
scikit-learn - scipy @@ -57,6 +56,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.34.* +- ucx-py==0.35.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml new file mode 100644 index 0000000000..8d614d3c2c --- /dev/null +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -0,0 +1,57 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-python>=12.0,<13.0a0 +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- dask-cuda==23.12.* +- doxygen>=1.8.20 +- gcc_linux-aarch64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- joblib>=0.11 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- nccl>=2.9.9 +- ninja +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- pre-commit +- pydata-sphinx-theme +- pytest +- pytest-cov +- rapids-dask-dependency==23.12.* +- recommonmark +- rmm==23.12.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-aarch64==2.17 +- ucx-proc=*=gpu +- ucx-py==0.35.* +- ucx>=1.13.0 +name: all_cuda-120_arch-aarch64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 321c17bf4f..f9d65cee39 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -14,16 +14,14 @@ dependencies: - cmake>=3.26.4 - cuda-cudart-dev - cuda-nvcc +- cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 - 
cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- dask-cuda==23.12.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -44,8 +42,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov +- rapids-dask-dependency==23.12.* - recommonmark -- rmm==23.10.* +- rmm==23.12.* - scikit-build>=0.13.1 - scikit-learn - scipy @@ -53,6 +52,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.34.* +- ucx-py==0.35.* - ucx>=1.13.0 name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml new file mode 100644 index 0000000000..b5fc4e3bd5 --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -0,0 +1,44 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-nvtx=11.8 +- cuda-profiler-api=11.8.86 +- cuda-version=11.8 +- cudatoolkit +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-aarch64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- nvcc_linux-aarch64=11.8 +- openblas +- pandas +- pyyaml +- rmm==23.12.* +- scikit-build>=0.13.1 +- sysroot_linux-aarch64==2.17 +name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 4f1df12dfa..b868f26e15 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -12,12 +12,12 @@ dependencies: - clang-tools=16.0.6 - clang==16.0.6 - cmake>=3.26.4 +- cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 - cudatoolkit - cxx-compiler - cython>=3.0.0 -- faiss-proc=*=cuda - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 @@ -30,11 +30,15 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libfaiss>=1.7.1 +- matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 +- openblas +- pandas +- pyyaml +- rmm==23.12.* - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml new file mode 100644 index 0000000000..4a3818fe5d --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -0,0 +1,40 @@ +# This file is 
generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-aarch64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==23.12.* +- scikit-build>=0.13.1 +- sysroot_linux-aarch64==2.17 +name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml new file mode 100644 index 0000000000..3d6f8c4ec1 --- /dev/null +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -0,0 +1,40 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- benchmark>=1.8.2 +- c-compiler +- clang-tools=16.0.6 +- clang==16.0.6 +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvtx-dev +- cuda-profiler-api +- cuda-version=12.0 +- cxx-compiler +- cython>=3.0.0 +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- matplotlib +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- openblas +- pandas +- pyyaml +- rmm==23.12.* +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/libraft/build_libraft.sh b/conda/recipes/libraft/build_libraft.sh index 71e1533893..7d4173e8bb 100644 --- a/conda/recipes/libraft/build_libraft.sh +++ b/conda/recipes/libraft/build_libraft.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh libraft -v --allgpuarch --compile-lib --build-metrics=compile_lib --incl-cache-stats --no-nvtx +./build.sh libraft --allgpuarch --compile-lib --build-metrics=compile_lib --incl-cache-stats --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_headers.sh b/conda/recipes/libraft/build_libraft_headers.sh index 330ac92ff3..cc3b840e43 100644 --- a/conda/recipes/libraft/build_libraft_headers.sh +++ b/conda/recipes/libraft/build_libraft_headers.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh libraft -v --allgpuarch --no-nvtx +./build.sh libraft --allgpuarch --no-nvtx diff --git a/conda/recipes/libraft/build_libraft_template.sh b/conda/recipes/libraft/build_libraft_template.sh index 974b0a5b58..bd7719af76 100644 --- a/conda/recipes/libraft/build_libraft_template.sh +++ b/conda/recipes/libraft/build_libraft_template.sh @@ -2,4 +2,4 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. 
# Just building template so we verify it uses libraft.so and fail if it doesn't build -./build.sh template -v +./build.sh template diff --git a/conda/recipes/libraft/build_libraft_tests.sh b/conda/recipes/libraft/build_libraft_tests.sh index 08f0d33485..05a2b59eb0 100644 --- a/conda/recipes/libraft/build_libraft_tests.sh +++ b/conda/recipes/libraft/build_libraft_tests.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash # Copyright (c) 2022-2023, NVIDIA CORPORATION. -./build.sh tests bench-prims -v --allgpuarch --no-nvtx --build-metrics=tests_bench_prims --incl-cache-stats +./build.sh tests bench-prims --allgpuarch --no-nvtx --build-metrics=tests_bench_prims --incl-cache-stats cmake --install cpp/build --component testing diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index d156f2609b..25493a34fa 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -71,3 +71,9 @@ cuda11_cuda_profiler_api_host_version: cuda11_cuda_profiler_api_run_version: - ">=11.4.240,<12" + +spdlog_version: + - ">=1.11.0,<1.12" + +fmt_version: + - ">=9.1.0,<10" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index f4d133d714..116f8d9e6e 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c nvidia -c rapidsai -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -13,7 +13,7 @@ package: name: libraft-split source: - git_url: ../../.. + path: ../../.. 
outputs: - name: libraft-headers-only @@ -63,12 +63,16 @@ outputs: {% endif %} - cuda-version ={{ cuda_version }} - librmm ={{ minor_version }} + - spdlog {{ spdlog_version }} + - fmt {{ fmt_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit {% endif %} - librmm ={{ minor_version }} + - spdlog {{ spdlog_version }} + - fmt {{ fmt_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index cc781d0cba..b8a088d0f3 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c numba -c rapidsai -c pytorch -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -14,7 +14,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. 
build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -48,7 +48,6 @@ requirements: - cython >=3.0.0 - libraft {{ version }} - libraft-headers {{ version }} - - numpy >=1.21 - python x.x - rmm ={{ minor_version }} - scikit-build >=0.13.1 @@ -60,6 +59,7 @@ requirements: {% endif %} - libraft {{ version }} - libraft-headers {{ version }} + - numpy >=1.21 - python x.x - rmm ={{ minor_version }} diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml index 0bd424f85b..fda3e4e53d 100644 --- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml @@ -18,3 +18,9 @@ h5py_version: nlohmann_json_version: - ">=3.11.2" + +spdlog_version: + - ">=1.11.0,<1.12" + +fmt_version: + - ">=9.1.0,<10" diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml index 06737b0497..fce85d5ffc 100644 --- a/conda/recipes/raft-ann-bench-cpu/meta.yaml +++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c nvidia -c rapidsai -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -14,7 +14,7 @@ package: script: build.sh source: - git_url: ../../.. + path: ../../.. 
build: script_env: @@ -48,6 +48,8 @@ requirements: - glog {{ glog_version }} - matplotlib - nlohmann_json {{ nlohmann_json_version }} + - spdlog {{ spdlog_version }} + - fmt {{ fmt_version }} - python - pyyaml - pandas @@ -60,8 +62,7 @@ requirements: - pyyaml - pandas - benchmark - about: home: https://rapids.ai/ license: Apache-2.0 - summary: libraft ann bench + summary: RAFT ANN CPU benchmarks diff --git a/conda/recipes/raft-ann-bench/build.sh b/conda/recipes/raft-ann-bench/build.sh index 9c411774b6..00078792a1 100644 --- a/conda/recipes/raft-ann-bench/build.sh +++ b/conda/recipes/raft-ann-bench/build.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash # Copyright (c) 2023, NVIDIA CORPORATION. -./build.sh bench-ann -v --allgpuarch --no-nvtx --build-metrics=bench_ann --incl-cache-stats +./build.sh bench-ann --allgpuarch --no-nvtx --build-metrics=bench_ann --incl-cache-stats cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index d156f2609b..da0b893c1d 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -25,9 +25,6 @@ gtest_version: glog_version: - ">=0.6.0" -faiss_version: - - ">=1.7.1" - h5py_version: - ">=3.8.0" diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index a2ab0af643..bf89afbcc1 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c nvidia -c rapidsai -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -16,7 +16,7 @@ package: script: build.sh source: - git_url: ../../.. + path: ../../.. build: script_env: @@ -70,11 +70,6 @@ requirements: {% endif %} - glog {{ glog_version }} - nlohmann_json {{ nlohmann_json_version }} - # Temporarily ignore faiss benchmarks on CUDA 12 because packages do not exist yet - {% if cuda_major == "11" %} - - faiss-proc=*=cuda - - libfaiss {{ faiss_version }} - {% endif %} - h5py {{ h5py_version }} - benchmark - matplotlib @@ -92,11 +87,6 @@ requirements: - cudatoolkit {% endif %} - glog {{ glog_version }} - # Temporarily ignore faiss benchmarks on CUDA 12 because packages do not exist yet - {% if cuda_major == "11" %} - - faiss-proc=*=cuda - - libfaiss {{ faiss_version }} - {% endif %} - h5py {{ h5py_version }} - benchmark - glog {{ glog_version }} diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index d9a11329d9..d89dbae4df 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -14,10 +14,10 @@ sysroot_version: - "2.17" ucx_version: - - ">=1.13.0,<1.15.0" + - ">=1.14.1,<1.16.0" ucx_py_version: - - "0.34.*" + - "0.35.*" cmake_version: - ">=3.26.4" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 04dfef5063..eae5a6affe 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c numba -c rapidsai -c pytorch -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -14,7 +14,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -60,10 +60,8 @@ requirements: - cudatoolkit {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - dask-cuda ={{ minor_version }} - - distributed ==2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - joblib >=0.11 - nccl >=2.9.9 - pylibraft {{ version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7d63751906..5d2864e2e0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. -set(RAPIDS_VERSION "23.10") -set(RAFT_VERSION "23.10.00") +set(RAPIDS_VERSION "23.12") +set(RAFT_VERSION "23.12.00") cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../fetch_rapids.cmake) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 8985be328b..5919de07e7 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -15,12 +15,22 @@ # ################################################################################################## # * benchmark options ------------------------------------------------------------------------------ -option(RAFT_ANN_BENCH_USE_FAISS_BFKNN "Include faiss' brute-force knn algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) -option(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) 
+option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT + "Include faiss' cpu brute-force knn algorithm in benchmark" ON +) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT "Include faiss' cpu brute-force algorithm in benchmark" ON) + +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algorithm in benchmark" + ON +) +option(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT "Include raft's ivf flat algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ "Include raft's ivf pq algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_RAFT_CAGRA "Include raft's CAGRA in benchmark" ON) +option(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB "Include raft's CAGRA in benchmark" ON) option(RAFT_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(RAFT_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" ON) option(RAFT_ANN_BENCH_SINGLE_EXE @@ -33,35 +43,54 @@ option(RAFT_ANN_BENCH_SINGLE_EXE find_package(Threads REQUIRED) if(BUILD_CPU_ONLY) - set(RAFT_ANN_BENCH_USE_FAISS_BFKNN OFF) - set(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ OFF) + + # Include necessary logging dependencies + include(cmake/thirdparty/get_fmt.cmake) + include(cmake/thirdparty/get_spdlog.cmake) + + set(RAFT_FAISS_ENABLE_GPU OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF) set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF) + set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) set(RAFT_ANN_BENCH_USE_GGNN OFF) else() # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled. 
# https://github.com/rapidsai/raft/issues/1627 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) - set(RAFT_ANN_BENCH_USE_FAISS_BFKNN OFF) - set(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ OFF) + set(RAFT_FAISS_ENABLE_GPU OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ OFF) + set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT OFF) + else() + set(RAFT_FAISS_ENABLE_GPU ON) endif() endif() set(RAFT_ANN_BENCH_USE_FAISS OFF) -if(RAFT_ANN_BENCH_USE_FAISS_BFKNN - OR RAFT_ANN_BENCH_USE_FAISS_IVFPQ - OR RAFT_ANN_BENCH_USE_FAISS_IFFLAT +if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ + OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT + OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ + OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT ) set(RAFT_ANN_BENCH_USE_FAISS ON) + set(RAFT_USE_FAISS_STATIC ON) endif() set(RAFT_ANN_BENCH_USE_RAFT OFF) if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ + OR RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OR RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OR RAFT_ANN_BENCH_USE_RAFT_CAGRA + OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB ) set(RAFT_ANN_BENCH_USE_RAFT ON) endif() @@ -69,7 +98,7 @@ endif() # ################################################################################################## # * Fetch requirements ------------------------------------------------------------- -if(RAFT_ANN_BENCH_USE_HNSWLIB) +if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) include(cmake/thirdparty/get_hnswlib.cmake) endif() @@ -80,6 +109,10 @@ if(RAFT_ANN_BENCH_USE_GGNN) endif() if(RAFT_ANN_BENCH_USE_FAISS) + # We need to ensure that faiss has all the conda information. 
So we currently use the very ugly + # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory + # will have the conda includes/link dirs + link_libraries($) include(cmake/thirdparty/get_faiss.cmake) endif() @@ -116,7 +149,6 @@ function(ConfigureAnnBench) ${BENCH_NAME} PRIVATE raft::raft nlohmann_json::nlohmann_json - $<$:$<$:NCCL::NCCL>> ${ConfigureAnnBench_LINKS} Threads::Threads $<$:${RAFT_CTK_MATH_DEPENDENCIES}> @@ -124,6 +156,8 @@ function(ConfigureAnnBench) $ -static-libgcc -static-libstdc++ + $<$:fmt::fmt-header-only> + $<$:spdlog::spdlog_header_only> ) set_target_properties( @@ -201,6 +235,12 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) ) endif() +if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE) + ConfigureAnnBench( + NAME RAFT_BRUTE_FORCE PATH bench/ann/src/raft/raft_benchmark.cu LINKS raft::compiled + ) +endif() + if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) ConfigureAnnBench( NAME @@ -213,20 +253,67 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_IVF_FLAT) +if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) ConfigureAnnBench( - NAME FAISS_IVF_FLAT PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME + RAFT_CAGRA_HNSWLIB + PATH + bench/ann/src/raft/raft_cagra_hnswlib.cu + INCLUDES + ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src/hnswlib + LINKS + raft::compiled + CXXFLAGS + "${HNSW_CXX_FLAGS}" ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_IVF_PQ) +set(RAFT_FAISS_TARGETS faiss::faiss) +if(TARGET faiss::faiss_avx2) + set(RAFT_FAISS_TARGETS faiss::faiss_avx2) +endif() + +message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}") +message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}") +if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT) ConfigureAnnBench( - NAME FAISS_IVF_PQ PATH bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss + NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_BFKNN) - ConfigureAnnBench(NAME FAISS_BFKNN PATH 
bench/ann/src/faiss/faiss_benchmark.cu LINKS faiss::faiss) +if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + ${RAFT_FAISS_TARGETS} + ) +endif() + +if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT) + ConfigureAnnBench( + NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} + ) endif() if(RAFT_ANN_BENCH_USE_GGNN) @@ -277,7 +364,8 @@ if(RAFT_ANN_BENCH_SINGLE_EXE) target_compile_definitions( ANN_BENCH PRIVATE - $<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}"> + $<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH} + "> $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> ) diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index 33716bd45a..e964a81efa 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -24,6 +24,11 @@ namespace raft::bench::ann { +enum Objective { + THROUGHPUT, // See how many vectors we can push through + LATENCY // See how fast we can push a vector through +}; + enum class MemoryType { Host, HostMmap, @@ -59,10 +64,17 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType } } -struct AlgoProperty { +class AlgoProperty { + public: + inline 
AlgoProperty() {} + inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_) + : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_) + { + } MemoryType dataset_memory_type; // neighbors/distances should have same memory type as queries MemoryType query_memory_type; + virtual ~AlgoProperty() = default; }; class AnnBase { @@ -79,7 +91,8 @@ template class ANN : public AnnBase { public: struct AnnSearchParam { - virtual ~AnnSearchParam() = default; + Objective metric_objective = Objective::LATENCY; + virtual ~AnnSearchParam() = default; [[nodiscard]] virtual auto needs_dataset() const -> bool { return false; }; }; @@ -107,7 +120,7 @@ class ANN : public AnnBase { // The advantage of this way is that index has smaller size // and many indices can share one dataset. // - // AlgoProperty::need_dataset_when_search of such algorithm should be true, + // SearchParam::needs_dataset() of such algorithm should be true, // and set_search_dataset() should save the passed-in pointer somewhere. // The client code should call set_search_dataset() before searching, // and should not release dataset before searching is finished. 
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 4ec977700d..a2e77323c1 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -21,21 +21,31 @@ #include "util.hpp" #include +#include #include +#include +#include #include +#include #include #include #include #include +#include #include +#include #include #include #include - namespace raft::bench::ann { +std::mutex init_mutex; +std::condition_variable cond_var; +std::atomic_int processed_threads{0}; + static inline std::unique_ptr current_algo{nullptr}; +static inline std::shared_ptr current_algo_props{nullptr}; using kv_series = std::vector>>; @@ -122,7 +132,7 @@ void bench_build(::benchmark::State& state, log_info("Overwriting file: %s", index.file.c_str()); } else { return state.SkipWithMessage( - "Index file already exists (use --overwrite to overwrite the index)."); + "Index file already exists (use --force to overwrite the index)."); } } @@ -153,7 +163,7 @@ void bench_build(::benchmark::State& state, } } state.counters.insert( - {{"GPU Time", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}}); + {{"GPU", gpu_timer.total_time() / state.iterations()}, {"index_size", index_size}}); if (state.skipped()) { return; } make_sure_parent_dir_exists(index.file); @@ -162,12 +172,16 @@ void bench_build(::benchmark::State& state, template void bench_search(::benchmark::State& state, - std::shared_ptr> dataset, Configuration::Index index, - std::size_t search_param_ix) + std::size_t search_param_ix, + std::shared_ptr> dataset, + Objective metric_objective) { + std::size_t queries_processed = 0; + const auto& sp_json = index.search_params[search_param_ix]; - dump_parameters(state, sp_json); + + if (state.thread_index() == 0) { dump_parameters(state, sp_json); } // NB: `k` and `n_queries` are guaranteed to be populated in conf.cpp const std::uint32_t k = sp_json["k"]; @@ -176,133 +190,227 @@ void 
bench_search(::benchmark::State& state, // Round down the query data to a multiple of the batch size to loop over full batches of data const std::size_t query_set_size = (dataset->query_set_size() / n_queries) * n_queries; + if (dataset->query_set_size() < n_queries) { + std::stringstream msg; + msg << "Not enough queries in benchmark set. Expected " << n_queries << ", actual " + << dataset->query_set_size(); + return state.SkipWithError(msg.str()); + } + + // Each thread start from a different offset, so that the queries that they process do not + // overlap. + std::ptrdiff_t batch_offset = (state.thread_index() * n_queries) % query_set_size; + std::ptrdiff_t queries_stride = state.threads() * n_queries; + // Output is saved into a contiguous buffer (separate buffers for each thread). + std::ptrdiff_t out_offset = 0; + + const T* query_set = nullptr; + if (!file_exists(index.file)) { state.SkipWithError("Index file is missing. Run the benchmark in the build mode first."); return; } - // algo is static to cache it between close search runs to save time on index loading - static std::string index_file = ""; - if (index.file != index_file) { - current_algo.reset(); - index_file = index.file; - } - ANN* algo; - std::unique_ptr::AnnSearchParam> search_param; - try { - if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { - auto ualgo = ann::create_algo( - index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); - algo = ualgo.get(); - algo->load(index_file); - current_algo = std::move(ualgo); + + /** + * Make sure the first thread loads the algo and dataset + */ + if (state.thread_index() == 0) { + std::unique_lock lk(init_mutex); + cond_var.wait(lk, [] { return processed_threads.load(std::memory_order_acquire) == 0; }); + // algo is static to cache it between close search runs to save time on index loading + static std::string index_file = ""; + if (index.file != index_file) { + current_algo.reset(); + index_file = 
index.file; } - search_param = ann::create_search_param(index.algo, sp_json); - } catch (const std::exception& e) { - return state.SkipWithError("Failed to create an algo: " + std::string(e.what())); - } - algo->set_search_param(*search_param); - const auto algo_property = parse_algo_property(algo->get_preference(), sp_json); - const T* query_set = dataset->query_set(algo_property.query_memory_type); - buf distances{algo_property.query_memory_type, k * query_set_size}; - buf neighbors{algo_property.query_memory_type, k * query_set_size}; + std::unique_ptr::AnnSearchParam> search_param; + ANN* algo; + try { + if (!current_algo || (algo = dynamic_cast*>(current_algo.get())) == nullptr) { + auto ualgo = ann::create_algo( + index.algo, dataset->distance(), dataset->dim(), index.build_param, index.dev_list); + algo = ualgo.get(); + algo->load(index_file); + current_algo = std::move(ualgo); + } + search_param = ann::create_search_param(index.algo, sp_json); + search_param->metric_objective = metric_objective; + } catch (const std::exception& e) { + state.SkipWithError("Failed to create an algo: " + std::string(e.what())); + return; + } + + auto algo_property = parse_algo_property(algo->get_preference(), sp_json); + current_algo_props = std::make_shared(algo_property.dataset_memory_type, + algo_property.query_memory_type); - if (search_param->needs_dataset()) { + if (search_param->needs_dataset()) { + try { + algo->set_search_dataset(dataset->base_set(current_algo_props->dataset_memory_type), + dataset->base_set_size()); + } catch (const std::exception& ex) { + state.SkipWithError("The algorithm '" + index.name + + "' requires the base set, but it's not available. 
" + + "Exception: " + std::string(ex.what())); + return; + } + } try { - algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type), - dataset->base_set_size()); + algo->set_search_param(*search_param); + } catch (const std::exception& ex) { - state.SkipWithError("The algorithm '" + index.name + - "' requires the base set, but it's not available. " + - "Exception: " + std::string(ex.what())); + state.SkipWithError("An error occurred setting search parameters: " + std::string(ex.what())); return; } + + query_set = dataset->query_set(current_algo_props->query_memory_type); + processed_threads.store(state.threads(), std::memory_order_acq_rel); + cond_var.notify_all(); + } else { + std::unique_lock lk(init_mutex); + // All other threads will wait for the first thread to initialize the algo. + cond_var.wait(lk, [&state] { + return processed_threads.load(std::memory_order_acquire) == state.threads(); + }); + // gbench ensures that all threads are synchronized at the start of the benchmark loop. + // We are accessing shared variables (like current_algo, current_algo_probs) before the + // benchmark loop, therefore the synchronization here is necessary. 
} + const auto algo_property = *current_algo_props; + query_set = dataset->query_set(algo_property.query_memory_type); + + /** + * Each thread will manage its own outputs + */ + std::shared_ptr> distances = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); + std::shared_ptr> neighbors = + std::make_shared>(algo_property.query_memory_type, k * query_set_size); - std::ptrdiff_t batch_offset = 0; - std::size_t queries_processed = 0; cuda_timer gpu_timer; + auto start = std::chrono::high_resolution_clock::now(); { nvtx_case nvtx{state.name()}; + + ANN* algo = dynamic_cast*>(current_algo.get()); for (auto _ : state) { - // measure the GPU time using the RAII helper [[maybe_unused]] auto ntx_lap = nvtx.lap(); [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + // run the search try { algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, - neighbors.data + batch_offset * k, - distances.data + batch_offset * k, + neighbors->data + out_offset * k, + distances->data + out_offset * k, gpu_timer.stream()); } catch (const std::exception& e) { state.SkipWithError(std::string(e.what())); } + // advance to the next batch - batch_offset = (batch_offset + n_queries) % query_set_size; + batch_offset = (batch_offset + queries_stride) % query_set_size; + out_offset = (out_offset + n_queries) % query_set_size; + queries_processed += n_queries; } } + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast>(end - start).count(); + if (state.thread_index() == 0) { state.counters.insert({{"end_to_end", duration}}); } + state.counters.insert( + {"Latency", {duration / double(state.iterations()), benchmark::Counter::kAvgThreads}}); + state.SetItemsProcessed(queries_processed); - state.counters.insert({{"k", k}, {"n_queries", n_queries}}); if (cudart.found()) { - state.counters.insert({{"GPU Time", gpu_timer.total_time() / state.iterations()}, - {"GPU QPS", queries_processed / gpu_timer.total_time()}}); + double 
gpu_time_per_iteration = gpu_timer.total_time() / (double)state.iterations(); + state.counters.insert({"GPU", {gpu_time_per_iteration, benchmark::Counter::kAvgThreads}}); } + + // This will be the total number of queries across all threads + state.counters.insert({{"total_queries", queries_processed}}); + if (state.skipped()) { return; } + // assume thread has finished processing successfully at this point + // last thread to finish processing notifies all + if (processed_threads-- == 0) { cond_var.notify_all(); } + + // Each thread calculates recall on their partition of queries. // evaluate recall if (dataset->max_k() >= k) { const std::int32_t* gt = dataset->gt_set(); const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors.move(MemoryType::Host); - - std::size_t rows = std::min(queries_processed, query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); - for (std::size_t i = 0; i < rows; i++) { - for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i * k + j]); - for (std::uint32_t l = 0; l < k; l++) { - auto exp_idx = gt[i * max_k + l]; - if (act_idx == exp_idx) { - match_count++; - break; + buf neighbors_host = neighbors->move(MemoryType::Host); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); + + // We go through the groundtruth with same stride as the benchmark loop. 
+ size_t out_offset = 0; + size_t batch_offset = (state.thread_index() * n_queries) % query_set_size; + while (out_offset < rows) { + for (std::size_t i = 0; i < n_queries; i++) { + size_t i_orig_idx = batch_offset + i; + size_t i_out_idx = out_offset + i; + if (i_out_idx < rows) { + for (std::uint32_t j = 0; j < k; j++) { + auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]); + for (std::uint32_t l = 0; l < k; l++) { + auto exp_idx = gt[i_orig_idx * max_k + l]; + if (act_idx == exp_idx) { + match_count++; + break; + } + } } } } + out_offset += n_queries; + batch_offset = (batch_offset + queries_stride) % query_set_size; } double actual_recall = static_cast(match_count) / static_cast(total_count); - state.counters.insert({{"Recall", actual_recall}}); + state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}}); } } inline void printf_usage() { ::benchmark::PrintDefaultHelp(); - fprintf( - stdout, - " [--build|--search] \n" - " [--overwrite]\n" - " [--data_prefix=]\n" - " [--index_prefix=]\n" - " [--override_kv=]\n" - " .json\n" - "\n" - "Note the non-standard benchmark parameters:\n" - " --build: build mode, will build index\n" - " --search: search mode, will search using the built index\n" - " one and only one of --build and --search should be specified\n" - " --overwrite: force overwriting existing index files\n" - " --data_prefix=:" - " prepend to dataset file paths specified in the .json (default = 'data/').\n" - " --index_prefix=:" - " prepend to index file paths specified in the .json (default = 'index/').\n" - " --override_kv=:" - " override a build/search key one or more times multiplying the number of configurations;" - " you can use this parameter multiple times to get the Cartesian product of benchmark" - " configs.\n"); + fprintf(stdout, + " [--build|--search] \n" + " [--force]\n" + " [--data_prefix=]\n" + " [--index_prefix=]\n" + " [--override_kv=]\n" + " [--mode=\n" + " [--threads=min[:max]]\n" + " .json\n" + 
"\n" + "Note the non-standard benchmark parameters:\n" + " --build: build mode, will build index\n" + " --search: search mode, will search using the built index\n" + " one and only one of --build and --search should be specified\n" + " --force: force overwriting existing index files\n" + " --data_prefix=:" + " prepend to dataset file paths specified in the .json (default = " + "'data/').\n" + " --index_prefix=:" + " prepend to index file paths specified in the .json (default = " + "'index/').\n" + " --override_kv=:" + " override a build/search key one or more times multiplying the number of configurations;" + " you can use this parameter multiple times to get the Cartesian product of benchmark" + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n" + " --threads=min[:max] specify the number threads to use for throughput benchmark." + " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," + " then a single test is run with 'min' threads. 
By default min=1, max=.\n"); } template @@ -319,22 +427,36 @@ void register_build(std::shared_ptr> dataset, auto* b = ::benchmark::RegisterBenchmark( index.name + suf, bench_build, dataset, index, force_overwrite); b->Unit(benchmark::kSecond); + b->MeasureProcessCPUTime(); b->UseRealTime(); } } template void register_search(std::shared_ptr> dataset, - std::vector indices) + std::vector indices, + Objective metric_objective, + const std::vector& threads) { for (auto index : indices) { for (std::size_t i = 0; i < index.search_params.size(); i++) { auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); - auto* b = - ::benchmark::RegisterBenchmark(index.name + suf, bench_search, dataset, index, i); - b->Unit(benchmark::kMillisecond); - b->UseRealTime(); + + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, metric_objective) + ->Unit(benchmark::kMillisecond) + /** + * The following are important for getting accuracy QPS measurements on both CPU + * and GPU These make sure that + * - `end_to_end` ~ (`Time` * `Iterations`) + * - `items_per_second` ~ (`total_queries` / `end_to_end`) + * - Throughput = `items_per_second` + */ + ->MeasureProcessCPUTime() + ->UseRealTime(); + + if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); } } } } @@ -346,7 +468,9 @@ void dispatch_benchmark(const Configuration& conf, bool search_mode, std::string data_prefix, std::string index_prefix, - kv_series override_kv) + kv_series override_kv, + Objective metric_objective, + const std::vector& threads) { if (cudart.found()) { for (auto [key, value] : cuda_info()) { @@ -403,7 +527,8 @@ void dispatch_benchmark(const Configuration& conf, } else { log_warn( "Ground truth file is not provided; the recall won't be reported. 
NB: use " - "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the path to " + "the 'groundtruth_neighbors_file' alongside the 'query_file' key to specify the " + "path to " "the ground truth in your conf.json."); } } else { @@ -414,7 +539,7 @@ void dispatch_benchmark(const Configuration& conf, index.search_params = apply_overrides(index.search_params, override_kv); index.file = combine_path(index_prefix, index.file); } - register_search(dataset, indices); + register_search(dataset, indices, metric_objective, threads); } } @@ -445,6 +570,11 @@ inline auto run_main(int argc, char** argv) -> int std::string data_prefix = "data"; std::string index_prefix = "index"; std::string new_override_kv = ""; + std::string mode = "latency"; + std::string threads_arg_txt = ""; + std::vector threads = {1, -1}; // min_thread, max_thread + std::string log_level_str = ""; + int raft_log_level = raft::logger::get(RAFT_NAME).get_level(); kv_series override_kv{}; char arg0_default[] = "benchmark"; // NOLINT @@ -462,12 +592,29 @@ inline auto run_main(int argc, char** argv) -> int std::ifstream conf_stream(conf_path); for (int i = 1; i < argc; i++) { - if (parse_bool_flag(argv[i], "--overwrite", force_overwrite) || + if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || parse_bool_flag(argv[i], "--search", search_mode) || parse_string_flag(argv[i], "--data_prefix", data_prefix) || parse_string_flag(argv[i], "--index_prefix", index_prefix) || - parse_string_flag(argv[i], "--override_kv", new_override_kv)) { + parse_string_flag(argv[i], "--mode", mode) || + parse_string_flag(argv[i], "--override_kv", new_override_kv) || + parse_string_flag(argv[i], "--threads", threads_arg_txt) || + parse_string_flag(argv[i], "--raft_log_level", log_level_str)) { + if (!log_level_str.empty()) { + raft_log_level = std::stoi(log_level_str); + log_level_str = ""; + } + if (!threads_arg_txt.empty()) { + auto threads_arg = 
split(threads_arg_txt, ':'); + threads[0] = std::stoi(threads_arg[0]); + if (threads_arg.size() > 1) { + threads[1] = std::stoi(threads_arg[1]); + } else { + threads[1] = threads[0]; + } + threads_arg_txt = ""; + } if (!new_override_kv.empty()) { auto kvv = split(new_override_kv, ':'); auto key = kvv[0]; @@ -486,6 +633,22 @@ inline auto run_main(int argc, char** argv) -> int } } + raft::logger::get(RAFT_NAME).set_level(raft_log_level); + + Objective metric_objective = Objective::LATENCY; + if (mode == "throughput") { metric_objective = Objective::THROUGHPUT; } + + int max_threads = + (metric_objective == Objective::THROUGHPUT) ? std::thread::hardware_concurrency() : 1; + if (threads[1] == -1) threads[1] = max_threads; + + if (metric_objective == Objective::LATENCY) { + if (threads[0] != 1 || threads[1] != 1) { + log_warn("Latency mode enabled. Overriding threads arg, running with single thread."); + threads = {1, 1}; + } + } + if (build_mode == search_mode) { log_error("One and only one of --build and --search should be specified"); printf_usage(); @@ -505,14 +668,35 @@ inline auto run_main(int argc, char** argv) -> int std::string dtype = conf.get_dataset_conf().dtype; if (dtype == "float") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); } else if (dtype == "uint8") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); } else if (dtype == "int8") { - dispatch_benchmark( - conf, force_overwrite, build_mode, search_mode, data_prefix, index_prefix, override_kv); + dispatch_benchmark(conf, + force_overwrite, + build_mode, + search_mode, + 
data_prefix, + index_prefix, + override_kv, + metric_objective, + threads); } else { log_error("datatype '%s' is not supported", dtype.c_str()); return -1; @@ -522,10 +706,9 @@ inline auto run_main(int argc, char** argv) -> int if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return -1; ::benchmark::RunSpecifiedBenchmarks(); ::benchmark::Shutdown(); - // Release a possibly cached ANN object, so that it cannot be alive longer than the handle to a - // shared library it depends on (dynamic benchmark executable). + // Release a possibly cached ANN object, so that it cannot be alive longer than the handle + // to a shared library it depends on (dynamic benchmark executable). current_algo.reset(); return 0; } - }; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp new file mode 100644 index 0000000000..9132db7c04 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses mmap to allocate memory. + * This class enables memory allocation using huge pages. + * It is assumed that the allocated memory is directly accessible on device. 
This currently only + * works on GH systems. + * + * TODO(tfeher): consider improving or removing this helper once we have made progress with + * https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { + public: + cuda_huge_page_resource() = default; + ~cuda_huge_page_resource() override = default; + cuda_huge_page_resource(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource(cuda_huge_page_resource&&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource const&) = default; + cuda_huge_page_resource& operator=(cuda_huge_page_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_huge_page_resource` does not support streams. + * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using mmap with MADV_HUGEPAGE advice. + * + * The returned pointer has at least 256B alignment.
+ * + * @note Stream argument is ignored + * + * @throws through `RAFT_FAIL` if `mmap` or `madvise(MADV_HUGEPAGE)` fails + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* _addr{nullptr}; + _addr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (_addr == MAP_FAILED) { RAFT_FAIL("huge_page_resource::MAP FAILED"); } + if (madvise(_addr, bytes, MADV_HUGEPAGE) == -1) { + munmap(_addr, bytes); + RAFT_FAIL("huge_page_resource::madvise MADV_HUGEPAGE"); + } + memset(_addr, 0, bytes); + return _addr; + } + + /** + * @brief Deallocate memory pointed to by \p ptr. + * + * @note Stream argument is ignored. + * + * @throws through `RAFT_FAIL` if `munmap` fails. + * + * @param ptr Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) override + { + if (munmap(ptr, size) == -1) { RAFT_FAIL("huge_page_resource::munmap"); } + } + + /** + * @brief Compare this resource to another. + * + * Two cuda_huge_page_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info.
+ * + * @return std::pair containing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/common/cuda_pinned_resource.hpp b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp new file mode 100644 index 0000000000..28ca691f86 --- /dev/null +++ b/cpp/bench/ann/src/common/cuda_pinned_resource.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include + +namespace raft::mr { +/** + * @brief `device_memory_resource` derived class that uses cudaMallocHost/cudaFreeHost for + * allocation/deallocation. + * + * This is almost the same as rmm::mr::host::pinned_memory_resource, but it has + * device_memory_resource as base class. Pinned memory can be accessed from device, + * and using this allocator we can create device_mdarray backed by pinned allocator. + * + * TODO(tfeher): it would be preferred to just rely on the existing allocator from rmm + * (pinned_memory_resource), but that is incompatible with the container_policy class + * for device matrix, because the latter expects a device_memory_resource.
We shall + * revise this once we progress with Issue https://github.com/rapidsai/raft/issues/1819 + */ +class cuda_pinned_resource final : public rmm::mr::device_memory_resource { + public: + cuda_pinned_resource() = default; + ~cuda_pinned_resource() override = default; + cuda_pinned_resource(cuda_pinned_resource const&) = default; + cuda_pinned_resource(cuda_pinned_resource&&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource const&) = default; + cuda_pinned_resource& operator=(cuda_pinned_resource&&) = default; + + /** + * @brief Query whether the resource supports use of non-null CUDA streams for + * allocation/deallocation. `cuda_pinned_resource` does not support streams. + * + * @returns bool false + */ + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return true + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return true; } + + private: + /** + * @brief Allocates memory of size at least `bytes` using cudaMalloc. + * + * The returned pointer has at least 256B alignment. + * + * @note Stream argument is ignored + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * + * @param bytes The size, in bytes, of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view) override + { + void* ptr{nullptr}; + RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, bytes)); + return ptr; + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * @note Stream argument is ignored. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + */ + void do_deallocate(void* ptr, std::size_t, rmm::cuda_stream_view) override + { + RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); + } + + /** + * @brief Compare this resource to another. 
+ * + * Two cuda_pinned_resources always compare equal, because they can each + * deallocate memory allocated by the other. + * + * @throws Nothing. + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equal + */ + [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } + + /** + * @brief Get free and available memory for memory resource + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * + * @return std::pair contaiing free_size and total_size of memory + */ + [[nodiscard]] std::pair do_get_mem_info( + rmm::cuda_stream_view) const override + { + std::size_t free_size{}; + std::size_t total_size{}; + RMM_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + return std::make_pair(free_size, total_size); + } +}; +} // namespace raft::mr \ No newline at end of file diff --git a/cpp/bench/ann/src/common/cuda_stub.hpp b/cpp/bench/ann/src/common/cuda_stub.hpp index b2e3130304..6e3b63cd38 100644 --- a/cpp/bench/ann/src/common/cuda_stub.hpp +++ b/cpp/bench/ann/src/common/cuda_stub.hpp @@ -154,6 +154,8 @@ namespace stub { { return cudaSuccess; } +[[gnu::weak, gnu::noinline]] cudaError_t cudaDeviceSynchronize() { return cudaSuccess; } + [[gnu::weak, gnu::noinline]] cudaError_t cudaStreamSynchronize(cudaStream_t pStream) { return cudaSuccess; @@ -214,6 +216,7 @@ RAFT_DECLARE_CUDART(cudaFree); RAFT_DECLARE_CUDART(cudaStreamCreate); RAFT_DECLARE_CUDART(cudaStreamCreateWithFlags); RAFT_DECLARE_CUDART(cudaStreamDestroy); +RAFT_DECLARE_CUDART(cudaDeviceSynchronize); RAFT_DECLARE_CUDART(cudaStreamSynchronize); RAFT_DECLARE_CUDART(cudaEventCreate); RAFT_DECLARE_CUDART(cudaEventRecord); diff --git a/cpp/bench/ann/src/common/thread_pool.hpp b/cpp/bench/ann/src/common/thread_pool.hpp new file mode 100644 index 0000000000..c01fa2c32c --- /dev/null +++ 
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <future>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

/**
 * @brief Fixed-size pool of worker threads that runs an index-based functor over `[0, len)`.
 *
 * A pool constructed with `num_threads == 1` owns no threads at all; `submit()` then runs the
 * functor inline on the calling thread. With more threads, every worker owns one task slot
 * (mutex + condition variable + packaged task) and sleeps until `submit()` fills that slot.
 *
 * `submit()` is blocking and is not meant to be called concurrently from several threads on
 * the same pool: each call overwrites all task slots.
 */
class FixedThreadPool {
 public:
  /**
   * @brief Create the pool.
   *
   * @param num_threads number of worker threads; must be >= 1. A value of 1 creates no threads.
   * @throws std::runtime_error if `num_threads < 1`.
   */
  FixedThreadPool(int num_threads)
  {
    if (num_threads < 1) {
      throw std::runtime_error("num_threads must >= 1");
    } else if (num_threads == 1) {
      return;
    }

    tasks_ = std::make_unique<Task_[]>(static_cast<std::size_t>(num_threads));

    threads_.reserve(static_cast<std::size_t>(num_threads));
    try {
      for (int i = 0; i < num_threads; ++i) {
        threads_.emplace_back([this, i] { worker_loop(tasks_[i]); });
      }
    } catch (...) {
      // A failed spawn must not leave joinable threads behind: destroying a joinable
      // std::thread calls std::terminate.
      shutdown();
      throw;
    }
  }

  ~FixedThreadPool() { shutdown(); }

  // Workers keep a pointer to this object and to its task slots, so it must stay put.
  FixedThreadPool(FixedThreadPool const&)            = delete;
  FixedThreadPool& operator=(FixedThreadPool const&) = delete;
  FixedThreadPool(FixedThreadPool&&)                 = delete;
  FixedThreadPool& operator=(FixedThreadPool&&)      = delete;

  /**
   * @brief Call `f(i)` once for every `i` in `[0, len)` and block until all calls finished.
   *
   * Each worker first processes a statically assigned contiguous range; the remaining indices
   * (roughly one extra share) are handed out one by one through an atomic counter so that
   * faster threads pick up more work.
   *
   * If `f` throws, the pooled path waits for all workers to finish and then rethrows the first
   * captured exception; indices that the failing worker had not reached yet are skipped. The
   * inline (single-thread) path propagates the exception immediately.
   *
   * @param f   functor invocable as `f(IdxT)`
   * @param len number of indices to process
   */
  template <typename Func, typename IdxT>
  void submit(Func f, IdxT len)
  {
    // Run functions in main thread if thread pool has no threads
    if (threads_.empty()) {
      for (IdxT i = 0; i < len; ++i) {
        f(i);
      }
      return;
    }

    const int num_threads = static_cast<int>(threads_.size());
    // one extra part for competition among threads
    const IdxT items_per_thread = len / (num_threads + 1);
    std::atomic<IdxT> cnt(items_per_thread * num_threads);

    // Wrap function: static range first, then compete for the leftover indices.
    auto wrapped_f = [&](IdxT start, IdxT end) {
      for (IdxT i = start; i < end; ++i) {
        f(i);
      }

      while (true) {
        IdxT i = cnt.fetch_add(1, std::memory_order_relaxed);
        if (i >= len) { break; }
        f(i);
      }
    };

    std::vector<std::future<void>> futures;
    futures.reserve(static_cast<std::size_t>(num_threads));
    for (int i = 0; i < num_threads; ++i) {
      IdxT start = i * items_per_thread;
      auto& task = tasks_[i];
      {
        std::lock_guard<std::mutex> lock(task.mtx);
        (void)lock;  // stop nvcc warning
        task.task = std::packaged_task<void()>([=] { wrapped_f(start, start + items_per_thread); });
        futures.push_back(task.task.get_future());
        task.has_task = true;
      }
      task.cv.notify_one();
    }

    // Wait for *all* workers before touching any result: they reference `f`, `cnt` and
    // `wrapped_f`, which live on this stack frame.
    for (auto& fut : futures) {
      fut.wait();
    }
    // Only now surface a failure, matching what the inline path does.
    for (auto& fut : futures) {
      fut.get();
    }
  }

 private:
  // One slot per worker; aligned to 64 bytes so neighbouring slots do not share a cache line
  // on typical hardware.
  struct alignas(64) Task_ {
    std::mutex mtx;
    std::condition_variable cv;
    bool has_task = false;
    std::packaged_task<void()> task;
  };

  // Body of every worker thread: sleep until a task is posted or the pool shuts down.
  void worker_loop(Task_& task)
  {
    while (true) {
      std::unique_lock<std::mutex> lock(task.mtx);
      task.cv.wait(lock,
                   [&] { return task.has_task || finished_.load(std::memory_order_relaxed); });
      if (finished_.load(std::memory_order_relaxed)) { break; }

      task.task();
      task.has_task = false;
    }
  }

  // Ask every started worker to exit and join it. Safe to call with zero threads.
  void shutdown()
  {
    finished_.store(true, std::memory_order_relaxed);
    for (std::size_t i = 0; i < threads_.size(); ++i) {
      auto& task = tasks_[i];
      {
        // Lock/unlock barrier: once we have held the mutex, the worker is either blocked in
        // wait() (and will get the notification) or has not checked the predicate yet (and
        // will observe finished_ == true). The lock must be released before join().
        std::lock_guard<std::mutex> lock(task.mtx);
        (void)lock;
      }
      task.cv.notify_one();
      threads_[i].join();
    }
    threads_.clear();
  }

  std::unique_ptr<Task_[]> tasks_;
  std::vector<std::thread> threads_;
  std::atomic<bool> finished_{false};
};
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../common/ann_types.hpp" +#include "faiss_cpu_wrapper.h" +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_base_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpu::BuildParam& param) +{ + param.nlist = conf.at("nlist"); + if (conf.contains("ratio")) { param.ratio = conf.at("ratio"); } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFFlat::BuildParam& param) +{ + parse_base_build_param(conf, param); +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFPQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.M = conf.at("M"); + if (conf.contains("usePrecomputed")) { + param.usePrecomputed = conf.at("usePrecomputed"); + } else { + param.usePrecomputed = false; + } + if (conf.contains("bitsPerCode")) { + param.bitsPerCode = conf.at("bitsPerCode"); + } else { + param.bitsPerCode = 8; + } +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpuIVFSQ::BuildParam& param) +{ + parse_base_build_param(conf, param); + param.quantizer_type = conf.at("quantizer_type"); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::FaissCpu::SearchParam& param) +{ + param.nprobe = conf.at("nprobe"); + if (conf.contains("refine_ratio")) { param.refine_ratio = conf.at("refine_ratio"); } + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const nlohmann::json& conf) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + return std::make_unique>(metric, dim, param); +} + +template class Algo> +std::unique_ptr> make_algo(raft::bench::ann::Metric metric, + int dim, + const 
nlohmann::json& conf, + const std::vector& dev_list) +{ + typename Algo::BuildParam param; + parse_build_param(conf, param); + + (void)dev_list; + return std::make_unique>(metric, dim, param); +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + std::unique_ptr> ann; + + if constexpr (std::is_same_v) { + raft::bench::ann::Metric metric = parse_metric(distance); + if (algo == "faiss_cpu_ivf_flat") { + ann = make_algo(metric, dim, conf, dev_list); + } else if (algo == "faiss_cpu_ivf_pq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_cpu_ivf_sq") { + ann = make_algo(metric, dim, conf); + } else if (algo == "faiss_cpu_flat") { + ann = std::make_unique>(metric, dim); + } + } + + if constexpr (std::is_same_v) {} + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "faiss_cpu_ivf_flat" || algo == "faiss_cpu_ivf_pq" || algo == "faiss_cpu_ivf_sq") { + auto param = std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } else if (algo == "faiss_cpu_flat") { + auto param = std::make_unique::AnnSearchParam>(); + return param; + } + // else + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h new file mode 
100644 index 0000000000..755fe9f197 --- /dev/null +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +{ + if (metric == raft::bench::ann::Metric::kInnerProduct) { + return faiss::METRIC_INNER_PRODUCT; + } else if (metric == raft::bench::ann::Metric::kEuclidean) { + return faiss::METRIC_L2; + } else { + throw std::runtime_error("faiss supports only metric type of inner product and L2"); + } +} +} // namespace + +namespace raft::bench::ann { + +template +class FaissCpu : public ANN { + public: + using typename ANN::AnnSearchParam; + struct SearchParam : public AnnSearchParam { + int nprobe; + float refine_ratio = 1.0; + int num_threads = omp_get_num_procs(); + }; + + struct BuildParam { + int nlist = 1; + int ratio = 2; + }; + + FaissCpu(Metric metric, int dim, const BuildParam& param) + : ANN(metric, dim), + metric_type_(parse_metric_type(metric)), + nlist_{param.nlist}, + training_sample_fraction_{1.0 / double(param.ratio)} + { + static_assert(std::is_same_v, "faiss support only float type"); + } + + virtual ~FaissCpu() noexcept {} + + void 
build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; + + void set_search_param(const AnnSearchParam& param) override; + + void init_quantizer(int dim) + { + if (this->metric_type_ == faiss::MetricType::METRIC_L2) { + this->quantizer_ = std::make_unique(dim); + } else if (this->metric_type_ == faiss::MetricType::METRIC_INNER_PRODUCT) { + this->quantizer_ = std::make_unique(dim); + } + } + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const final; + + AlgoProperty get_preference() const override + { + AlgoProperty property; + // to enable building big dataset which is larger than memory + property.dataset_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Host; + return property; + } + + protected: + template + void save_(const std::string& file) const; + + template + void load_(const std::string& file); + + std::unique_ptr index_; + std::unique_ptr quantizer_; + std::unique_ptr index_refine_; + faiss::MetricType metric_type_; + int nlist_; + double training_sample_fraction_; + + int num_threads_; + std::unique_ptr thread_pool_; +}; + +template +void FaissCpu::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + auto index_ivf = dynamic_cast(index_.get()); + if (index_ivf != nullptr) { + // set the min/max training size for clustering to use the whole provided training set. + double trainset_size = training_sample_fraction_ * static_cast(nrow); + double points_per_centroid = trainset_size / static_cast(nlist_); + int max_ppc = std::ceil(points_per_centroid); + int min_ppc = std::floor(points_per_centroid); + if (min_ppc < index_ivf->cp.min_points_per_centroid) { + RAFT_LOG_WARN( + "The suggested training set size %zu (data size %zu, training sample ratio %f) yields %d " + "points per cluster (n_lists = %d). 
This is smaller than the FAISS default " + "min_points_per_centroid = %d.", + static_cast(trainset_size), + nrow, + training_sample_fraction_, + min_ppc, + nlist_, + index_ivf->cp.min_points_per_centroid); + } + index_ivf->cp.max_points_per_centroid = max_ppc; + index_ivf->cp.min_points_per_centroid = min_ppc; + } + index_->train(nrow, dataset); // faiss::IndexFlat::train() will do nothing + assert(index_->is_trained); + index_->add(nrow, dataset); + index_refine_ = std::make_unique(this->index_.get(), dataset); +} + +template +void FaissCpu::set_search_param(const AnnSearchParam& param) +{ + auto search_param = dynamic_cast(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + dynamic_cast(index_.get())->nprobe = nprobe; + + if (search_param.refine_ratio > 1.0) { + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + + if (!thread_pool_ || num_threads_ != search_param.num_threads) { + num_threads_ = search_param.num_threads; + thread_pool_ = std::make_unique(num_threads_); + } +} + +template +void FaissCpu::search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream) const +{ + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + thread_pool_->submit( + [&](int i) { + // Use thread pool for batch size = 1. FAISS multi-threads internally for batch size > 1. 
+ index_->search(batch_size, queries, k, distances, reinterpret_cast(neighbors)); + }, + 1); +} + +template +template +void FaissCpu::save_(const std::string& file) const +{ + faiss::write_index(index_.get(), file.c_str()); +} + +template +template +void FaissCpu::load_(const std::string& file) +{ + index_ = std::unique_ptr(dynamic_cast(faiss::read_index(file.c_str()))); +} + +template +class FaissCpuIVFFlat : public FaissCpu { + public: + using typename FaissCpu::BuildParam; + + FaissCpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +template +class FaissCpuIVFPQ : public FaissCpu { + public: + struct BuildParam : public FaissCpu::BuildParam { + int M; + int bitsPerCode; + bool usePrecomputed; + }; + + FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, param.M, param.bitsPerCode, this->metric_type_); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +// TODO: Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 +template +class FaissCpuIVFSQ : public FaissCpu { + public: + struct BuildParam : public FaissCpu::BuildParam { + std::string quantizer_type; + }; + + FaissCpuIVFSQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) + { + faiss::ScalarQuantizer::QuantizerType qtype; + if (param.quantizer_type == "fp16") { + qtype = faiss::ScalarQuantizer::QT_fp16; + } else if 
(param.quantizer_type == "int8") { + qtype = faiss::ScalarQuantizer::QT_8bit; + } else { + throw std::runtime_error("FaissCpuIVFSQ supports only fp16 and int8 but got " + + param.quantizer_type); + } + + this->init_quantizer(dim); + this->index_ = std::make_unique( + this->quantizer_.get(), dim, param.nlist, qtype, this->metric_type_, true); + } + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override + { + this->template load_(file); + } +}; + +template +class FaissCpuFlat : public FaissCpu { + public: + FaissCpuFlat(Metric metric, int dim) + : FaissCpu(metric, dim, typename FaissCpu::BuildParam{}) + { + this->index_ = std::make_unique(dim, this->metric_type_); + } + + // class FaissCpu is more like a IVF class, so need special treating here + void set_search_param(const typename ANN::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + if (!this->thread_pool_ || this->num_threads_ != search_param.num_threads) { + this->num_threads_ = search_param.num_threads; + this->thread_pool_ = std::make_unique(this->num_threads_); + } + }; + + void save(const std::string& file) const override + { + this->template save_(file); + } + void load(const std::string& file) override { this->template load_(file); } +}; + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/faiss/faiss_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu similarity index 99% rename from cpp/bench/ann/src/faiss/faiss_benchmark.cu rename to cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu index 56885cce5c..8b04ba1980 100644 --- a/cpp/bench/ann/src/faiss/faiss_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -24,7 +24,7 @@ #include "../common/ann_types.hpp" #undef WARP_SIZE -#include "faiss_wrapper.h" +#include "faiss_gpu_wrapper.h" #define JSON_DIAGNOSTICS 1 #include diff --git a/cpp/bench/ann/src/faiss/faiss_wrapper.h 
b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h similarity index 71% rename from cpp/bench/ann/src/faiss/faiss_wrapper.h rename to cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 672c685b1f..4f13ff8a49 100644 --- a/cpp/bench/ann/src/faiss/faiss_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -35,6 +35,9 @@ #include #include +#include +#include + #include #include #include @@ -84,6 +87,7 @@ class FaissGpu : public ANN { struct SearchParam : public AnnSearchParam { int nprobe; float refine_ratio = 1.0; + auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } }; struct BuildParam { @@ -101,13 +105,16 @@ class FaissGpu : public ANN { RAFT_CUDA_TRY(cudaGetDevice(&device_)); RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); faiss_default_stream_ = gpu_resource_.getDefaultStream(device_); + raft::resource::set_cuda_stream(handle_, faiss_default_stream_); } virtual ~FaissGpu() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } void build(const T* dataset, size_t nrow, cudaStream_t stream = 0) final; - void set_search_param(const AnnSearchParam& param) override; + virtual void set_search_param(const FaissGpu::AnnSearchParam& param) {} + + void set_search_dataset(const T* dataset, size_t nrow) override { dataset_ = dataset; } // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 @@ -123,7 +130,7 @@ class FaissGpu : public ANN { AlgoProperty property; // to enable building big dataset which is larger than GPU memory property.dataset_memory_type = MemoryType::Host; - property.query_memory_type = MemoryType::Device; + property.query_memory_type = MemoryType::Host; return property; } @@ -142,13 +149,17 @@ class FaissGpu : public ANN { mutable faiss::gpu::StandardGpuResources gpu_resource_; std::unique_ptr index_; - std::unique_ptr index_refine_; + std::unique_ptr index_refine_{nullptr}; faiss::MetricType metric_type_; int nlist_; int device_; 
cudaEvent_t sync_{nullptr}; cudaStream_t faiss_default_stream_{nullptr}; double training_sample_fraction_; + std::unique_ptr search_params_; + const T* dataset_; + raft::device_resources handle_; + float refine_ratio_ = 1.0; }; template @@ -183,20 +194,6 @@ void FaissGpu::build(const T* dataset, size_t nrow, cudaStream_t stream) stream_wait(stream); } -template -void FaissGpu::set_search_param(const AnnSearchParam& param) -{ - auto search_param = dynamic_cast(param); - int nprobe = search_param.nprobe; - assert(nprobe <= nlist_); - dynamic_cast(index_.get())->setNumProbes(nprobe); - - if (search_param.refine_ratio > 1.0) { - this->index_refine_ = std::make_unique(this->index_.get()); - this->index_refine_.get()->k_factor = search_param.refine_ratio; - } -} - template void FaissGpu::search(const T* queries, int batch_size, @@ -205,10 +202,27 @@ void FaissGpu::search(const T* queries, float* distances, cudaStream_t stream) const { - static_assert(sizeof(size_t) == sizeof(faiss::Index::idx_t), - "sizes of size_t and faiss::Index::idx_t are different"); - index_->search( - batch_size, queries, k, distances, reinterpret_cast(neighbors)); + static_assert(sizeof(size_t) == sizeof(faiss::idx_t), + "sizes of size_t and faiss::idx_t are different"); + + if (this->refine_ratio_ > 1.0) { + // TODO: FAISS changed their search APIs to accept the search parameters as a struct object + // but their refine API doesn't allow the struct to be passed in. 
Once this is fixed, we + // need to re-enable refinement below + // index_refine_->search(batch_size, queries, k, distances, + // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: + // https://github.com/facebookresearch/faiss/issues/3118 + throw std::runtime_error( + "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " + "benchmarks for the time being."); + } else { + index_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->search_params_.get()); + } stream_wait(stream); } @@ -231,7 +245,13 @@ void FaissGpu::load_(const std::string& file) std::unique_ptr cpu_index(dynamic_cast(faiss::read_index(file.c_str()))); assert(cpu_index); - dynamic_cast(index_.get())->copyFrom(cpu_index.get()); + + try { + dynamic_cast(index_.get())->copyFrom(cpu_index.get()); + + } catch (const std::exception& e) { + std::cout << "Error loading index file: " << std::string(e.what()) << std::endl; + } } template @@ -247,6 +267,18 @@ class FaissGpuIVFFlat : public FaissGpu { &(this->gpu_resource_), dim, param.nlist, this->metric_type_, config); } + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + this->search_params_ = std::make_unique(faiss_search_params); + this->refine_ratio_ = search_param.refine_ratio; + } + void save(const std::string& file) const override { this->template save_(file); @@ -272,6 +304,7 @@ class FaissGpuIVFPQ : public FaissGpu { config.useFloat16LookupTables = param.useFloat16; config.usePrecomputedTables = param.usePrecomputed; config.device = this->device_; + this->index_ = std::make_unique(&(this->gpu_resource_), dim, @@ -282,6 +315,24 @@ class FaissGpuIVFPQ : public FaissGpu { config); } + void set_search_param(const 
typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + this->refine_ratio_ = search_param.refine_ratio; + faiss::IVFPQSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_unique(faiss_search_params); + + if (search_param.refine_ratio > 1.0) { + this->index_refine_ = + std::make_unique(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + } + void save(const std::string& file) const override { this->template save_(file); @@ -292,6 +343,8 @@ class FaissGpuIVFPQ : public FaissGpu { } }; +// TODO: Enable this in cmake +// ref: https://github.com/rapidsai/raft/issues/1876 template class FaissGpuIVFSQ : public FaissGpu { public: @@ -317,6 +370,24 @@ class FaissGpuIVFSQ : public FaissGpu { &(this->gpu_resource_), dim, param.nlist, qtype, this->metric_type_, true, config); } + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); + + faiss::IVFSearchParameters faiss_search_params; + faiss_search_params.nprobe = nprobe; + + this->search_params_ = std::make_unique(faiss_search_params); + this->refine_ratio_ = search_param.refine_ratio; + if (search_param.refine_ratio > 1.0) { + this->index_refine_ = + std::make_unique(this->index_.get(), this->dataset_); + this->index_refine_.get()->k_factor = search_param.refine_ratio; + } + } + void save(const std::string& file) const override { this->template save_( @@ -340,9 +411,14 @@ class FaissGpuFlat : public FaissGpu { this->index_ = std::make_unique( &(this->gpu_resource_), dim, this->metric_type_, config); } + void set_search_param(const typename FaissGpu::AnnSearchParam& param) override + { + auto search_param = 
dynamic_cast::SearchParam&>(param); + int nprobe = search_param.nprobe; + assert(nprobe <= nlist_); - // class FaissGpu is more like a IVF class, so need special treating here - void set_search_param(const typename ANN::AnnSearchParam&) override{}; + this->search_params_ = std::make_unique(); + } void save(const std::string& file) const override { @@ -356,4 +432,4 @@ class FaissGpuFlat : public FaissGpu { } // namespace raft::bench::ann -#endif +#endif \ No newline at end of file diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 7d96e54989..1af19a22cb 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -24,7 +24,6 @@ #include #include -#undef WARP_SIZE #include "hnswlib_wrapper.h" #define JSON_DIAGNOSTICS 1 #include diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 4d7b993aa1..921d72decc 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -31,9 +31,8 @@ #include #include -#include - #include "../common/ann_types.hpp" +#include "../common/thread_pool.hpp" #include namespace raft::bench::ann { @@ -53,112 +52,6 @@ struct hnsw_dist_t { using type = int; }; -class FixedThreadPool { - public: - FixedThreadPool(int num_threads) - { - if (num_threads < 1) { - throw std::runtime_error("num_threads must >= 1"); - } else if (num_threads == 1) { - return; - } - - tasks_ = new Task_[num_threads]; - - threads_.reserve(num_threads); - for (int i = 0; i < num_threads; ++i) { - threads_.emplace_back([&, i] { - auto& task = tasks_[i]; - while (true) { - std::unique_lock lock(task.mtx); - task.cv.wait(lock, - [&] { return task.has_task || finished_.load(std::memory_order_relaxed); }); - if (finished_.load(std::memory_order_relaxed)) { break; } - - task.task(); - task.has_task = false; - } - }); - } - } - - ~FixedThreadPool() - { - if 
(threads_.empty()) { return; } - - finished_.store(true, std::memory_order_relaxed); - for (unsigned i = 0; i < threads_.size(); ++i) { - auto& task = tasks_[i]; - std::lock_guard(task.mtx); - - task.cv.notify_one(); - threads_[i].join(); - } - - delete[] tasks_; - } - - template - void submit(Func f, IdxT len) - { - if (threads_.empty()) { - for (IdxT i = 0; i < len; ++i) { - f(i); - } - return; - } - - const int num_threads = threads_.size(); - // one extra part for competition among threads - const IdxT items_per_thread = len / (num_threads + 1); - std::atomic cnt(items_per_thread * num_threads); - - auto wrapped_f = [&](IdxT start, IdxT end) { - for (IdxT i = start; i < end; ++i) { - f(i); - } - - while (true) { - IdxT i = cnt.fetch_add(1, std::memory_order_relaxed); - if (i >= len) { break; } - f(i); - } - }; - - std::vector> futures; - futures.reserve(num_threads); - for (int i = 0; i < num_threads; ++i) { - IdxT start = i * items_per_thread; - auto& task = tasks_[i]; - { - std::lock_guard lock(task.mtx); - (void)lock; // stop nvcc warning - task.task = std::packaged_task([=] { wrapped_f(start, start + items_per_thread); }); - futures.push_back(task.task.get_future()); - task.has_task = true; - } - task.cv.notify_one(); - } - - for (auto& fut : futures) { - fut.wait(); - } - return; - } - - private: - struct alignas(64) Task_ { - std::mutex mtx; - std::condition_variable cv; - bool has_task = false; - std::packaged_task task; - }; - - Task_* tasks_; - std::vector threads_; - std::atomic finished_{false}; -}; - template class HnswLib : public ANN { public: @@ -172,7 +65,7 @@ class HnswLib : public ANN { using typename ANN::AnnSearchParam; struct SearchParam : public AnnSearchParam { int ef; - int num_threads = omp_get_num_procs(); + int num_threads = 1; }; HnswLib(Metric metric, int dim, const BuildParam& param); @@ -198,6 +91,8 @@ class HnswLib : public ANN { return property; } + void set_base_layer_only() { appr_alg_->base_layer_only = true; } + private: 
void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; @@ -210,6 +105,7 @@ class HnswLib : public ANN { int m_; int num_threads_; std::unique_ptr thread_pool_; + Objective metric_objective_; }; template @@ -253,7 +149,6 @@ void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) char buf[20]; std::time_t now = std::time(nullptr); std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now)); - printf("%s building %zu / %zu\n", buf, i, items_per_thread); fflush(stdout); } @@ -266,24 +161,31 @@ void HnswLib::build(const T* dataset, size_t nrow, cudaStream_t) template void HnswLib::set_search_param(const AnnSearchParam& param_) { - auto param = dynamic_cast(param_); - appr_alg_->ef_ = param.ef; - - if (!thread_pool_ || num_threads_ != param.num_threads) { - num_threads_ = param.num_threads; - thread_pool_ = std::make_unique(num_threads_); - } + auto param = dynamic_cast(param_); + appr_alg_->ef_ = param.ef; + metric_objective_ = param.metric_objective; + num_threads_ = param.num_threads; + + // Create a pool if multiple query threads have been set and the pool hasn't been created already + bool create_pool = (metric_objective_ == Objective::LATENCY && num_threads_ > 1 && !thread_pool_); + if (create_pool) { thread_pool_ = std::make_unique(num_threads_); } } template void HnswLib::search( const T* query, int batch_size, int k, size_t* indices, float* distances, cudaStream_t) const { - thread_pool_->submit( - [&](int i) { - get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); - }, - batch_size); + auto f = [&](int i) { + // hnsw can only handle a single vector at a time. 
+ get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k); + }; + if (metric_objective_ == Objective::LATENCY && num_threads_ > 1) { + thread_pool_->submit(f, batch_size); + } else { + for (int i = 0; i < batch_size; i++) { + f(i); + } + } } template diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h new file mode 100644 index 0000000000..1eb0e53cc5 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#define JSON_DIAGNOSTICS 1 +#include + +#undef WARP_SIZE +#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN +#include "raft_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +#include "raft_ivf_flat_wrapper.h" +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +extern template class raft::bench::ann::RaftIvfFlatGpu; +#endif +#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \ + defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +#include "raft_ivf_pq_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +extern template class raft::bench::ann::RaftIvfPQ; +#endif +#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +#include "raft_cagra_wrapper.h" +#endif +#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +extern template class raft::bench::ann::RaftCagra; +#endif + +#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) +{ + param.ivf_flat_params.n_probes = conf.at("nprobe"); +} +#endif + +#if defined(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || \ + defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::BuildParam& param) +{ + if 
(conf.contains("nlist")) { param.n_lists = conf.at("nlist"); } + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } + if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } + if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } + if (conf.contains("codebook_kind")) { + std::string kind = conf.at("codebook_kind"); + if (kind == "cluster") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; + } else if (kind == "subspace") { + param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + } else { + throw std::runtime_error("codebook_kind: '" + kind + + "', should be either 'cluster' or 'subspace'"); + } + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftIvfPQ::SearchParam& param) +{ + if (conf.contains("nprobe")) { param.pq_param.n_probes = conf.at("nprobe"); } + if (conf.contains("internalDistanceDtype")) { + std::string type = conf.at("internalDistanceDtype"); + if (type == "float") { + param.pq_param.internal_distance_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } else { + throw std::runtime_error("internalDistanceDtype: '" + type + + "', should be either 'float' or 'half'"); + } + } else { + // set half as default type + param.pq_param.internal_distance_dtype = CUDA_R_16F; + } + + if (conf.contains("smemLutDtype")) { + std::string type = conf.at("smemLutDtype"); + if (type == "float") { + param.pq_param.lut_dtype = CUDA_R_32F; + } else if (type == "half") { + param.pq_param.lut_dtype = CUDA_R_16F; + } else if (type == "fp8") { + param.pq_param.lut_dtype = CUDA_R_8U; + } else { + throw std::runtime_error("smemLutDtype: '" + type + + "', should be either 'float', 'half' or 'fp8'"); + } + } else { + // set half as default + param.pq_param.lut_dtype = CUDA_R_16F; + 
} + if (conf.contains("refine_ratio")) { + param.refine_ratio = conf.at("refine_ratio"); + if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } + } +} +#endif + +#if defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA) || defined(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) +template +void parse_build_param(const nlohmann::json& conf, + raft::neighbors::experimental::nn_descent::index_params& param) +{ + if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); } + if (conf.contains("intermediate_graph_degree")) { + param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + // we allow niter shorthand for max_iterations + if (conf.contains("niter")) { param.max_iterations = conf.at("niter"); } + if (conf.contains("max_iterations")) { param.max_iterations = conf.at("max_iterations"); } + if (conf.contains("termination_threshold")) { + param.termination_threshold = conf.at("termination_threshold"); + } +} + +nlohmann::json collect_conf_with_prefix(const nlohmann::json& conf, + const std::string& prefix, + bool remove_prefix = true) +{ + nlohmann::json out; + for (auto& i : conf.items()) { + if (i.key().compare(0, prefix.size(), prefix) == 0) { + auto new_key = remove_prefix ? 
i.key().substr(prefix.size()) : i.key(); + out[new_key] = i.value(); + } + } + return out; +} + +template +void parse_build_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::BuildParam& param) +{ + if (conf.contains("graph_degree")) { + param.cagra_params.graph_degree = conf.at("graph_degree"); + param.cagra_params.intermediate_graph_degree = param.cagra_params.graph_degree * 2; + } + if (conf.contains("intermediate_graph_degree")) { + param.cagra_params.intermediate_graph_degree = conf.at("intermediate_graph_degree"); + } + if (conf.contains("graph_build_algo")) { + if (conf.at("graph_build_algo") == "IVF_PQ") { + param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ; + } else if (conf.at("graph_build_algo") == "NN_DESCENT") { + param.cagra_params.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT; + } + } + nlohmann::json ivf_pq_build_conf = collect_conf_with_prefix(conf, "ivf_pq_build_"); + if (!ivf_pq_build_conf.empty()) { + raft::neighbors::ivf_pq::index_params bparam; + parse_build_param(ivf_pq_build_conf, bparam); + param.ivf_pq_build_params = bparam; + } + nlohmann::json ivf_pq_search_conf = collect_conf_with_prefix(conf, "ivf_pq_search_"); + if (!ivf_pq_search_conf.empty()) { + typename raft::bench::ann::RaftIvfPQ::SearchParam sparam; + parse_search_param(ivf_pq_search_conf, sparam); + param.ivf_pq_search_params = sparam.pq_param; + param.ivf_pq_refine_rate = sparam.refine_ratio; + } + nlohmann::json nn_descent_conf = collect_conf_with_prefix(conf, "nn_descent_"); + if (!nn_descent_conf.empty()) { + raft::neighbors::experimental::nn_descent::index_params nn_param; + nn_param.intermediate_graph_degree = 1.5 * param.cagra_params.intermediate_graph_degree; + parse_build_param(nn_descent_conf, nn_param); + if (nn_param.graph_degree != param.cagra_params.intermediate_graph_degree) { + nn_param.graph_degree = param.cagra_params.intermediate_graph_degree; + } + param.nn_descent_params = nn_param; + 
} +} + +raft::bench::ann::AllocatorType parse_allocator(std::string mem_type) +{ + if (mem_type == "device") { + return raft::bench::ann::AllocatorType::Device; + } else if (mem_type == "host_pinned") { + return raft::bench::ann::AllocatorType::HostPinned; + } else if (mem_type == "host_huge_page") { + return raft::bench::ann::AllocatorType::HostHugePage; + } + THROW( + "Invalid value for memory type %s, must be one of [\"device\", \"host_pinned\", " + "\"host_huge_page\"", + mem_type.c_str()); +} + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagra::SearchParam& param) +{ + if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } + if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } + if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } + if (conf.contains("algo")) { + if (conf.at("algo") == "single_cta") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA; + } else if (conf.at("algo") == "multi_cta") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA; + } else if (conf.at("algo") == "multi_kernel") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL; + } else if (conf.at("algo") == "auto") { + param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO; + } else { + std::string tmp = conf.at("algo"); + THROW("Invalid value for algo: %s", tmp.c_str()); + } + } + if (conf.contains("graph_memory_type")) { + param.graph_mem = parse_allocator(conf.at("graph_memory_type")); + } + if (conf.contains("internal_dataset_memory_type")) { + param.dataset_mem = parse_allocator(conf.at("internal_dataset_memory_type")); + } +} +#endif diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index a9ff6c2922..f8c65a2d6e 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ 
b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -16,170 +16,23 @@ #include "../common/ann_types.hpp" +#include "raft_ann_bench_param_parser.h" + #include #include #include +#include +#include #include #include #include #include -#undef WARP_SIZE -#ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN -#include "raft_wrapper.h" -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -#include "raft_ivf_flat_wrapper.h" -extern template class raft::bench::ann::RaftIvfFlatGpu; -extern template class raft::bench::ann::RaftIvfFlatGpu; -extern template class raft::bench::ann::RaftIvfFlatGpu; -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ -#include "raft_ivf_pq_wrapper.h" -extern template class raft::bench::ann::RaftIvfPQ; -extern template class raft::bench::ann::RaftIvfPQ; -extern template class raft::bench::ann::RaftIvfPQ; -#endif -#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA -#include "raft_cagra_wrapper.h" -extern template class raft::bench::ann::RaftCagra; -extern template class raft::bench::ann::RaftCagra; -extern template class raft::bench::ann::RaftCagra; -#endif #define JSON_DIAGNOSTICS 1 #include namespace raft::bench::ann { -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfFlatGpu::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfFlatGpu::SearchParam& param) -{ - param.ivf_flat_params.n_probes = conf.at("nprobe"); -} -#endif - -#ifdef RAFT_ANN_BENCH_USE_RAFT_IVF_PQ -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfPQ::BuildParam& param) -{ - param.n_lists = conf.at("nlist"); - if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } - if (conf.contains("ratio")) { 
param.kmeans_trainset_fraction = 1.0 / (double)conf.at("ratio"); } - if (conf.contains("pq_bits")) { param.pq_bits = conf.at("pq_bits"); } - if (conf.contains("pq_dim")) { param.pq_dim = conf.at("pq_dim"); } - if (conf.contains("codebook_kind")) { - std::string kind = conf.at("codebook_kind"); - if (kind == "cluster") { - param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER; - } else if (kind == "subspace") { - param.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; - } else { - throw std::runtime_error("codebook_kind: '" + kind + - "', should be either 'cluster' or 'subspace'"); - } - } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftIvfPQ::SearchParam& param) -{ - param.pq_param.n_probes = conf.at("nprobe"); - if (conf.contains("internalDistanceDtype")) { - std::string type = conf.at("internalDistanceDtype"); - if (type == "float") { - param.pq_param.internal_distance_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } else { - throw std::runtime_error("internalDistanceDtype: '" + type + - "', should be either 'float' or 'half'"); - } - } else { - // set half as default type - param.pq_param.internal_distance_dtype = CUDA_R_16F; - } - - if (conf.contains("smemLutDtype")) { - std::string type = conf.at("smemLutDtype"); - if (type == "float") { - param.pq_param.lut_dtype = CUDA_R_32F; - } else if (type == "half") { - param.pq_param.lut_dtype = CUDA_R_16F; - } else if (type == "fp8") { - param.pq_param.lut_dtype = CUDA_R_8U; - } else { - throw std::runtime_error("smemLutDtype: '" + type + - "', should be either 'float', 'half' or 'fp8'"); - } - } else { - // set half as default - param.pq_param.lut_dtype = CUDA_R_16F; - } - if (conf.contains("refine_ratio")) { - param.refine_ratio = conf.at("refine_ratio"); - if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } - } -} -#endif - 
-#ifdef RAFT_ANN_BENCH_USE_RAFT_CAGRA -template -void parse_build_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftCagra::BuildParam& param) -{ - if (conf.contains("graph_degree")) { - param.graph_degree = conf.at("graph_degree"); - param.intermediate_graph_degree = param.graph_degree * 2; - } - if (conf.contains("intermediate_graph_degree")) { - param.intermediate_graph_degree = conf.at("intermediate_graph_degree"); - } - if (conf.contains("graph_build_algo")) { - if (conf.at("graph_build_algo") == "IVF_PQ") { - param.build_algo = raft::neighbors::cagra::graph_build_algo::IVF_PQ; - } else if (conf.at("graph_build_algo") == "NN_DESCENT") { - param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT; - } - } -} - -template -void parse_search_param(const nlohmann::json& conf, - typename raft::bench::ann::RaftCagra::SearchParam& param) -{ - if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); } - if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); } - if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); } - if (conf.contains("algo")) { - if (conf.at("algo") == "single_cta") { - param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA; - } else if (conf.at("algo") == "multi_cta") { - param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_CTA; - } else if (conf.at("algo") == "multi_kernel") { - param.p.algo = raft::neighbors::experimental::cagra::search_algo::MULTI_KERNEL; - } else if (conf.at("algo") == "auto") { - param.p.algo = raft::neighbors::experimental::cagra::search_algo::AUTO; - } else { - std::string tmp = conf.at("algo"); - THROW("Invalid value for algo: %s", tmp.c_str()); - } - } -} -#endif - template std::unique_ptr> create_algo(const std::string& algo, const std::string& distance, @@ -222,6 +75,7 @@ std::unique_ptr> create_algo(const std::string& algo, ann = std::make_unique>(metric, dim, param); } 
#endif + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } return ann; @@ -232,7 +86,7 @@ std::unique_ptr::AnnSearchParam> create_search const std::string& algo, const nlohmann::json& conf) { #ifdef RAFT_ANN_BENCH_USE_RAFT_BFKNN - if (algo == "raft_bfknn") { + if (algo == "raft_brute_force") { auto param = std::make_unique::AnnSearchParam>(); return param; } @@ -259,6 +113,7 @@ std::unique_ptr::AnnSearchParam> create_search return param; } #endif + // else throw std::runtime_error("invalid algo: '" + algo + "'"); } @@ -271,5 +126,15 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; + rmm::mr::set_current_device_resource( + &pool_mr); // Updates the current device resource pointer to `pool_mr` + rmm::mr::device_memory_resource* mr = + rmm::mr::get_current_device_resource(); // Points to `pool_mr` + return raft::bench::ann::run_main(argc, argv); +} #endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu new file mode 100644 index 0000000000..ce6fa255b2 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../common/ann_types.hpp" +#include "raft_ann_bench_param_parser.h" +#include "raft_cagra_hnswlib_wrapper.h" + +#include + +#define JSON_DIAGNOSTICS 1 +#include + +namespace raft::bench::ann { + +template +void parse_search_param(const nlohmann::json& conf, + typename raft::bench::ann::RaftCagraHnswlib::SearchParam& param) +{ + param.ef = conf.at("ef"); + if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } +} + +template +std::unique_ptr> create_algo(const std::string& algo, + const std::string& distance, + int dim, + const nlohmann::json& conf, + const std::vector& dev_list) +{ + // stop compiler warning; not all algorithms support multi-GPU so it may not be used + (void)dev_list; + + raft::bench::ann::Metric metric = parse_metric(distance); + std::unique_ptr> ann; + + if constexpr (std::is_same_v or std::is_same_v) { + if (algo == "raft_cagra_hnswlib") { + typename raft::bench::ann::RaftCagraHnswlib::BuildParam param; + parse_build_param(conf, param); + ann = std::make_unique>(metric, dim, param); + } + } + + if (!ann) { throw std::runtime_error("invalid algo: '" + algo + "'"); } + + return ann; +} + +template +std::unique_ptr::AnnSearchParam> create_search_param( + const std::string& algo, const nlohmann::json& conf) +{ + if (algo == "raft_cagra_hnswlib") { + auto param = + std::make_unique::SearchParam>(); + parse_search_param(conf, *param); + return param; + } + + throw std::runtime_error("invalid algo: '" + algo + "'"); +} + +} // namespace raft::bench::ann + +REGISTER_ALGO_INSTANCE(float); +REGISTER_ALGO_INSTANCE(std::int8_t); +REGISTER_ALGO_INSTANCE(std::uint8_t); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + rmm::mr::pool_memory_resource 
pool_mr{&cuda_mr}; + rmm::mr::set_current_device_resource( + &pool_mr); // Updates the current device resource pointer to `pool_mr` + rmm::mr::device_memory_resource* mr = + rmm::mr::get_current_device_resource(); // Points to `pool_mr` + return raft::bench::ann::run_main(argc, argv); +} +#endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h new file mode 100644 index 0000000000..432caecfcc --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../hnswlib/hnswlib_wrapper.h" +#include "raft_cagra_wrapper.h" +#include + +namespace raft::bench::ann { + +template +class RaftCagraHnswlib : public ANN { + public: + using typename ANN::AnnSearchParam; + using BuildParam = typename RaftCagra::BuildParam; + using SearchParam = typename HnswLib::SearchParam; + + RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) + : ANN(metric, dim), + metric_(metric), + index_params_(param), + dimension_(dim), + handle_(cudaStreamPerThread) + { + } + + ~RaftCagraHnswlib() noexcept {} + + void build(const T* dataset, size_t nrow, cudaStream_t stream) final; + + void set_search_param(const AnnSearchParam& param) override; + + // TODO: if the number of results is less than k, the remaining elements of 'neighbors' + // will be filled with (size_t)-1 + void search(const T* queries, + int batch_size, + int k, + size_t* neighbors, + float* distances, + cudaStream_t stream = 0) const override; + + // to enable dataset access from GPU memory + AlgoProperty get_preference() const override + { + AlgoProperty property; + property.dataset_memory_type = MemoryType::HostMmap; + property.query_memory_type = MemoryType::Host; + return property; + } + void save(const std::string& file) const override; + void load(const std::string&) override; + + private: + raft::device_resources handle_; + Metric metric_; + BuildParam index_params_; + int dimension_; + + std::unique_ptr> cagra_build_; + std::unique_ptr> hnswlib_search_; + + Objective metric_objective_; +}; + +template +void RaftCagraHnswlib::build(const T* dataset, size_t nrow, cudaStream_t stream) +{ + if (not cagra_build_) { + cagra_build_ = std::make_unique>(metric_, dimension_, index_params_); + } + cagra_build_->build(dataset, nrow, stream); +} + +template +void RaftCagraHnswlib::set_search_param(const AnnSearchParam& param_) +{ + hnswlib_search_->set_search_param(param_); +} + +template +void 
RaftCagraHnswlib::save(const std::string& file) const +{ + cagra_build_->save_to_hnswlib(file); +} + +template +void RaftCagraHnswlib::load(const std::string& file) +{ + typename HnswLib::BuildParam param; + // these values don't matter since we don't build with HnswLib + param.M = 50; + param.ef_construction = 100; + if (not hnswlib_search_) { + hnswlib_search_ = std::make_unique>(metric_, dimension_, param); + } + hnswlib_search_->load(file); + hnswlib_search_->set_base_layer_only(); +} + +template +void RaftCagraHnswlib::search( + const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const +{ + hnswlib_search_->search(queries, batch_size, k, neighbors, distances); +} + +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 02aa2ea28b..a3e481ec5a 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -19,17 +19,19 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include #include #include -#include +#include +#include +#include #include #include #include @@ -40,8 +42,15 @@ #include "raft_ann_bench_utils.h" #include +#include "../common/cuda_huge_page_resource.hpp" +#include "../common/cuda_pinned_resource.hpp" + +#include +#include + namespace raft::bench::ann { +enum class AllocatorType { HostPinned, HostHugePage, Device }; template class RaftCagra : public ANN { public: @@ -49,23 +58,38 @@ class RaftCagra : public ANN { struct SearchParam : public AnnSearchParam { raft::neighbors::experimental::cagra::search_params p; + AllocatorType graph_mem = AllocatorType::Device; + AllocatorType dataset_mem = AllocatorType::Device; auto needs_dataset() const -> bool override { return true; } }; - using BuildParam = raft::neighbors::cagra::index_params; + struct BuildParam { + raft::neighbors::cagra::index_params cagra_params; + std::optional 
nn_descent_params = + std::nullopt; + std::optional ivf_pq_refine_rate = std::nullopt; + std::optional ivf_pq_build_params = std::nullopt; + std::optional ivf_pq_search_params = std::nullopt; + }; - RaftCagra(Metric metric, int dim, const BuildParam& param) + RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) : ANN(metric, dim), index_params_(param), dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + handle_(cudaStreamPerThread), + need_dataset_update_(true), + dataset_(make_device_matrix(handle_, 0, 0)), + graph_(make_device_matrix(handle_, 0, 0)), + input_dataset_v_(nullptr, 0, 0), + graph_mem_(AllocatorType::Device), + dataset_mem_(AllocatorType::Device) { - rmm::mr::set_current_device_resource(&mr_); - index_params_.metric = parse_metric_type(metric); + index_params_.cagra_params.metric = parse_metric_type(metric); + index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftCagra() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + ~RaftCagra() noexcept {} void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -92,35 +116,61 @@ class RaftCagra : public ANN { } void save(const std::string& file) const override; void load(const std::string&) override; + void save_to_hnswlib(const std::string& file) const; private: - // `mr_` must go first to make sure it dies last - rmm::mr::pool_memory_resource mr_; + inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) + { + switch (mem_type) { + case (AllocatorType::HostPinned): return &mr_pinned_; + case (AllocatorType::HostHugePage): return &mr_huge_page_; + default: return rmm::mr::get_current_device_resource(); + } + } + raft ::mr::cuda_pinned_resource mr_pinned_; + raft ::mr::cuda_huge_page_resource mr_huge_page_; raft::device_resources handle_; + AllocatorType graph_mem_; + AllocatorType dataset_mem_; BuildParam index_params_; + 
bool need_dataset_update_; raft::neighbors::cagra::search_params search_params_; std::optional> index_; int device_; int dimension_; + raft::device_matrix graph_; + raft::device_matrix dataset_; + raft::device_matrix_view input_dataset_v_; }; template void RaftCagra::build(const T* dataset, size_t nrow, cudaStream_t) { - switch (raft::spatial::knn::detail::utils::check_pointer_residency(dataset)) { - case raft::spatial::knn::detail::utils::pointer_residency::host_only: { - auto dataset_view = - raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); - index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view)); - return; - } - default: { - auto dataset_view = - raft::make_device_matrix_view(dataset, IdxT(nrow), dimension_); - index_.emplace(raft::neighbors::cagra::build(handle_, index_params_, dataset_view)); - return; - } + auto dataset_view = + raft::make_host_matrix_view(dataset, IdxT(nrow), dimension_); + + auto& params = index_params_.cagra_params; + + index_.emplace(raft::neighbors::cagra::detail::build(handle_, + params, + dataset_view, + index_params_.nn_descent_params, + index_params_.ivf_pq_refine_rate, + index_params_.ivf_pq_build_params, + index_params_.ivf_pq_search_params)); + return; +} + +inline std::string allocator_to_string(AllocatorType mem_type) +{ + if (mem_type == AllocatorType::Device) { + return "device"; + } else if (mem_type == AllocatorType::HostPinned) { + return "host_pinned"; + } else if (mem_type == AllocatorType::HostHugePage) { + return "host_huge_page"; } + return ""; } template @@ -128,19 +178,72 @@ void RaftCagra::set_search_param(const AnnSearchParam& param) { auto search_param = dynamic_cast(param); search_params_ = search_param.p; + if (search_param.graph_mem != graph_mem_) { + // Move graph to correct memory space + graph_mem_ = search_param.graph_mem; + RAFT_LOG_INFO("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); + // We create a new graph and copy to it from 
existing graph + auto mr = get_mr(graph_mem_); + auto new_graph = make_device_mdarray( + handle_, mr, make_extents(index_->graph().extent(0), index_->graph_degree())); + + raft::copy(new_graph.data_handle(), + index_->graph().data_handle(), + index_->graph().size(), + resource::get_cuda_stream(handle_)); + + index_->update_graph(handle_, make_const_mdspan(new_graph.view())); + // update_graph() only stores a view in the index. We need to keep the graph object alive. + graph_ = std::move(new_graph); + } + + if (search_param.dataset_mem != dataset_mem_ || need_dataset_update_) { + dataset_mem_ = search_param.dataset_mem; + + // First free up existing memory + dataset_ = make_device_matrix(handle_, 0, 0); + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Allocate space using the correct memory resource. + RAFT_LOG_INFO("moving dataset to new memory space: %s", + allocator_to_string(dataset_mem_).c_str()); + + auto mr = get_mr(dataset_mem_); + raft::neighbors::cagra::detail::copy_with_padding(handle_, dataset_, input_dataset_v_, mr); + + index_->update_dataset(handle_, make_const_mdspan(dataset_.view())); + + // Ideally, instead of dataset_.view(), we should pass a strided matrix view to update. + // See Issue https://github.com/rapidsai/raft/issues/1972 for details. + // auto dataset_view = make_device_strided_matrix_view( + // dataset_.data_handle(), dataset_.extent(0), this->dim_, dataset_.extent(1)); + // index_->update_dataset(handle_, dataset_view); + need_dataset_update_ = false; + } } template void RaftCagra::set_search_dataset(const T* dataset, size_t nrow) { - index_->update_dataset(handle_, - raft::make_host_matrix_view(dataset, nrow, this->dim_)); + // It can happen that we are re-using a previous algo object which already has + // the dataset set. Check if we need update. 
+ if (static_cast(input_dataset_v_.extent(0)) != nrow || + input_dataset_v_.data_handle() != dataset) { + input_dataset_v_ = make_device_matrix_view(dataset, nrow, this->dim_); + need_dataset_update_ = true; + } } template void RaftCagra::save(const std::string& file) const { - raft::neighbors::cagra::serialize(handle_, file, *index_, false); + raft::neighbors::cagra::serialize(handle_, file, *index_); +} + +template +void RaftCagra::save_to_hnswlib(const std::string& file) const +{ + raft::neighbors::cagra::serialize_to_hnswlib(handle_, file, *index_); } template @@ -175,7 +278,7 @@ void RaftCagra::search( neighbors_IdxT, batch_size * k, raft::cast_op(), - resource::get_cuda_stream(handle_)); + raft::resource::get_cuda_stream(handle_)); } handle_.sync_stream(); diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index da457e32f1..24b3c69bb6 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -53,18 +52,14 @@ class RaftIvfFlatGpu : public ANN { using BuildParam = raft::neighbors::ivf_flat::index_params; RaftIvfFlatGpu(Metric metric, int dim, const BuildParam& param) - : ANN(metric, dim), - index_params_(param), - dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + : ANN(metric, dim), index_params_(param), dimension_(dim) { index_params_.metric = parse_metric_type(metric); index_params_.conservative_memory_allocation = true; - rmm::mr::set_current_device_resource(&mr_); RAFT_CUDA_TRY(cudaGetDevice(&device_)); } - ~RaftIvfFlatGpu() noexcept { rmm::mr::set_current_device_resource(mr_.get_upstream()); } + ~RaftIvfFlatGpu() noexcept {} void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -91,8 +86,6 @@ class RaftIvfFlatGpu : public ANN { void load(const std::string&) override; private: - // `mr_` must go first to 
make sure it dies last - rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; BuildParam index_params_; raft::neighbors::ivf_flat::search_params search_params_; @@ -135,10 +128,9 @@ template void RaftIvfFlatGpu::search( const T* queries, int batch_size, int k, size_t* neighbors, float* distances, cudaStream_t) const { - rmm::mr::device_memory_resource* mr_ptr = &const_cast(this)->mr_; static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); raft::neighbors::ivf_flat::search( - handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances, mr_ptr); + handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); resource::sync_stream(handle_); return; } diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 8f1e43a706..e4004b0007 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -55,22 +54,14 @@ class RaftIvfPQ : public ANN { using BuildParam = raft::neighbors::ivf_pq::index_params; RaftIvfPQ(Metric metric, int dim, const BuildParam& param) - : ANN(metric, dim), - index_params_(param), - dimension_(dim), - mr_(rmm::mr::get_current_device_resource(), 1024 * 1024 * 1024ull) + : ANN(metric, dim), index_params_(param), dimension_(dim) { - rmm::mr::set_current_device_resource(&mr_); index_params_.metric = parse_metric_type(metric); RAFT_CUDA_TRY(cudaGetDevice(&device_)); RAFT_CUDA_TRY(cudaEventCreate(&sync_, cudaEventDisableTiming)); } - ~RaftIvfPQ() noexcept - { - RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); - rmm::mr::set_current_device_resource(mr_.get_upstream()); - } + ~RaftIvfPQ() noexcept { RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(sync_)); } void build(const T* dataset, size_t nrow, cudaStream_t stream) final; @@ -98,8 +89,6 @@ class RaftIvfPQ : public ANN { void load(const 
std::string&) override; private: - // `mr_` must go first to make sure it dies last - rmm::mr::pool_memory_resource mr_; raft::device_resources handle_; cudaEvent_t sync_{nullptr}; BuildParam index_params_; @@ -174,8 +163,7 @@ void RaftIvfPQ::search(const T* queries, raft::runtime::neighbors::ivf_pq::search( handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); - if (raft::spatial::knn::detail::utils::check_pointer_residency(dataset_.data_handle()) == - raft::spatial::knn::detail::utils::pointer_residency::device_only) { + if (raft::get_device_for_address(dataset_.data_handle()) >= 0) { auto queries_v = raft::make_device_matrix_view(queries, batch_size, index_->dim()); auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h index c8d98460b7..499bdf29a1 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -134,6 +134,8 @@ void RaftGpu::search(const T* queries, float* distances, cudaStream_t stream) const { + // TODO: Integrate new `raft::brute_force::index` (from + // https://github.com/rapidsai/raft/pull/1817) raft::spatial::knn::detail::fusedL2Knn(this->dim_, reinterpret_cast(neighbors), distances, diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..fe58453d0d 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp 
OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu @@ -143,10 +147,12 @@ if(BUILD_PRIMS_BENCH) bench/prims/neighbors/knn/brute_force_float_int64_t.cu bench/prims/neighbors/knn/brute_force_float_uint32_t.cu bench/prims/neighbors/knn/cagra_float_uint32_t.cu + bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu + bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu bench/prims/neighbors/refine_float_int64_t.cu diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu index effe2a55a4..129578c303 100644 --- a/cpp/bench/prims/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -50,10 +50,10 @@ struct KMeansBalanced : public fixture { constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } else { raft::random::uniform( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/core/bitset.cu b/cpp/bench/prims/core/bitset.cu index 5f44aa9af5..ce3136bcd5 100644 --- a/cpp/bench/prims/core/bitset.cu +++ b/cpp/bench/prims/core/bitset.cu @@ -44,7 +44,7 @@ struct bitset_bench : public fixture { loop_on_state(state, [this]() { auto my_bitset = raft::core::bitset( this->res, raft::make_const_mdspan(mask.view()), params.bitset_len); - my_bitset.test(res, raft::make_const_mdspan(queries.view()), outputs.view()); + my_bitset.test(this->res, raft::make_const_mdspan(queries.view()), outputs.view()); }); } diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... 
exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using 
copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu index 7d916e6ce0..3f74759665 100644 --- a/cpp/bench/prims/distance/kernels.cu +++ b/cpp/bench/prims/distance/kernels.cu @@ -46,9 +46,9 @@ struct GramMatrix : public fixture { A.resize(params.m * params.k, stream); B.resize(params.k * params.n, stream); C.resize(params.m * params.n, stream); - raft::random::Rng r(123456ULL); - r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream); - r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream); + raft::random::RngState rng(123456ULL); + raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0)); + raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0)); } ~GramMatrix() diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu index c804ecb3a1..19d78f4cd9 100644 --- a/cpp/bench/prims/distance/masked_nn.cu +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -46,10 +46,10 @@ struct Params { AdjacencyPattern pattern; }; // struct Params -__global__ void init_adj(AdjacencyPattern pattern, - int n, - raft::device_matrix_view adj, - raft::device_vector_view group_idxs) +RAFT_KERNEL 
init_adj(AdjacencyPattern pattern, + int n, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs) { int m = adj.extent(0); int num_groups = adj.extent(1); diff --git a/cpp/bench/prims/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu index f83953f8e4..1db23e4ca4 100644 --- a/cpp/bench/prims/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -42,7 +42,7 @@ struct rowNorm : public fixture { rowNorm(const norm_input& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu index ad9052a008..91319e774c 100644 --- a/cpp/bench/prims/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -41,7 +41,7 @@ struct rowNormalize : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu index ac0c612ee4..1b584e80c8 100644 --- a/cpp/bench/prims/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -42,7 +42,7 @@ struct reduce_cols_by_key : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.keys, stream), keys(p.cols, stream) { raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys); } void 
run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu index aa9c9a1f62..b68cefc274 100644 --- a/cpp/bench/prims/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -37,7 +37,7 @@ struct reduce_rows_by_key : public fixture { workspace(p.rows, stream) { raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu index a8f667257a..afee81aa00 100644 --- a/cpp/bench/prims/matrix/argmin.cu +++ b/cpp/bench/prims/matrix/argmin.cu @@ -40,7 +40,7 @@ struct Argmin : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu index ca6a2830bd..00a145ffa9 100644 --- a/cpp/bench/prims/matrix/gather.cu +++ b/cpp/bench/prims/matrix/gather.cu @@ -52,11 +52,11 @@ struct Gather : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); raft::random::uniformInt( handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); if constexpr (Conditional) { - raft::random::uniform(rng, stencil.data_handle(), params.map_length, T(-1), T(1), stream); + raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1)); } resource::sync_stream(handle, stream); } diff --git 
a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh index 63f6c14686..07e93a3473 100644 --- a/cpp/bench/prims/neighbors/cagra_bench.cuh +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -62,20 +62,20 @@ struct CagraBench : public fixture { constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniformInt( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } else { raft::random::uniform( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniform( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } // Generate random knn graph raft::random::uniformInt( - state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1, stream); + handle, state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1); auto metric = raft::distance::DistanceType::L2Expanded; diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index e580b20fdc..55865b577a 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -21,9 +21,12 @@ #include +#include #include #include +#include #include +#include #include #include @@ -31,6 +34,8 @@ #include #include +#include + #include namespace raft::bench::spatial { @@ -44,11 +49,14 @@ struct params { size_t n_queries; /** Number of nearest neighbours to find for every probe. */ size_t k; + /** Ratio of removed indices. 
*/ + double removed_ratio; }; inline auto operator<<(std::ostream& os, const params& p) -> std::ostream& { - os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k; + os << p.n_samples << "#" << p.n_dims << "#" << p.n_queries << "#" << p.k << "#" + << p.removed_ratio; return os; } @@ -221,6 +229,104 @@ struct brute_force_knn { } }; +template +struct ivf_flat_filter_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_flat::index_params index_params; + raft::neighbors::ivf_flat::search_params search_params; + raft::core::bitset removed_indices_bitset_; + params ps; + + ivf_flat_filter_knn(const raft::device_resources& handle, const params& ps, const ValT* data) + : ps(ps), removed_indices_bitset_(handle, ps.n_samples) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + index.emplace(raft::neighbors::ivf_flat::build( + handle, index_params, data, IdxT(ps.n_samples), uint32_t(ps.n_dims))); + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + auto queries_view = + raft::make_device_matrix_view(search_items, ps.n_queries, ps.n_dims); + auto neighbors_view = raft::make_device_matrix_view(out_idxs, ps.n_queries, ps.k); + auto distance_view = raft::make_device_matrix_view(out_dists, ps.n_queries, ps.k); + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + + if (ps.removed_ratio > 0) { + raft::neighbors::ivf_flat::search_with_filtering( + handle, search_params, *index, 
queries_view, neighbors_view, distance_view, filter); + } else { + raft::neighbors::ivf_flat::search( + handle, search_params, *index, queries_view, neighbors_view, distance_view); + } + } +}; + +template +struct ivf_pq_filter_knn { + using dist_t = float; + + std::optional> index; + raft::neighbors::ivf_pq::index_params index_params; + raft::neighbors::ivf_pq::search_params search_params; + raft::core::bitset removed_indices_bitset_; + params ps; + + ivf_pq_filter_knn(const raft::device_resources& handle, const params& ps, const ValT* data) + : ps(ps), removed_indices_bitset_(handle, ps.n_samples) + { + index_params.n_lists = 4096; + index_params.metric = raft::distance::DistanceType::L2Expanded; + auto data_view = raft::make_device_matrix_view(data, ps.n_samples, ps.n_dims); + index.emplace(raft::neighbors::ivf_pq::build(handle, index_params, data_view)); + auto removed_indices = + raft::make_device_vector(handle, ps.removed_ratio * ps.n_samples); + thrust::sequence( + resource::get_thrust_policy(handle), + thrust::device_pointer_cast(removed_indices.data_handle()), + thrust::device_pointer_cast(removed_indices.data_handle() + removed_indices.extent(0))); + removed_indices_bitset_.set(handle, removed_indices.view()); + } + + void search(const raft::device_resources& handle, + const ValT* search_items, + dist_t* out_dists, + IdxT* out_idxs) + { + search_params.n_probes = 20; + auto queries_view = + raft::make_device_matrix_view(search_items, ps.n_queries, ps.n_dims); + auto neighbors_view = + raft::make_device_matrix_view(out_idxs, ps.n_queries, ps.k); + auto distance_view = + raft::make_device_matrix_view(out_dists, ps.n_queries, ps.k); + auto filter = raft::neighbors::filtering::bitset_filter(removed_indices_bitset_.view()); + + if (ps.removed_ratio > 0) { + raft::neighbors::ivf_pq::search_with_filtering( + handle, search_params, *index, queries_view, neighbors_view, distance_view, filter); + } else { + raft::neighbors::ivf_pq::search( + handle, search_params, 
*index, queries_view, neighbors_view, distance_view); + } + } +}; + template struct knn : public fixture { explicit knn(const params& p, const TransferStrategy& strategy, const Scope& scope) @@ -260,9 +366,9 @@ struct knn : public fixture { constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { - raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniformInt(handle, state, vec.data(), n, kRangeMin, kRangeMax); } else { - raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniform(handle, state, vec.data(), n, kRangeMin, kRangeMax); } } @@ -378,8 +484,15 @@ struct knn : public fixture { }; inline const std::vector kInputs{ - {2000000, 128, 1000, 32}, {10000000, 128, 1000, 32}, {10000, 8192, 1000, 32}}; - + {2000000, 128, 1000, 32, 0}, {10000000, 128, 1000, 32, 0}, {10000, 8192, 1000, 32, 0}}; + +const std::vector kInputsFilter = + raft::util::itertools::product({size_t(10000000)}, // n_samples + {size_t(128)}, // n_dim + {size_t(1000)}, // n_queries + {size_t(255)}, // k + {0.0, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64} // removed_ratio + ); inline const std::vector kAllStrategies{ TransferStrategy::NO_COPY, TransferStrategy::MAP_PINNED, TransferStrategy::MANAGED}; inline const std::vector kNoCopyOnly{TransferStrategy::NO_COPY}; diff --git a/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu new file mode 100644 index 0000000000..bf5118ceae --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_flat_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu new file mode 100644 index 0000000000..9534515cbb --- /dev/null +++ b/cpp/bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY // Enable instantiation of search with filter +#include "../knn.cuh" + +namespace raft::bench::spatial { + +KNN_REGISTER(float, int64_t, ivf_pq_filter_knn, kInputsFilter, kNoCopyOnly, kScopeFull); + +} // namespace raft::bench::spatial diff --git a/cpp/bench/prims/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu index c9dcae6985..634c749a54 100644 --- a/cpp/bench/prims/sparse/convert_csr.cu +++ b/cpp/bench/prims/sparse/convert_csr.cu @@ -30,7 +30,7 @@ struct bench_param { }; template -__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) +RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) { index_t r = blockDim.y * blockIdx.y + threadIdx.y; index_t c = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/cmake/patches/ggnn.patch b/cpp/cmake/patches/ggnn.patch index 95e1aaff4b..21df3bd738 100644 --- a/cpp/cmake/patches/ggnn.patch +++ b/cpp/cmake/patches/ggnn.patch @@ -1,3 +1,26 @@ +diff --git a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +index 890420e..d792903 100644 +--- a/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh ++++ b/include/ggnn/cache/cuda_simple_knn_sym_cache.cuh +@@ -62,7 +62,7 @@ struct SimpleKNNSymCache { + const ValueT dist_half) + : dist_query(dist_query), dist_half(dist_half) {} + +- __device__ __forceinline__ DistQueryAndHalf() {} ++ DistQueryAndHalf() = default; + }; + + struct DistanceAndNorm { +@@ -98,8 +98,7 @@ struct SimpleKNNSymCache { + KeyT cache; + DistQueryAndHalf dist; + bool flag; +- +- __device__ __forceinline__ SyncTempStorage() {} ++ SyncTempStorage() = default; + }; + + public: diff --git a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh b/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh index 8cbaf0d..6eb72ac 100644 --- a/include/ggnn/cuda_knn_ggnn_gpu_instance.cuh diff --git a/cpp/cmake/patches/hnswlib.patch 
b/cpp/cmake/patches/hnswlib.patch new file mode 100644 index 0000000000..32c1537c58 --- /dev/null +++ b/cpp/cmake/patches/hnswlib.patch @@ -0,0 +1,130 @@ +diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h +index e95e0b5..f0fe50a 100644 +--- a/hnswlib/hnswalg.h ++++ b/hnswlib/hnswalg.h +@@ -3,6 +3,7 @@ + #include "visited_list_pool.h" + #include "hnswlib.h" + #include ++#include + #include + #include + #include +@@ -16,6 +17,8 @@ namespace hnswlib { + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only{false}; ++ int num_seeds=32; + static const tableint max_update_element_locks = 65536; + HierarchicalNSW(SpaceInterface *s) { + } +@@ -56,7 +59,7 @@ namespace hnswlib { + visited_list_pool_ = new VisitedListPool(1, max_elements); + + //initializations for special treatment of the first node +- enterpoint_node_ = -1; ++ enterpoint_node_ = std::numeric_limits::max(); + maxlevel_ = -1; + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); +@@ -527,7 +530,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +@@ -1067,7 +1070,7 @@ namespace hnswlib { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1119,28 +1122,41 @@ namespace hnswlib { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while 
(changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; ++ } ++ } ++ } ++ else{ ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- if (d < curdist) { +- curdist = d; +- currObj = cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } + } + } + } +diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h +index 5e1a4a5..4195ebd 100644 +--- a/hnswlib/visited_list_pool.h ++++ b/hnswlib/visited_list_pool.h +@@ -3,6 +3,7 @@ + #include + #include + #include ++#include + + namespace hnswlib { + typedef unsigned short int vl_type; +@@ -14,7 +15,7 @@ namespace hnswlib { + unsigned int numelements; + + VisitedList(int 
numelements1) { +- curV = -1; ++ curV = std::numeric_limits::max(); + numelements = numelements1; + mass = new vl_type[numelements]; + } diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 853fd7c52f..0123c4b07a 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -70,10 +70,12 @@ function(find_and_configure_cutlass) # Tell cmake where it can find the generated NvidiaCutlass-config.cmake we wrote. include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] raft-exports + INSTALL NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}/../]=] + EXPORT_SET raft-exports ) rapids_export_find_package_root( - BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-exports + BUILD NvidiaCutlass [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET raft-exports ) endfunction() diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index b7c132f2f1..85829554ae 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -15,7 +15,7 @@ #============================================================================= function(find_and_configure_faiss) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -30,58 +30,81 @@ function(find_and_configure_faiss) set(CPM_DOWNLOAD_faiss ON) endif() + include(cmake/modules/FindAVX.cmake) + + # Link against AVX CPU lib if it exists + set(RAFT_FAISS_GLOBAL_TARGETS faiss::faiss) + set(RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss) + set(RAFT_FAISS_OPT_LEVEL "generic") + if(CXX_AVX_FOUND) + set(RAFT_FAISS_OPT_LEVEL "avx2") + list(APPEND RAFT_FAISS_GLOBAL_TARGETS faiss::faiss_avx2) + list(APPEND 
RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2) + endif() + rapids_cpm_find(faiss ${PKG_VERSION} - GLOBAL_TARGETS faiss::faiss + GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} CPM_ARGS GIT_REPOSITORY ${PKG_REPOSITORY} GIT_TAG ${PKG_PINNED_TAG} EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} OPTIONS + "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" "FAISS_ENABLE_PYTHON OFF" - "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" - "FAISS_ENABLE_GPU ON" + "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" "BUILD_TESTING OFF" "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" - "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" ) if(TARGET faiss AND NOT TARGET faiss::faiss) add_library(faiss::faiss ALIAS faiss) endif() - if(faiss_ADDED) + if(CXX_AVX_FOUND) + + if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2) + add_library(faiss::faiss_avx2 ALIAS faiss_avx2) + endif() + endif() + + + if(faiss_ADDED) rapids_export(BUILD faiss EXPORT_SET faiss-targets - GLOBAL_TARGETS faiss + GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS} NAMESPACE faiss::) endif() # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it - rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) - rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS faiss::faiss faiss) + rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) + rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) # Tell cmake where it can find the generated faiss-config.cmake we wrote. 
include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] raft-ann-bench-exports) + rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET raft-ann-bench-exports) endfunction() if(NOT RAFT_FAISS_GIT_TAG) # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk-v1.7.0) + set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk) # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) endif() if(NOT RAFT_FAISS_GIT_REPOSITORY) # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_REPOSITORY https://github.com/trxcllnt/faiss.git) + set(RAFT_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git) # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) endif() -find_and_configure_faiss(VERSION 1.7.0 +find_and_configure_faiss(VERSION 1.7.4 REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} PINNED_TAG ${RAFT_FAISS_GIT_TAG} BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} - EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}) \ No newline at end of file + EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL} + ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU}) + diff --git a/cpp/cmake/thirdparty/get_fmt.cmake b/cpp/cmake/thirdparty/get_fmt.cmake new file mode 100644 index 0000000000..c06f8a78bb --- /dev/null +++ b/cpp/cmake/thirdparty/get_fmt.cmake @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone fmt +function(find_and_configure_fmt) + + include(${rapids-cmake-dir}/cpm/fmt.cmake) + rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) +endfunction() + +find_and_configure_fmt() \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 94033e8333..a4ceacae38 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -26,6 +26,11 @@ function(find_and_configure_hnswlib) COMMAND git clone --branch=v0.6.2 https://github.com/nmslib/hnswlib.git hnswlib-src WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps ) + message("SOURCE ${CMAKE_CURRENT_SOURCE_DIR}") + execute_process ( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.patch + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/_deps/hnswlib-src + ) endif () include(cmake/modules/FindAVX.cmake) diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake new file mode 100644 index 0000000000..7be7804c7e --- /dev/null +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone spdlog +function(find_and_configure_spdlog) + + include(${rapids-cmake-dir}/cpm/spdlog.cmake) + rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports) + rapids_export_package(BUILD spdlog rmm-exports) + + if(spdlog_ADDED) + rapids_export( + BUILD spdlog + EXPORT_SET spdlog + GLOBAL_TARGETS spdlog spdlog_header_only + NAMESPACE spdlog::) + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports) + endif() +endfunction() + +find_and_configure_spdlog() \ No newline at end of file diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index cb990d668e..f1dcfa9db7 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "RAFT C++ API" # could be handy for archiving the generated documentation or if some version # control system is used.
-PROJECT_NUMBER = "23.10" +PROJECT_NUMBER = "23.12" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/include/raft/cluster/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh index 624e67b7fa..f2c83abdd3 100644 --- a/cpp/include/raft/cluster/detail/agglomerative.cuh +++ b/cpp/include/raft/cluster/detail/agglomerative.cuh @@ -155,9 +155,7 @@ void build_dendrogram_host(raft::resources const& handle, } template -__global__ void write_levels_kernel(const value_idx* children, - value_idx* parents, - value_idx n_vertices) +RAFT_KERNEL write_levels_kernel(const value_idx* children, value_idx* parents, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { @@ -179,12 +177,12 @@ __global__ void write_levels_kernel(const value_idx* children, * @param labels */ template -__global__ void inherit_labels(const value_idx* children, - const value_idx* levels, - std::size_t n_leaves, - value_idx* labels, - int cut_level, - value_idx n_vertices) +RAFT_KERNEL inherit_labels(const value_idx* children, + const value_idx* levels, + std::size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh index ef046ab4ff..49ac6ae704 100644 --- a/cpp/include/raft/cluster/detail/connectivities.cuh +++ b/cpp/include/raft/cluster/detail/connectivities.cuh @@ -107,7 +107,7 @@ struct distance_graph_impl -__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz) +RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz) { value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh 
b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index ade3a6e348..593d7d8fa9 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -434,7 +434,7 @@ template -__global__ void __launch_bounds__((WarpSize * BlockDimY)) +__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL adjust_centers_kernel(MathT* centers, // [n_clusters, dim] IdxT n_clusters, IdxT dim, diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index 5a1479a81f..0b5dec4e19 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -92,12 +92,12 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) +RAFT_KERNEL computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index index_type_t i; @@ -173,11 +173,11 @@ static __global__ void computeDistances(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { // Loop index index_type_t i, j; @@ -233,11 +233,11 @@ static __global__ void minDistances(index_type_t n, * @param code_new Index associated with new centroid. 
*/ template -static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) +RAFT_KERNEL minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -275,9 +275,9 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { @@ -308,10 +308,10 @@ static __global__ void computeClusterSizes(index_type_t n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) +RAFT_KERNEL divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices index_type_t gidx, gidy; diff --git a/cpp/include/raft/common/detail/scatter.cuh b/cpp/include/raft/common/detail/scatter.cuh index 87a8826aa6..6e7522853e 100644 --- a/cpp/include/raft/common/detail/scatter.cuh +++ b/cpp/include/raft/common/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ namespace raft::detail { template -__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +RAFT_KERNEL scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) { typedef TxN_t DataVec; typedef TxN_t IdxVec; diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index 6747c5fab0..552c2e9ac5 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -16,10 +16,13 @@ #pragma once +#include // native_popc #include +#include #include #include #include +#include #include #include @@ -39,7 +42,7 @@ namespace raft::core { */ template struct bitset_view { - index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8; + static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len) : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len} @@ -69,12 +72,40 @@ struct bitset_view { const bool is_bit_set = (bit_element & (bitset_t{1} << bit_index)) != 0; return is_bit_set; } + /** + * @brief Device function to test if a given index is set in the bitset. + * + * @param sample_index Single index to test + * @return bool True if index has not been unset in the bitset + */ + inline _RAFT_DEVICE auto operator[](const index_t sample_index) const -> bool + { + return test(sample_index); + } + /** + * @brief Device function to set a given index to set_value in the bitset. 
+ * + * @param sample_index index to set + * @param set_value Value to set the bit to (true or false) + */ + inline _RAFT_DEVICE void set(const index_t sample_index, bool set_value) const + { + const index_t bit_element = sample_index / bitset_element_size; + const index_t bit_index = sample_index % bitset_element_size; + const bitset_t bitmask = bitset_t{1} << bit_index; + if (set_value) { + atomicOr(bitset_ptr_ + bit_element, bitmask); + } else { + const bitset_t bitmask2 = ~bitmask; + atomicAnd(bitset_ptr_ + bit_element, bitmask2); + } + } /** * @brief Get the device pointer to the bitset. */ - inline _RAFT_HOST_DEVICE auto data_handle() -> bitset_t* { return bitset_ptr_; } - inline _RAFT_HOST_DEVICE auto data_handle() const -> const bitset_t* { return bitset_ptr_; } + inline _RAFT_HOST_DEVICE auto data() -> bitset_t* { return bitset_ptr_; } + inline _RAFT_HOST_DEVICE auto data() const -> const bitset_t* { return bitset_ptr_; } /** * @brief Get the number of bits of the bitset representation. */ @@ -114,7 +145,7 @@ struct bitset_view { */ template struct bitset { - index_t static constexpr const bitset_element_size = sizeof(bitset_t) * 8; + static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; /** * @brief Construct a new bitset object with a list of indices to unset. @@ -130,13 +161,9 @@ struct bitset { bool default_value = true) : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), raft::resource::get_cuda_stream(res)}, - bitset_len_{bitset_len}, - default_value_{default_value} + bitset_len_{bitset_len} { - cudaMemsetAsync(bitset_.data(), - default_value ? 
0xff : 0x00, - n_elements() * sizeof(bitset_t), - resource::get_cuda_stream(res)); + reset(res, default_value); set(res, mask_index, !default_value); } @@ -150,13 +177,9 @@ struct bitset { bitset(const raft::resources& res, index_t bitset_len, bool default_value = true) : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), resource::get_cuda_stream(res)}, - bitset_len_{bitset_len}, - default_value_{default_value} + bitset_len_{bitset_len} { - cudaMemsetAsync(bitset_.data(), - default_value ? 0xff : 0x00, - n_elements() * sizeof(bitset_t), - resource::get_cuda_stream(res)); + reset(res, default_value); } // Disable copy constructor bitset(const bitset&) = delete; @@ -181,8 +204,8 @@ struct bitset { /** * @brief Get the device pointer to the bitset. */ - inline auto data_handle() -> bitset_t* { return bitset_.data(); } - inline auto data_handle() const -> const bitset_t* { return bitset_.data(); } + inline auto data() -> bitset_t* { return bitset_.data(); } + inline auto data() const -> const bitset_t* { return bitset_.data(); } /** * @brief Get the number of bits of the bitset representation. */ @@ -207,8 +230,12 @@ struct bitset { } /** @brief Resize the bitset. If the requested size is larger, new memory is allocated and set to - * the default value. */ - void resize(const raft::resources& res, index_t new_bitset_len) + * the default value. + * @param res RAFT resources + * @param new_bitset_len new size of the bitset + * @param default_value default value to initialize the new bits to + */ + void resize(const raft::resources& res, index_t new_bitset_len, bool default_value = true) { auto old_size = raft::ceildiv(bitset_len_, bitset_element_size); auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size); @@ -216,10 +243,11 @@ struct bitset { bitset_len_ = new_bitset_len; if (old_size < new_size) { // If the new size is larger, set the new bits to the default value - cudaMemsetAsync(bitset_.data() + old_size, - default_value_ ? 
0xff : 0x00, - (new_size - old_size) * sizeof(bitset_t), - resource::get_cuda_stream(res)); + + thrust::fill_n(resource::get_thrust_policy(res), + bitset_.data() + old_size, + new_size - old_size, + default_value ? ~bitset_t{0} : bitset_t{0}); } } @@ -255,25 +283,16 @@ struct bitset { raft::device_vector_view mask_index, bool set_value = false) { - auto* bitset_ptr = this->data_handle(); + auto this_bitset_view = view(); thrust::for_each_n(resource::get_thrust_policy(res), mask_index.data_handle(), mask_index.extent(0), - [bitset_ptr, set_value] __device__(const index_t sample_index) { - const index_t bit_element = sample_index / bitset_element_size; - const index_t bit_index = sample_index % bitset_element_size; - const bitset_t bitmask = bitset_t{1} << bit_index; - if (set_value) { - atomicOr(bitset_ptr + bit_element, bitmask); - } else { - const bitset_t bitmask2 = ~bitmask; - atomicAnd(bitset_ptr + bit_element, bitmask2); - } + [this_bitset_view, set_value] __device__(const index_t sample_index) { + this_bitset_view.set(sample_index, set_value); }); } /** * @brief Flip all the bits in a bitset. - * * @param res RAFT resources */ void flip(const raft::resources& res) @@ -289,19 +308,90 @@ struct bitset { * @brief Reset the bits in a bitset. * * @param res RAFT resources + * @param default_value Value to set the bits to (true or false) */ - void reset(const raft::resources& res) + void reset(const raft::resources& res, bool default_value = true) { - cudaMemsetAsync(bitset_.data(), - default_value_ ? 0xff : 0x00, - n_elements() * sizeof(bitset_t), - resource::get_cuda_stream(res)); + thrust::fill_n(resource::get_thrust_policy(res), + bitset_.data(), + n_elements(), + default_value ? ~bitset_t{0} : bitset_t{0}); } + /** + * @brief Returns the number of bits set to true in count_gpu_scalar. 
+ * + * @param[in] res RAFT resources + * @param[out] count_gpu_scalar Device scalar to store the count + */ + void count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) + { + auto n_elements_ = n_elements(); + auto count_gpu = + raft::make_device_vector_view(count_gpu_scalar.data_handle(), 1); + auto bitset_matrix_view = raft::make_device_matrix_view( + bitset_.data(), n_elements_, 1); + + bitset_t n_last_element = (bitset_len_ % bitset_element_size); + bitset_t last_element_mask = + n_last_element ? (bitset_t)((bitset_t{1} << n_last_element) - bitset_t{1}) : ~bitset_t{0}; + raft::linalg::coalesced_reduction( + res, + bitset_matrix_view, + count_gpu, + index_t{0}, + false, + [last_element_mask, n_elements_] __device__(bitset_t element, index_t index) { + index_t result = 0; + if constexpr (bitset_element_size == 64) { + if (index == n_elements_ - 1) + result = index_t(raft::detail::popc(element & last_element_mask)); + else + result = index_t(raft::detail::popc(element)); + } else { // Needed because popc is not overloaded for 16 and 8 bit elements + if (index == n_elements_ - 1) + result = index_t(raft::detail::popc(uint32_t{element} & last_element_mask)); + else + result = index_t(raft::detail::popc(uint32_t{element})); + } + + return result; + }); + } + /** + * @brief Returns the number of bits set to true. + * + * @param res RAFT resources + * @return index_t Number of bits set to true + */ + auto count(const raft::resources& res) -> index_t + { + auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); + count(res, count_gpu_scalar.view()); + index_t count_cpu = 0; + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); + resource::sync_stream(res); + return count_cpu; + } + /** + * @brief Checks if any of the bits are set to true in the bitset. 
+ * @param res RAFT resources + */ + bool any(const raft::resources& res) { return count(res) > 0; } + /** + * @brief Checks if all of the bits are set to true in the bitset. + * @param res RAFT resources + */ + bool all(const raft::resources& res) { return count(res) == bitset_len_; } + /** + * @brief Checks if none of the bits are set to true in the bitset. + * @param res RAFT resources + */ + bool none(const raft::resources& res) { return count(res) == 0; } private: raft::device_uvector bitset_; index_t bitset_len_; - bool default_value_; }; /** @} */ diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f256f9ea0f --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. 
+ * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. 
+ */ +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..0a16b742a2 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. 
For arbitrary copies of this + * kind, `#include <raft/core/copy.cuh>` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy. Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. 
+ */ +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp new file mode 100644 index 0000000000..07fb95a921 --- /dev/null +++ b/cpp/include/raft/core/cuda_support.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +namespace raft { +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; +#else +auto constexpr static const CUDA_ENABLED = false; +#endif +} // namespace raft diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp new file mode 100644 index 0000000000..1a9a4d004d --- /dev/null +++ b/cpp/include/raft/core/detail/copy.hpp @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float 
|| both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + { + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary 
restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. 
+ */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. + */ +template + +RAFT_KERNEL mdspan_copy_kernel(DstType dst, SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. 
+ __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? 
threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif + + } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA + // Perform necessary changes in layout on device, then copy to final + // destination on host + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::custom_kernel_allowed) { +#ifdef __CUDACC__ + config::check_for_unique_dst(dst); + auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_copy_kernel<<>>(dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. 
+ auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index bb4207938b..364914043e 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -86,6 +86,38 @@ // as a weak symbol rather than a global." #define RAFT_WEAK_FUNCTION __attribute__((weak)) +// The RAFT_HIDDEN_FUNCTION specifies that the function will be hidden +// and therefore not callable by consumers of raft when compiled as +// a shared library. +// +// Hidden visibility also ensures that the linker doesn't de-duplicate the +// symbol across multiple `.so`. This allows multiple libraries to embed raft +// without issue +#define RAFT_HIDDEN_FUNCTION __attribute__((visibility("hidden"))) + +// The RAFT_KERNEL specifies that a kernel has hidden visibility +// +// Raft needs to ensure that its __global__ function +// templates have hidden visibility ( default is weak visibility). +// +// When kernels have weak visibility it means that if two dynamic libraries +// both contain identical instantiations of a RAFT template, then the linker +// will discard one of the two instantiations and use only one of them. +// +// Due to unique requirements of how CUDA works, this de-duplication +// can lead to the wrong kernels being called ( SM version being wrong ), +// silently no kernel being called at all, or cuda runtime errors being +// thrown. 
+// +// https://github.com/rapidsai/raft/issues/1722 +#if defined(__CUDACC_RDC__) +#define RAFT_KERNEL RAFT_HIDDEN_FUNCTION __global__ void +#elif defined(_RAFT_HAS_CUDA) +#define RAFT_KERNEL static __global__ void +#else +#define RAFT_KERNEL static void +#endif + /** * Some macro magic to remove optional parentheses of a macro argument. * See https://stackoverflow.com/a/62984543 diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp index 192d160d45..610945a76c 100644 --- a/cpp/include/raft/core/kvp.hpp +++ b/cpp/include/raft/core/kvp.hpp @@ -36,7 +36,7 @@ struct KeyValuePair { Value value; ///< Item value /// Constructor - RAFT_INLINE_FUNCTION KeyValuePair() {} + KeyValuePair() = default; #ifdef _RAFT_HAS_CUDA /// Conversion Constructor to allow integration w/ cub diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index f1a1adb916..15414b9af3 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -312,6 +312,41 @@ RAFT_INLINE_FUNCTION auto 
unravel_index(Idx idx, /** @} */ +/** + * @defgroup mdspan_contiguous Whether the strides imply a contiguous layout. + * @{ + */ + +/** + * @brief Whether the strides imply a c-contiguous layout. + */ +template +[[nodiscard]] auto is_c_contiguous(Extents const& extents, Strides const& strides) -> bool +{ + typename Extents::index_type stride = 1; + for (auto r = extents.rank(); r > 0; r--) { + if (stride != strides[r - 1]) { return false; } + stride *= extents.extent(r - 1); + } + return true; +} + +/** + * @brief Whether the strides imply a f-contiguous layout. + */ +template +[[nodiscard]] auto is_f_contiguous(Extents const& extents, Strides const& strides) -> bool +{ + typename Extents::index_type stride = 1; + for (typename Extents::rank_type r = 0; r < extents.rank(); r++) { + if (stride != strides[r]) { return false; } + stride *= extents.extent(r); + } + return true; +} + +/** @} */ + /** * @brief Const accessor specialization for default_accessor * diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). 
+ * @param res raft res object for managing resources + * @return the raft::stream_view stored on the resources instance + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::STREAM_VIEW); +}; + +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). 
+ * @param[in] res raft resources object for managing resources + * @param[in] view raft stream view + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..f7e7934dbf --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; + } + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. 
This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { + } + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { +#ifndef RAFT_DISABLE_CUDA + interruptible::synchronize(base_view_); +#else + synchronize(); +#endif + } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } + } + + private: + underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } +}; + +auto static const stream_view_per_thread = stream_view{}; + +} // namespace raft diff --git a/cpp/include/raft/distance/detail/compress_to_bits.cuh b/cpp/include/raft/distance/detail/compress_to_bits.cuh index 
fa0df25461..5ffb717c42 100644 --- a/cpp/include/raft/distance/detail/compress_to_bits.cuh +++ b/cpp/include/raft/distance/detail/compress_to_bits.cuh @@ -35,7 +35,7 @@ namespace raft::distance::detail { * Note: the division (`/`) is a ceilDiv. */ template ::value>> -__global__ void compress_to_bits_kernel( +RAFT_KERNEL compress_to_bits_kernel( raft::device_matrix_view in, raft::device_matrix_view out) { diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 5e93d9e33b..a218c85a0a 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -21,6 +21,22 @@ namespace raft::distance::detail::ops { +/** + * Reserve 1 digit of precision from each floating-point type + * for round-off error tolerance. + * @tparam DataT + */ +template +__device__ constexpr DataT get_clamp_precision() +{ + switch (sizeof(DataT)) { + case 2: return 1e-3; + case 4: return 1e-6; + case 8: return 1e-15; + default: return 0; + } +} + // Epilogue operator for CUTLASS based kernel template struct l2_exp_cutlass_op { @@ -28,14 +44,16 @@ struct l2_exp_cutlass_op { __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {} __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + inline __device__ AccT operator()(DataT aNorm, DataT bNorm, DataT accVal) const noexcept { AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - // outVal could be negative due to numerical instability, especially when - // calculating self distance. - // clamp to 0 to avoid potential NaN in sqrt - outVal = outVal * (raft::abs(outVal) >= DataT(0.0001)); - return sqrt ? 
raft::sqrt(outVal) : outVal; + + /** + * For self-neighboring points, aNorm, bNorm and the dot product (accVal) should all be equal, + * but round-off in accVal can leave it only approximately equal to the two norms. + */ + outVal = outVal * !((outVal * outVal < get_clamp_precision()) * (aNorm == bNorm)); + return sqrt ? raft::sqrt(outVal * (outVal > 0)) : outVal; } __device__ AccT operator()(DataT aData) const noexcept { return aData; } @@ -86,10 +104,16 @@ struct l2_exp_distance_op { for (int i = 0; i < Policy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < Policy::AccColsPerTh; ++j) { - DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - // val could be negative due to numerical instability, especially when - // calculating self distance. Clamp to 0 to avoid potential NaN in sqrt - acc[i][j] = val * (raft::abs(val) >= DataT(0.0001)); + DataT accVal = acc[i][j]; + DataT val = regxn[i] + regyn[j] - (DataT)2.0 * accVal; + + /** + * For self-neighboring points, the two norms (regxn[i], regyn[j]) and the dot product + * (accVal) should all be equal, but round-off in accVal can leave it only approximately + * equal to the norms.
+ */ + acc[i][j] = + val * (val > 0) * !((val * val < get_clamp_precision()) * (regxn[i] == regyn[j])); } } if (sqrt) { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index f0f12acdb1..2468dcd740 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -87,7 +87,7 @@ struct MinReduceOpImpl { }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; if (tid < m) { redOp.init(min + tid, maxVal); } @@ -139,20 +139,20 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - OpT distance_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + OpT distance_op, + FinalLambda fin_op) { // compile only if below non-ampere arch. 
#if __CUDA_ARCH__ < 800 diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index f02e29c797..8d5b2c766e 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail { * @param offset */ template -__global__ void polynomial_kernel_nopad( +RAFT_KERNEL polynomial_kernel_nopad( math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; @@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad( * @param offset */ template -__global__ void polynomial_kernel( +RAFT_KERNEL polynomial_kernel( math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; @@ -75,7 +75,7 @@ __global__ void polynomial_kernel( * @param offset */ template -__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; tid += blockDim.x * gridDim.x) { @@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t * @param offset */ template -__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; tidy += blockDim.y * gridDim.y) @@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga * @param gain */ template -__global__ void rbf_kernel_expanded( +RAFT_KERNEL rbf_kernel_expanded( math_t* inout, int ld, int rows, int cols, math_t* 
norm_x, math_t* norm_y, math_t gain) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh index 0e13783c19..4de9f4764a 100644 --- a/cpp/include/raft/distance/detail/masked_nn.cuh +++ b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -40,24 +40,24 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const uint64_t* adj, - const IdxT* group_idxs, - IdxT num_groups, - IdxT m, - IdxT n, - IdxT k, - bool sqrt, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - CoreLambda core_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const uint64_t* adj, + const IdxT* group_idxs, + IdxT num_groups, + IdxT m, + IdxT n, + IdxT k, + bool sqrt, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) { extern __shared__ char smem[]; diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 2d0a98862e..5393bf7389 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -31,8 +31,8 @@ template -__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel( - OpT distance_op, pairwise_matrix_params params) +__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL + pairwise_matrix_kernel(OpT distance_op, pairwise_matrix_params params) { // Early exit to minimize the size of the kernel when it is not supposed to be compiled. 
constexpr SM_compat_t sm_compat_range{}; diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index 64d8b4bfae..6e432e050c 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -119,13 +119,13 @@ void getOvrlabels( // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type* map_ids, - size_t N_labels, - Type* in, - Type* out, - size_t N, - Lambda filter_op, - bool zero_based = false) +RAFT_KERNEL map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) { int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index f93a97d52b..166bb2122a 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,13 +32,12 @@ namespace detail { * For an additional cost we can build the graph with edges * E={(A[i], B[i]) | M[i]=1} and make this step faster */ template -__global__ void __launch_bounds__(TPB_X) - propagate_label_kernel(const value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - value_idx* __restrict__ R, - const bool* __restrict__ mask, - bool* __restrict__ m, - value_idx N) +RAFT_KERNEL __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + value_idx* __restrict__ R, + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { @@ -65,12 +64,11 @@ __global__ void __launch_bounds__(TPB_X) } template -__global__ void __launch_bounds__(TPB_X) - reassign_label_kernel(value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, - value_idx N, - value_idx MAX_LABEL) +RAFT_KERNEL __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index bf9b2bd1d8..121ac10e24 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,10 +38,10 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st } template -__global__ void add_dev_scalar_kernel(OutT* outDev, - const InT* inDev, - const InT* singleScalarDev, - IdxType len) +RAFT_KERNEL add_dev_scalar_kernel(OutT* outDev, + const InT* inDev, + const InT* singleScalarDev, + IdxType len) { IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 5b01196cf4..f3c150cbee 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -40,7 +40,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThinKernel(OutType* dots, const InType* data, IdxType D, @@ -137,15 +137,15 @@ template -__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) +RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -225,7 +225,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThickKernel(OutType* buffer, const InType* data, IdxType D, diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 0c79dec248..4ff3aa9754 100644 --- 
a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -65,7 +65,7 @@ __device__ __forceinline__ void map_kernel_mainloop( } template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +RAFT_KERNEL map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) { const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; if constexpr (R <= 1) { diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 6fae16117f..d1e211f8d2 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -52,13 +52,13 @@ template -__global__ void mapThenReduceKernel(OutType* out, - IdxType len, - OutType neutral, - MapOp map, - ReduceLambda op, - const InType* in, - Args... args) +RAFT_KERNEL mapThenReduceKernel(OutType* out, + IdxType len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) { OutType acc = neutral; auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); diff --git a/cpp/include/raft/linalg/detail/normalize.cuh b/cpp/include/raft/linalg/detail/normalize.cuh index 78c773ab35..d1ca4816e5 100644 --- a/cpp/include/raft/linalg/detail/normalize.cuh +++ b/cpp/include/raft/linalg/detail/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,7 +35,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalesced_normalize_thin_kernel(Type* out, const Type* in, IdxType D, @@ -92,15 +92,15 @@ template -__global__ void __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, - const Type* in, - IdxType D, - IdxType N, - Type init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda fin_op, - Type eps) +RAFT_KERNEL __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index a85e04acca..b726e3ea5a 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,7 +29,7 @@ namespace detail { ///@todo: specialize this to support shared-mem based atomics template -__global__ void reduce_cols_by_key_direct_kernel( +RAFT_KERNEL reduce_cols_by_key_direct_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; @@ -44,7 +44,7 @@ __global__ void reduce_cols_by_key_direct_kernel( } template -__global__ void reduce_cols_by_key_cached_kernel( +RAFT_KERNEL reduce_cols_by_key_cached_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index 572d6b738c..ce11825e12 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,7 +33,7 @@ namespace detail { // template -void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) +RAFT_KERNEL convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) { for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { dst[idx] = src[idx]; @@ -95,14 +95,14 @@ struct quadSum { template __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) - __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, - IdxT lda, - const char* d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - IdxT nkeys, - SumsT* d_sums) + RAFT_KERNEL sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + IdxT lda, + const char* d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type DataType; typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; @@ -193,15 +193,15 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, - IdxT lda, - KeysIteratorT d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - int key_offset, - IdxT nkeys, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, + IdxT lda, + KeysIteratorT d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + int key_offset, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type KeyType; typedef typename std::iterator_traits::value_type DataType; @@ -269,13 +269,13 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, - IdxT lda, - const WeightT* d_weights, - KeysIteratorT d_keys, - IdxT nrows, - IdxT ncols, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, + IdxT lda, + const WeightT* d_weights, + KeysIteratorT d_keys, + IdxT nrows, + IdxT ncols, + SumsT* d_sums) { IdxT gid = threadIdx.x + (blockDim.x * 
static_cast(blockIdx.x)); IdxT j = gid % ncols; diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..aef346bd4b 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -30,7 +30,7 @@ namespace detail { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel( +RAFT_KERNEL stridedSummationKernel( Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction @@ -68,13 +68,13 @@ template -__global__ void stridedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op) +RAFT_KERNEL stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) { // Thread reduction OutType thread_data = init; diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 6df09df8ed..6519d58fa1 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,10 +38,10 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream } template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) +RAFT_KERNEL subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) { // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index 5df7ba3cdc..652c4fda0f 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,7 +54,7 @@ struct SmemPerBlock { }; template -__global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) +RAFT_KERNEL devLayoutIdx(InType* in, int n_cols, int totalElements) { int idx = threadIdx.x + blockDim.x * blockIdx.x; int n = n_cols; @@ -63,7 +63,7 @@ __global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) } template -__global__ void devOffsetKernel(T* in, T value, int n_times) +RAFT_KERNEL devOffsetKernel(T* in, T value, int n_times) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < n_times) in[idx] = idx * value; @@ -76,12 +76,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid, InType>::type* = nullptr> -__global__ void __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { typedef cub::BlockLoad BlockLoadTypeKey; @@ -124,12 +124,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid), InType>::type* = nullptr> -__global__ void devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { // place holder function // so that compiler unrolls for all template types successfully diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 59fcf606c8..73072ec841 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -47,14 +47,14 @@ template -__global__ void gather_kernel(const InputIteratorT in, - IndexT D, - IndexT len, - 
const MapIteratorT map, - StencilIteratorT stencil, - OutputIteratorT out, - PredicateOp pred_op, - MapTransformOp transform_op) +RAFT_KERNEL gather_kernel(const InputIteratorT in, + IndexT D, + IndexT len, + const MapIteratorT map, + StencilIteratorT stencil, + OutputIteratorT out, + PredicateOp pred_op, + MapTransformOp transform_op) { typedef typename std::iterator_traits::value_type MapValueT; typedef typename std::iterator_traits::value_type StencilValueT; diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 514d0dc51b..6061fe6aee 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -260,7 +260,7 @@ template -__global__ void __launch_bounds__(BlockSize) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecColsMainKernel(Type* out, const Type* in, const IdxType arrOffset, @@ -304,15 +304,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecColsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecColsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { // Note, L::VecElems == 1 typedef Linewise L; @@ -370,14 +369,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsMainKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... 
vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -413,14 +411,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsSpanKernel(Type* out, - const Type* in, - const IdxType rowLen, - const IdxType rowLenPadded, - const IdxType lenPadded, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsSpanKernel(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType rowLenPadded, + const IdxType lenPadded, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -457,15 +454,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecRowsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecRowsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... 
vecs) { // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index d2707e1254..9e9d7f8b3b 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -331,7 +331,7 @@ void matrixVectorBinarySub(Type* data, // Computes an argmin/argmax column-wise in a DxN matrix template -__global__ void argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) +RAFT_KERNEL argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) { typedef cub:: BlockReduce, TPB, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> @@ -396,7 +396,7 @@ void argmax(const math_t* in, idx_t D, idx_t N, out_t* out, cudaStream_t stream) // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T* d_in, int D, int N) +RAFT_KERNEL signFlipKernel(T* d_in, int D, int N) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 48821df5b2..2fa741fd96 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -169,8 +169,7 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) * (1-based) */ template -__global__ void slice( - const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +RAFT_KERNEL slice(const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; @@ -211,7 +210,7 @@ void sliceMatrix(const m_t* in, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +RAFT_KERNEL getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, 
idx_t n_cols, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; @@ -239,7 +238,7 @@ void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, c * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -254,7 +253,7 @@ __global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t ld * @param k: dimensionality */ template -__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -290,7 +289,7 @@ void getDiagonalMatrix( * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +RAFT_KERNEL matrixDiagonalInverse(m_t* in, idx_t len) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index af5a5770fb..20fe1963fc 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -34,7 +34,7 @@ namespace raft::matrix::detail { // this is a subset of algorithms, chosen by running the algorithm_selection // notebook in cpp/scripts/heuristics/select_k -enum class Algo { kRadix11bits, kWarpDistributedShm, kFaissBlockSelect }; +enum class Algo { kRadix11bits, kWarpDistributedShm, kWarpImmediate, kRadix11bitsExtraPass }; /** * Predict the fastest select_k algorithm based on the number of rows/cols/k @@ -50,73 +50,29 @@ enum class Algo { kRadix11bits, kWarpDistributedShm, kFaissBlockSelect }; */ inline Algo 
choose_select_k_algorithm(size_t rows, size_t cols, int k) { - if (k > 134) { - if (k > 256) { - if (k > 809) { - return Algo::kRadix11bits; - } else { - if (rows > 124) { - if (cols > 63488) { - return Algo::kFaissBlockSelect; - } else { - return Algo::kRadix11bits; - } - } else { - return Algo::kRadix11bits; - } - } - } else { - if (cols > 678736) { - return Algo::kWarpDistributedShm; + if (k > 256) { + if (cols > 16862) { + if (rows > 1020) { + return Algo::kRadix11bitsExtraPass; } else { return Algo::kRadix11bits; } + } else { + return Algo::kRadix11bitsExtraPass; } } else { - if (cols > 13776) { - if (rows > 335) { - if (k > 1) { - if (rows > 546) { - return Algo::kWarpDistributedShm; - } else { - if (k > 17) { - return Algo::kWarpDistributedShm; - } else { - return Algo::kFaissBlockSelect; - } - } - } else { - return Algo::kFaissBlockSelect; - } + if (k > 2) { + if (cols > 22061) { + return Algo::kWarpDistributedShm; } else { - if (k > 44) { - if (cols > 1031051) { - return Algo::kWarpDistributedShm; - } else { - if (rows > 22) { - return Algo::kWarpDistributedShm; - } else { - return Algo::kRadix11bits; - } - } - } else { - return Algo::kWarpDistributedShm; - } - } - } else { - if (k > 1) { - if (rows > 188) { + if (rows > 198) { return Algo::kWarpDistributedShm; } else { - if (k > 72) { - return Algo::kRadix11bits; - } else { - return Algo::kWarpDistributedShm; - } + return Algo::kWarpImmediate; } - } else { - return Algo::kFaissBlockSelect; } + } else { + return Algo::kWarpImmediate; } } } @@ -294,6 +250,8 @@ void select_k(raft::resources const& handle, switch (algo) { case Algo::kRadix11bits: + case Algo::kRadix11bitsExtraPass: { + bool fused_last_filter = algo == Algo::kRadix11bits; detail::select::radix::select_k(in_val, in_idx, batch_size, @@ -302,7 +260,7 @@ void select_k(raft::resources const& handle, out_val, out_idx, select_min, - true, // fused_last_filter + fused_last_filter, stream, mr); @@ -324,13 +282,15 @@ void select_k(raft::resources const& 
handle, handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min); } return; + } case Algo::kWarpDistributedShm: return detail::select::warpsort:: select_k_impl( in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); - case Algo::kFaissBlockSelect: - return neighbors::detail::select_k( - in_val, in_idx, batch_size, len, out_val, out_idx, select_min, k, stream); + case Algo::kWarpImmediate: + return detail::select::warpsort:: + select_k_impl( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); default: RAFT_FAIL("K-selection Algorithm not supported."); } } diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index edde924892..fa12005df2 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -103,15 +104,27 @@ _RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool select_min) return (twiddle_in(x, select_min) >> start_bit) & mask; } -template +// Strangely, RATIO_T has a strong impact on register usage and occupancy for sm80, e.g. +// using RATIO_T=unsigned for radix_kernel decreases occupancy (with CUDA 12). +// In the meanwhile, RATIO_T has no impact for sm90. +template _RAFT_HOST_DEVICE IdxT calc_buf_len(IdxT len) { // When writing is skipped, only read `in`(type T). // When writing is not skipped, read `in_buf`(T) and `in_idx_buf`(IdxT), and write `out_buf`(T) // and `out_idx_buf`(IdxT). // The ratio between these cases determines whether to skip writing and hence the buffer size. 
- constexpr float ratio = 2 + sizeof(IdxT) * 2.0 / sizeof(T); - return len / ratio; + constexpr RATIO_T ratio = 2 + sizeof(IdxT) * 2 / sizeof(T); + // Even such estimation is too conservative, so further decrease buf_len by 1/8 + IdxT buf_len = len / (ratio * 8); + + // one-block kernel splits one large buffer into smaller ones, so round buf size to 256 bytes to + // avoid alignment issues + static_assert(is_a_power_of_two(sizeof(T))); + static_assert(is_a_power_of_two(sizeof(IdxT))); + constexpr IdxT aligned = 256 / std::min(sizeof(T), sizeof(IdxT)); + buf_len = Pow2::roundDown(buf_len); + return buf_len; } /** @@ -208,6 +221,11 @@ struct alignas(128) Counter { /** * Fused filtering of the current pass and building histogram for the next pass * (see steps 4 & 1 in `radix_kernel` description). + * + * This function is more complicated than the one-block counterpart because this function handles + * the case of early stopping. When early stopping is triggered, it's desirable to do the final + * filtering in this function rather than in last_filter(), because this function is run by multiple + * blocks while last_filter is run by a single block. */ template _RAFT_DEVICE void filter_and_histogram(const T* in_buf, @@ -397,7 +415,7 @@ _RAFT_DEVICE void last_filter(const T* in_buf, const int start_bit = calc_start_bit(pass); // changed in choose_bucket(); need to reload - const IdxT needed_num_of_kth = counter->k; + const IdxT num_of_kth_needed = counter->k; IdxT* p_out_cnt = &counter->out_cnt; IdxT* p_out_back_cnt = &counter->out_back_cnt; for (IdxT i = threadIdx.x; i < current_len; i += blockDim.x) { @@ -412,7 +430,7 @@ _RAFT_DEVICE void last_filter(const T* in_buf, out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; } else if (bits == kth_value_bits) { IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast(1)); - if (back_pos < needed_num_of_kth) { + if (back_pos < num_of_kth_needed) { IdxT pos = k - 1 - back_pos; out[pos] = value; out_idx[pos] = in_idx_buf ? 
in_idx_buf[i] : i; @@ -422,16 +440,16 @@ _RAFT_DEVICE void last_filter(const T* in_buf, } template -__global__ void last_filter_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out, - IdxT* out_idx, - IdxT len, - IdxT k, - Counter* counters, - const bool select_min) +RAFT_KERNEL last_filter_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out, + IdxT* out_idx, + const IdxT len, + const IdxT k, + Counter* counters, + const bool select_min) { const size_t batch_id = blockIdx.y; // size_t to avoid multiplication overflow @@ -454,14 +472,14 @@ __global__ void last_filter_kernel(const T* in, constexpr int start_bit = calc_start_bit(pass); const auto kth_value_bits = counter->kth_value_bits; - const IdxT needed_num_of_kth = counter->k; + const IdxT num_of_kth_needed = counter->k; IdxT* p_out_cnt = &counter->out_cnt; IdxT* p_out_back_cnt = &counter->out_back_cnt; auto f = [k, select_min, kth_value_bits, - needed_num_of_kth, + num_of_kth_needed, p_out_cnt, p_out_back_cnt, in_idx_buf, @@ -474,7 +492,7 @@ __global__ void last_filter_kernel(const T* in, out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; } else if (bits == kth_value_bits) { IdxT back_pos = atomicAdd(p_out_back_cnt, static_cast(1)); - if (back_pos < needed_num_of_kth) { + if (back_pos < num_of_kth_needed) { IdxT pos = k - 1 - back_pos; out[pos] = value; out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; @@ -525,20 +543,20 @@ __global__ void last_filter_kernel(const T* in, * their indices. 
*/ template -__global__ void radix_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - Counter* counters, - IdxT* histograms, - const IdxT len, - const IdxT k, - const bool select_min, - const int pass) +RAFT_KERNEL radix_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counters, + IdxT* histograms, + const IdxT len, + const IdxT k, + const bool select_min, + const int pass) { const size_t batch_id = blockIdx.y; auto counter = counters + batch_id; @@ -657,16 +675,35 @@ __global__ void radix_kernel(const T* in, } template -int calc_chunk_size(int batch_size, IdxT len, int sm_cnt, Kernel kernel) +int calc_chunk_size(int batch_size, IdxT len, int sm_cnt, Kernel kernel, bool one_block) { int active_blocks; RAFT_CUDA_TRY( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active_blocks, kernel, BlockSize, 0)); - constexpr int items_per_thread = 32; - constexpr int num_waves = 10; - int chunk_size = - std::max(1, num_waves * sm_cnt * active_blocks * BlockSize * items_per_thread / len); + // The chunk size is chosen so that there is enough workload to fully utilize GPU. + // One full wave contains (sm_cnt * active_blocks) blocks, and 10 waves is an empirically safe + // estimation of enough workload. It also counteracts imbalance if some blocks run slower + // than others. + constexpr int num_waves = 10; + int chunk_size; + if (one_block) { + // For one-block version, one block processes one instance in the chunk. Just ensure that there + // are enough blocks. + chunk_size = num_waves * sm_cnt * active_blocks; + } else { + // One instance in the chunk contains len items and is processed by multiple blocks. + // The total number of items in a chunk (chunk_size * len) should be large enough that every + // thread has enough items to processes. 
So set it to num_waves * "max num of active threads" + // (sm_cnt * active_blocks * BlockSize) * items_per_thread. + // + // Also, the upper bound of the total number of items in a chunk is: + // 10 (num_waves) * ~100 (sm_cnt) * 2048 (active_blocks*BlockSize) * 32 (items_per_thread) =64M. + // So temporary buffer size required for one chunk won't be too large. + constexpr int items_per_thread = 32; + chunk_size = + std::max(1, num_waves * sm_cnt * active_blocks * BlockSize * items_per_thread / len); + } return std::min(chunk_size, batch_size); } @@ -709,17 +746,17 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) } template -_RAFT_HOST_DEVICE void set_buf_pointers(const T* in, - const IdxT* in_idx, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2, - int pass, - const T*& in_buf, - const IdxT*& in_idx_buf, - T*& out_buf, - IdxT*& out_idx_buf) +_RAFT_HOST void set_buf_pointers(const T* in, + const IdxT* in_idx, + T* buf1, + IdxT* idx_buf1, + T* buf2, + IdxT* idx_buf2, + int pass, + const T*& in_buf, + const IdxT*& in_idx_buf, + T*& out_buf, + IdxT*& out_idx_buf) { if (pass == 0) { in_buf = in; @@ -744,6 +781,41 @@ _RAFT_HOST_DEVICE void set_buf_pointers(const T* in, } } +template +_RAFT_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + char* bufs, + IdxT buf_len, + int pass, + const T*& in_buf, + const IdxT*& in_idx_buf, + T*& out_buf, + IdxT*& out_idx_buf) +{ + // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 + if (pass == 0) { + in_buf = in; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + in_idx_buf = in_idx; + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } else if (pass % 2 == 0) { + in_buf = reinterpret_cast(bufs); + in_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + out_buf = const_cast(in_buf + buf_len); + out_idx_buf = const_cast(in_idx_buf + buf_len); + } else { + 
out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + in_buf = out_buf + buf_len; + in_idx_buf = out_idx_buf + buf_len; + } +} + template void radix_topk(const T* in, const IdxT* in_idx, @@ -765,7 +837,7 @@ void radix_topk(const T* in, auto kernel = radix_kernel; const size_t max_chunk_size = - calc_chunk_size(batch_size, len, sm_cnt, kernel); + calc_chunk_size(batch_size, len, sm_cnt, kernel, false); if (max_chunk_size != static_cast(batch_size)) { grid_dim = calc_grid_dim(max_chunk_size, len, sm_cnt); } @@ -793,6 +865,7 @@ void radix_topk(const T* in, RAFT_CUDA_TRY( cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); + auto kernel = radix_kernel; const T* chunk_in = in + offset * len; const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr; @@ -866,6 +939,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, IdxT* out_idx_buf, T* out, IdxT* out_idx, + const IdxT previous_len, Counter* counter, IdxT* histogram, bool select_min, @@ -879,9 +953,8 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, if (threadIdx.x == 0) { *p_filter_cnt = 0; } __syncthreads(); - const int start_bit = calc_start_bit(pass); - const unsigned mask = calc_mask(pass); - const IdxT previous_len = counter->previous_len; + const int start_bit = calc_start_bit(pass); + const unsigned mask = calc_mask(pass); if (pass == 0) { auto f = [histogram, select_min, start_bit, mask](T value, IdxT) { @@ -889,6 +962,20 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, atomicAdd(histogram + bucket, static_cast(1)); }; vectorized_process(threadIdx.x, blockDim.x, in_buf, previous_len, f); + } else if (!out_buf) { + // not use vectorized_process here because it increases #registers a lot + const auto kth_value_bits = counter->kth_value_bits; + const int 
previous_start_bit = calc_start_bit(pass - 1); + + for (IdxT i = threadIdx.x; i < previous_len; i += blockDim.x) { + const T value = in_buf[i]; + const auto previous_bits = (twiddle_in(value, select_min) >> previous_start_bit) + << previous_start_bit; + if (previous_bits == kth_value_bits) { + int bucket = calc_bucket(value, start_bit, mask, select_min); + atomicAdd(histogram + bucket, static_cast(1)); + } + } } else { // not use vectorized_process here because it increases #registers a lot IdxT* p_out_cnt = &counter->out_cnt; @@ -920,17 +1007,14 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, } template -__global__ void radix_topk_one_block_kernel(const T* in, - const IdxT* in_idx, - const IdxT len, - const IdxT k, - T* out, - IdxT* out_idx, - const bool select_min, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2) +RAFT_KERNEL radix_topk_one_block_kernel(const T* in, + const IdxT* in_idx, + const IdxT len, + const IdxT k, + T* out, + IdxT* out_idx, + const bool select_min, + char* bufs) { constexpr int num_buckets = calc_num_buckets(); __shared__ Counter counter; @@ -951,22 +1035,30 @@ __global__ void radix_topk_one_block_kernel(const T* in, if (in_idx) { in_idx += batch_id * len; } out += batch_id * k; out_idx += batch_id * k; - buf1 += batch_id * len; - idx_buf1 += batch_id * len; - buf2 += batch_id * len; - idx_buf2 += batch_id * len; - const T* in_buf = nullptr; - const IdxT* in_idx_buf = nullptr; - T* out_buf = nullptr; - IdxT* out_idx_buf = nullptr; + const IdxT buf_len = calc_buf_len(len); + bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); constexpr int num_passes = calc_num_passes(); for (int pass = 0; pass < num_passes; ++pass) { - set_buf_pointers( - in, in_idx, buf1, idx_buf1, buf2, idx_buf2, pass, in_buf, in_idx_buf, out_buf, out_idx_buf); - - IdxT current_len = counter.len; - IdxT current_k = counter.k; + const T* in_buf; + const IdxT* in_idx_buf; + T* out_buf; + IdxT* out_idx_buf; + set_buf_pointers(in, 
in_idx, bufs, buf_len, pass, in_buf, in_idx_buf, out_buf, out_idx_buf); + + const IdxT current_len = counter.len; + const IdxT current_k = counter.k; + IdxT previous_len = counter.previous_len; + if (previous_len > buf_len) { + in_buf = in; + in_idx_buf = in_idx; + previous_len = len; + } + if (current_len > buf_len) { + // so "out_buf==nullptr" denotes skipping writing buffer in current pass + out_buf = nullptr; + out_idx_buf = nullptr; + } filter_and_histogram_for_one_block(in_buf, in_idx_buf, @@ -974,6 +1066,7 @@ __global__ void radix_topk_one_block_kernel(const T* in, out_idx_buf, out, out_idx, + previous_len, &counter, histogram, select_min, @@ -988,11 +1081,11 @@ __global__ void radix_topk_one_block_kernel(const T* in, __syncthreads(); if (counter.len == counter.k || pass == num_passes - 1) { - last_filter(pass == 0 ? in : out_buf, - pass == 0 ? in_idx : out_idx_buf, + last_filter(out_buf ? out_buf : in, + out_buf ? out_idx_buf : in_idx, out, out_idx, - current_len, + out_buf ? current_len : len, k, &counter, select_min, @@ -1022,21 +1115,17 @@ void radix_topk_one_block(const T* in, { static_assert(calc_num_passes() > 1); - auto kernel = radix_topk_one_block_kernel; + auto kernel = radix_topk_one_block_kernel; + const IdxT buf_len = calc_buf_len(len); const size_t max_chunk_size = - calc_chunk_size(batch_size, len, sm_cnt, kernel); + calc_chunk_size(batch_size, len, sm_cnt, kernel, true); auto pool_guard = - raft::get_pool_memory_resource(mr, - max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)) + - 256 * 4 // might need extra memory for alignment - ); + raft::get_pool_memory_resource(mr, max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT))); if (pool_guard) { RAFT_LOG_DEBUG("radix::select_k: using pool memory resource"); } - rmm::device_uvector buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector buf2(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf2(len * 
max_chunk_size, stream, mr); + rmm::device_uvector bufs( + max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); @@ -1047,10 +1136,7 @@ void radix_topk_one_block(const T* in, out + offset * k, out_idx + offset * k, select_min, - buf1.data(), - idx_buf1.data(), - buf2.data(), - idx_buf2.data()); + bufs.data()); } } diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 2927604e7d..0ee87de4f7 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -56,7 +56,7 @@ the top-k result. Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { block_sort queue(...); for (IdxT i = threadIdx.x; i < len, i += blockDim.x) { @@ -80,7 +80,7 @@ (see the usage of LaunchThreshold::len_factor_for_choosing). Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { warp_sort_immediate<...> queue(...); int warp_id = threadIdx.x / WarpSize; int lane_id = threadIdx.x % WarpSize; @@ -750,8 +750,8 @@ template