diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 3cb91fc0f8232..63d508c606c8e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,33 +1,4 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file -# This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -CMakeLists.txt @tlrmchlsmth - -# vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic - -# Test ownership -/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo -/tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo -/tests/models @DarkLight1337 @ywang96 -/tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu -/tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon -/tests/quantization @mgoin @robertgshaw2-neuralmagic -/.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/distributed/test_multi_node_assignment.py @youkaichao -/tests/distributed/test_pipeline_parallel.py @youkaichao -/tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-neuralmagic @comaniac -/tests/weight_loading @mgoin @youkaichao -/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac +* @kzawora-intel @madamczykhabana @michalkuligowski @mgawarkiewicz @vivekgoe @afierka-intel diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml new file mode 100644 index 0000000000000..eafb7a5447f4a --- /dev/null +++ b/.github/actionlint.yaml @@ -0,0 +1,10 @@ +self-hosted-runner: + # Labels of self-hosted runner in array of strings. 
+ labels: + - generic-runner +paths: + .github/workflows/trigger_jenkins.yml: + ignore: + - shellcheck reported issue in this script: SC2116:.+ + - shellcheck reported issue in this script: SC2086:.+ + - shellcheck reported issue in this script: SC2001:.+ \ No newline at end of file diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 0226cf0ca00e9..d139f625d98ab 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -2,14 +2,14 @@ name: Lint GitHub Actions workflows on: push: branches: - - "main" + - "habana_main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' - '.github/workflows/matchers/actionlint.json' pull_request: branches: - - "main" + - "habana_main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 68149d2dc019f..2a4655b9cee05 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -2,10 +2,10 @@ name: clang-format on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main paths: - '**/*.h' - '**/*.cpp' @@ -14,7 +14,7 @@ on: - '.github/workflows/clang-format.yml' pull_request: branches: - - main + - habana_main paths: - '**/*.h' - '**/*.cpp' diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 68887adaae54b..72e732d878e61 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -5,7 +5,7 @@ on: # but only for the main branch push: branches: - - main + - habana_main paths: - "**/*.py" - "**/*.md" @@ -15,7 +15,7 @@ on: - .github/workflows/codespell.yml pull_request: branches: - - main + - habana_main paths: - "**/*.py" - "**/*.md" diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml new file mode 100644 index 0000000000000..b900239463323 --- /dev/null +++ b/.github/workflows/cpu-test.yml @@ -0,0 +1,35 @@ +name: cpu-test + +on: + # Trigger the workflow on push or pull request, + # but only for the habana_main branch + push: + branches: + - habana_main + pull_request: + branches: + - habana_main + + +jobs: + cputest: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r requirements-build.txt + pip install -r requirements-hpu.txt + VLLM_TARGET_DEVICE=hpu python setup.py develop + - name: cpu-test + run: | + VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml index 2f5ee8bbfd8c5..2a156f627196e 100644 --- a/.github/workflows/doc-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -3,12 +3,12 @@ name: Lint documentation on: push: branches: - - main + - habana_main paths: - "docs/**" pull_request: branches: - - main + - habana_main paths: - "docs/**" diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml deleted file mode 100644 index 556b60d2fca12..0000000000000 --- a/.github/workflows/lint-and-deploy.yaml +++ /dev/null @@ 
-1,82 +0,0 @@ -name: Lint and Deploy Charts - -on: pull_request - -jobs: - lint-and-deploy: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: Set up Helm - uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0 - with: - version: v3.14.4 - - #Python is required because ct lint runs Yamale and yamllint which require Python. - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: '3.13' - - - name: Set up chart-testing - uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1 - with: - version: v3.10.1 - - - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - - - name: Setup minio - run: | - docker network create vllm-net - docker run -d -p 9000:9000 --name minio --net vllm-net \ - -e "MINIO_ACCESS_KEY=minioadmin" \ - -e "MINIO_SECRET_KEY=minioadmin" \ - -v /tmp/data:/data \ - -v /tmp/config:/root/.minio \ - minio/minio server /data - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - export AWS_EC2_METADATA_DISABLED=true - mkdir opt-125m - cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd .. - aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket - aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive - - - name: Create kind cluster - uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0 - - - name: Build the Docker image vllm cpu - run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . 
- - - name: Configuration of docker images, network and namespace for the kind cluster - run: | - docker pull amazon/aws-cli:2.6.4 - kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing - kind load docker-image vllm-cpu-env:latest --name chart-testing - docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")" - kubectl create ns ns-vllm - - - name: Run chart-testing (install) - run: | - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - - - name: curl test - run: | - kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & - sleep 10 - CODE="$(curl -v -f --location http://localhost:8001/v1/completions \ - --header "Content-Type: application/json" \ - --data '{ - "model": "opt-125m", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }'):$CODE" - echo "$CODE" \ No newline at end of file diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 73eeacf1fa562..f436e0d8336d4 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -2,10 +2,10 @@ name: mypy on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main paths: - '**/*.py' - '.github/workflows/mypy.yaml' @@ -13,7 +13,7 @@ on: - 'pyproject.toml' pull_request: branches: - - main + - habana_main # This workflow is only relevant when one of the following files changes. # However, we have github configured to expect and require this workflow # to run and pass before github with auto-merge a pull request. 
Until github diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml index 4932af943a07b..140cb5e050a6a 100644 --- a/.github/workflows/png-lint.yml +++ b/.github/workflows/png-lint.yml @@ -2,13 +2,13 @@ name: Lint PNG exports from excalidraw on: push: branches: - - "main" + - "habana_main" paths: - '*.excalidraw.png' - '.github/workflows/png-lint.yml' pull_request: branches: - - "main" + - "habana_main" paths: - '*.excalidraw.png' - '.github/workflows/png-lint.yml' diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index df62539c0b3d9..0000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: 'šŸ‘‹ Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\nšŸš€' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 7266cc378cfb0..42385ef947502 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,10 +2,10 @@ name: ruff on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main paths: - "**/*.py" - pyproject.toml @@ -14,7 +14,7 @@ on: - .github/workflows/ruff.yml pull_request: branches: - - main + - habana_main # This workflow is only relevant when one of the following files changes. # However, we have github configured to expect and require this workflow # to run and pass before github with auto-merge a pull request. Until github diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 0000000000000..c610f06360d1f --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '20 13 * * 0' + push: + branches: [ "habana_main" ] + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: false + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). 
+ # Commenting out will disable upload of results to your repo's Code Scanning dashboard + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: results.sarif diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml index 4b1587e373e17..f6931150c795d 100644 --- a/.github/workflows/shellcheck.yml +++ b/.github/workflows/shellcheck.yml @@ -2,13 +2,13 @@ name: Lint shell scripts on: push: branches: - - "main" + - "habana_main" paths: - '**/*.sh' - '.github/workflows/shellcheck.yml' pull_request: branches: - - "main" + - "habana_main" paths: - '**/*.sh' - '.github/workflows/shellcheck.yml' diff --git a/.github/workflows/trigger_jenkins.yml b/.github/workflows/trigger_jenkins.yml new file mode 100644 index 0000000000000..6a8e2f6bed1ca --- /dev/null +++ b/.github/workflows/trigger_jenkins.yml @@ -0,0 +1,113 @@ +name: Trigger Jenkins Tests +on: + pull_request: + types: [opened, reopened, edited, synchronize] + +permissions: + pull-requests: write +jobs: + DependencyReview: + name: Dependency Review + runs-on: ubuntu-latest + steps: + - name: 'Checkout Repository' + uses: actions/checkout@v4 + - name: 'Dependency Review' + uses: actions/dependency-review-action@v4 + with: + fail-on-severity: high + CodeQLScan: + name: CodeQL Scan + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: python + build-mode: none + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:python" + upload: "never" + CalculateJobs: + runs-on: generic-runner + name: Calculate Tests To Trigger + needs: [DependencyReview,CodeQLScan] + outputs: + tests_list: ${{ steps.tests.outputs.tests_list }} + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install YQ + run: | + wget https://github.com/mikefarah/yq/releases/download/v4.14.1/yq_linux_amd64.tar.gz -O - |\ + tar xz && sudo mv yq_linux_amd64 /usr/bin/yq + - name: Calculate Tests + id: tests + run: | + test_list=$(yq -oj e .jenkins/test_config.yaml | jq -c "[.stages[].steps[]]") + echo "tests_list=${test_list}" >> "$GITHUB_OUTPUT" + TestRun: + name: Test / ${{matrix.tests.name}} + needs: [CalculateJobs] + runs-on: generic-runner + strategy: + fail-fast: false + matrix: + tests: ${{ fromJson(needs.CalculateJobs.outputs.tests_list) }} + env: + USERNAME: ${{ secrets.SWUSERNAME }} + PASSWORD: ${{ secrets.SWPASSWORD }} + POD_TEMPLATE: ${{ secrets.POD_TEMPLATE }} + TEST_COMMAND: ${{ matrix.tests.command }} + steps: + - name: Download Hlctl + run: | + curl --show-error --silent ${{ secrets.HLCTL_ADDRESS }} | bash &> /dev/null + - name: Config Hlctl + run: | + ${{ secrets.HLCTL_COMMAND }} &> /dev/null + - name: Create Pod Template + env: + TARGET_BRANCH: ${{ github.base_ref }} + RELEASED_SYNAPSE_VERSION: ${{ vars.RELEASED_SYNAPSE_VERSION }} + BASE_BRANCH: ${{github.head_ref}} + run: | + if [[ $TARGET_BRANCH == "habana_main" ]]; then + synapse_version=${RELEASED_SYNAPSE_VERSION#v} + elif [[ $TARGET_BRANCH =~ v*.*.* ]]; then + synapse_version=${TARGET_BRANCH#v} + else + echo "Cant Calculate Synapse Version, Failing The Test" + exit 1 + fi + synapse_build=$(curl "https://dms.habana-labs.com/api/v1.1/branch/info/v$synapse_version" | jq -r ".release_id") + pt_version=${{ vars.PT_VERSION }} + BUILD_TAG="Github-vLLM-Fork-${{ github.event.number }}-${{github.run_number}}" + safe_cmd=${TEST_COMMAND//&/\\&} + echo "Writing Pod Template To 
File" + echo "${POD_TEMPLATE}" > pod.yml + sed -i "s/##VERSION##/${synapse_version}/g" pod.yml + sed -i "s/##BUILD##/${synapse_build}/g" pod.yml + sed -i "s/##BUILD_TAG##/${BUILD_TAG}/g" pod.yml + sed -i "s/##PYTORCH_VERSION##/${pt_version}/g" pod.yml + sed -i "s|##GIT_BRANCH##|$BASE_BRANCH|g" pod.yml + sed -i "s|##CMD##|$safe_cmd|g" pod.yml + echo "Pod Template Created" + - name: Run Test + run: | + converted_test_name=$(echo ${{ matrix.tests.name }} | tr "_" "-") + if [[ ${#converted_test_name} -ge 33 ]];then + converted_test_name=${converted_test_name:12} + fi + hlctl create containers \ + --file=pod.yml \ + --flavor=${{ matrix.tests.flavor}} \ + --name="vllm-fork-${{github.event.number}}-${converted_test_name}" \ + --namespace="framework" \ + --priority="high" \ + --retry \ + --shm=10240 \ No newline at end of file diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index ff441f94435ad..554150da97c02 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -2,16 +2,16 @@ name: yapf on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main paths: - "**/*.py" - .github/workflows/yapf.yml pull_request: branches: - - main + - habana_main paths: - "**/*.py" - .github/workflows/yapf.yml diff --git a/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml b/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml new file mode 100644 index 0000000000000..da048ba19305f --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml @@ -0,0 +1,14 @@ +# These scores were chosen to place within 6% range of values achieved using vLLM on HPU: +# 0.148 - 0.164 +# where on https://www.llama.com/llama2/: 0.146 is given +model_name: "/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.155 + - name: "exact_match,flexible-extract" + value: 0.155 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..38965c6197c55 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..9fe7d634b887b --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml 
b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml new file mode 100644 index 0000000000000..5c1cd657e8e36 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml @@ -0,0 +1,16 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.664 + - name: "exact_match,flexible-extract" + value: 0.676 +limit: 250 +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true +fp8: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml new file mode 100644 index 0000000000000..ff787f1085cba --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-mss.yaml @@ -0,0 +1,16 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true +num_scheduler_steps: 10 \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml new file mode 100644 index 0000000000000..e2458a8ea4f1c --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,15 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-fp8.txt b/.jenkins/lm-eval-harness/configs/models-fp8.txt new file mode 100644 index 0000000000000..8a318a9ec936d --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-fp8.txt @@ -0,0 +1 @@ +Meta-Llama-3.1-8B-Instruct-fp8.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-large.txt b/.jenkins/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..ca2548d1234a8 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-large.txt @@ -0,0 +1 @@ +Meta-Llama-3-70B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-llama2.txt b/.jenkins/lm-eval-harness/configs/models-llama2.txt new file mode 100644 index 0000000000000..7ae5af4cce4d3 --- 
/dev/null +++ b/.jenkins/lm-eval-harness/configs/models-llama2.txt @@ -0,0 +1 @@ +Llama-2-7B-hf.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-mss.txt b/.jenkins/lm-eval-harness/configs/models-mss.txt new file mode 100644 index 0000000000000..cfcc3d42d108f --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-mss.txt @@ -0,0 +1 @@ +Meta-Llama-3.1-8B-Instruct-mss.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-small.txt b/.jenkins/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..d8ae241e58ad3 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3.1-8B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/inc_unit_scales_config.json b/.jenkins/lm-eval-harness/inc_unit_scales_config.json new file mode 100644 index 0000000000000..cd6589c811417 --- /dev/null +++ b/.jenkins/lm-eval-harness/inc_unit_scales_config.json @@ -0,0 +1,16 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "unit_scale", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "dump_stats_path": "" +} \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..2816a8334a8c0 --- /dev/null +++ b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.3 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096,dtype=bfloat16" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..179b75e1137f7 --- /dev/null +++ b/.jenkins/lm-eval-harness/run-tests.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true + export VLLM_SKIP_WARMUP=true + export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]" + RANDOM_SUFFIX=$(tr -dc A-Za-z0-9

-| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack | +| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

--- +> [!NOTE] +> For Intel Gaudi-specific setup instructions and examples, please refer to the [Intel® Gaudi® README](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md). For Jupyter notebook-based quickstart tutorials, refer to [Getting Started with vLLM](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Getting_Started_with_vLLM/Getting_Started_with_vLLM.ipynb) and [Understanding vLLM on Gaudi](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Understanding_vLLM_on_Gaudi/Understanding_vLLM_on_Gaudi.ipynb). The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) @@ -28,6 +30,7 @@ The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Goog - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). +- [2024/05] vLLM-fork specific: Added Intel® Gaudi® 2 support with SynapseAI 1.16.0. For more information, please refer to the Intel® Gaudi® README. - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). diff --git a/README_GAUDI.md b/README_GAUDI.md new file mode 100644 index 0000000000000..74d742e815df5 --- /dev/null +++ b/README_GAUDI.md @@ -0,0 +1,401 @@ +# vLLM with Intel® Gaudi® AI Accelerators + +This README provides instructions on how to run vLLM with Intel Gaudi devices. + +# Requirements and Installation + +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. +To achieve the best performance, please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +## Requirements + +- Ubuntu 22.04 LTS OS +- Python 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.19.0 and above + +## Quick Start Using Dockerfile +Set up the container with the latest release of the Gaudi Software Suite using the Dockerfile: + +``` +$ docker build -f Dockerfile.hpu -t vllm-hpu-env .
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +> [!TIP] +> If you are facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to the "Install Optional Packages" section of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and the "Configure Container Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime). Make sure you have the ``habanalabs-container-runtime`` package installed and that the ``habana`` container runtime is registered. + + +## Build from Source + +### Environment Verification +To verify that the Intel Gaudi software was correctly installed, run the following: + +```{.console} +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural-compressor is installed +``` + +Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details. + +### Run Docker Image + +It is highly recommended to use the latest Docker image from Intel Gaudi vault. +Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. + +Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html): + +```{.console} +$ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +``` + +### Build and Install vLLM + +Currently, there are multiple ways to install vLLM with Intel® Gaudi®. Pick **one** of the following options: + +#### 1. Build and Install the stable version + +vLLM releases are performed periodically to align with Intel® Gaudi® software releases. The stable version is released with a tag, and supports fully validated features and performance optimizations in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork). To install the stable release from [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + +```{.console} +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout v0.6.4.post2+Gaudi-1.19.0 +$ pip install -r requirements-hpu.txt +$ python setup.py develop +``` + +#### 2. Build and Install the latest from vLLM-fork + +Currently, the latest features and performance optimizations are being developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and are periodically upstreamed to the vLLM main repository.
+To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + +```{.console} +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout habana_main +$ pip install -r requirements-hpu.txt +$ python setup.py develop +``` + +#### 3. Build and Install from vLLM main source + +If you prefer to build and install directly from the main vLLM source, where we periodically upstream new features, run the following: + +```{.console} +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -r requirements-hpu.txt +$ python setup.py develop +``` + +# Supported Features +| **Feature** | **Description** | **References** | +|--- |--- |--- | +| Offline batched inference | Offline inference using the LLM class from the vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | +| Online inference via OpenAI-Compatible Server | Online inference using an HTTP server that implements the OpenAI Chat and Completions APIs | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | +| HPU autodetection | HPU users do not need to specify the target platform; it is detected automatically upon vLLM startup | N/A | +| Paged KV cache with algorithms enabled for Intel Gaudi accelerators | vLLM HPU backend contains custom Paged Attention and cache operator implementations optimized for Gaudi devices. | N/A | +| Custom Intel Gaudi operator implementations | vLLM HPU backend provides optimized implementations of operators such as prefill attention, Root Mean Square Layer Normalization, and Rotary Positional Encoding. | N/A | +| Tensor parallel inference (single-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across a single node using tensor parallelism with Ray and HCCL. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)
[HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) | +| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time, to be later replayed during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
[vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes)
[Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) | +| Inference with torch.compile (experimental) | vLLM HPU backend experimentally supports inference with torch.compile. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) | +| Attention with Linear Biases (ALiBi) | vLLM HPU backend supports models utilizing Attention with Linear Biases (ALiBi) such as mpt-7b. | [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | +| INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) | +| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html)
[vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | +| Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable via the standard `--num-scheduler-steps` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) | +| Automatic prefix caching (experimental) | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable via the standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html)
[Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | +| Speculative decoding (experimental) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via the standard `--speculative_model` and `--num_speculative_tokens` parameters. | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference_mlpspeculator.html) | + +# Unsupported Features + +- Beam search +- AWQ quantization +- Prefill chunking (mixed-batch inferencing) + +# Supported Configurations + +The following configurations have been validated to function with Gaudi2 devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling + +# Performance Tuning + +## Execution Modes + +vLLM for HPU currently supports four execution modes, determined by the selected HPU PyTorch Bridge backend (set via the `PT_HPU_LAZY_MODE` environment variable) and the `--enforce-eager` flag.
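For illustration, a minimal launch sketch of the four combinations (assuming the standard `vllm serve` entrypoint and one of the models validated above; adjust the model path and engine flags for your deployment):

```{.console}
# HPU Graphs (recommended): lazy bridge with graph capture enabled
$ PT_HPU_LAZY_MODE=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct

# PyTorch lazy mode: lazy bridge with graph capture disabled
$ PT_HPU_LAZY_MODE=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct --enforce-eager

# torch.compile mode (experimental)
$ PT_HPU_LAZY_MODE=0 vllm serve meta-llama/Meta-Llama-3-8B-Instruct

# PyTorch eager mode (experimental)
$ PT_HPU_LAZY_MODE=0 vllm serve meta-llama/Meta-Llama-3-8B-Instruct --enforce-eager
```

The table below summarizes these combinations.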
+ +| `PT_HPU_LAZY_MODE` | `enforce_eager` | Execution Mode | +| ------------------ | --------------- | ------------------ | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | + +> [!WARNING] +> All modes using PT_HPU_LAZY_MODE=0 are experimental and should only be used for validating functional correctness. To achieve the best performance, use HPU Graphs or PyTorch Lazy Mode. Performance improvements are planned for future releases. + +## Bucketing Mechanism + +Intel Gaudi accelerators perform best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) +generates optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be highly dependent on input and output tensor shapes, requiring graph recompilation +when encountering tensors with different shapes within the same topology. While these binaries efficiently utilize Gaudi, the compilation process itself can introduce noticeable overhead in end-to-end execution. +In dynamic inference serving scenarios, it is important to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently, this is achieved by +"bucketing" the model's forward pass across two dimensions: `batch_size` and `sequence_length`. + +> [!NOTE] +> Bucketing helps significantly reduce the number of required graphs, but it does not handle graph compilation or device code generation. These tasks are performed during the warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters +can be observed in logs during vLLM startup: + +```{.} +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` + +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` +has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. 
We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, +while allowing larger padding on larger batch sizes. + +**Example with ramp-up** + +```{.} +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +**Example without ramp-up** + +```{.} +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor +shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +> [!WARNING] +> If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. + The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +For example, if a request with 3 sequences, each having a maximum sequence length of 412, is sent to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket. This is because the `batch_size` +(number of sequences) will be padded to 4 (the nearest batch size dimension higher than 3), and the maximum sequence length will be padded to 512 (the nearest sequence length dimension higher than 412). After the +prefill stage, it will be executed as a `(4, 512)` decode bucket and will remain in this bucket until either the batch dimension changes (e.g., due to a request being completed), in which case it will become +a `(2, 512)` bucket, or the context length increases beyond 512 tokens, at which point it will become a `(4, 640)` bucket. + +> [!NOTE] +> Bucketing is transparent to the user ā€“ padding in the sequence length dimension is never returned, and padding in the batch dimension does not create new requests. + +## Warmup + +Warmup is an optional but highly recommended step that occurs before the vLLM server starts listening. It executes a forward pass for each bucket using dummy data. The goal is to pre-compile all graphs +and avoid any graph compilation overhead within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup. + +This example uses the same buckets as those in the Bucketing Mechanism section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph +is compiled and can be reused later, avoiding further graph compilations. + +```{.} +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... 
+INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + + +> [!TIP] +> Compiling all the buckets may take some time and can be disabled by setting the VLLM_SKIP_WARMUP=true environment variable. Keep in mind that if you do this, you may encounter graph compilations + when executing a given bucket for the first time. Disabling warmup is fine for development, but it is highly recommended to enable it in deployment. + +## HPU Graph Capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, +execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which +needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are used, they share the common memory pool ("usable memory") with the KV cache, as determined by the `gpu_memory_utilization` flag (default value is `0.9`). Before the KV cache is allocated, +the model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. Only after that, the `gpu_memory_utilization` flag is applied. At its default value, +it marks 90% of the free device memory at that point as usable. Next, the KV cache is allocated, the model is warmed up, and HPU Graphs are captured. The `VLLM_GRAPH_RESERVED_MEM` environment variable defines +the ratio of memory reserved for HPU Graph capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of the usable memory will be reserved for graph capture (referred to as "usable graph memory"), +and the remaining 90% will be used for the KV cache. The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default +(`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages share equal memory constraints. A lower value corresponds to less usable graph memory reserved for the prefill stage. For example, setting `VLLM_GRAPH_PROMPT_RATIO=0.2` +reserves 20% of usable graph memory for prefill graphs, while 80% is allocated for decode graphs. + +> [!NOTE] +> `gpu_memory_utilization` does not represent the absolute memory usage across the HPU. Instead, it specifies the memory margin after loading the model and running a profile. 
For example, if a device has 100 GiB of + total memory and 50 GiB of free memory after loading the model weights and executing the profiling run, the default value of `gpu_memory_utilization` will mark 90% of the 50 GiB as usable, leaving 5 GiB as a margin, + regardless of the total device memory. + +You can also configure the strategy for capturing HPU graphs separately for the prompt and decode stages. The strategy affects the order in which graphs are captured. Two strategies are implemented: + + - `max_bs` - The graph capture queue is sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in an ascending order + (e.g., `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), which is the default strategy for decode. + - `min_tokens` - The graph capture queue is sorted in an ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), which is the default strategy for prompt. + +When a large number of requests are pending, the vLLM scheduler attempts to fill the maximum batch size for decoding as quickly as possible. Once a request is finished, the decode batch size decreases. +When this happens, vLLM attempts to schedule a prefill iteration for requests in the waiting queue to restore the decode batch size to its previous state. In a fully loaded scenario, the decode +batch size is often at its maximum, making large-batch HPU graphs critical to capture, as indicated by the `max_bs` strategy. Conversely, prefill iterations will typically be executed with very low +batch sizes (1-4), as reflected in the `min_tokens` strategy. + +> [!NOTE] +> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on the memory allocated for graphs in each stage (prefill and decode). vLLM first attempts to use the entire usable prefill graph memory + (usable graph memory * VLLM_GRAPH_PROMPT_RATIO) for capturing prefill HPU Graphs. It will then attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully + captured and there is unused memory remaining in the usable graph memory pool, vLLM will attempt to capture more graphs for the other stage, until no more HPU Graphs can be captured without exceeding + the reserved memory pool. The behavior of this mechanism is illustrated in the example below. 
+ +Each step outlined is logged by the vLLM server, with negative values indicating memory release: + +```{.} +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... 
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +## Recommended vLLM Parameters + +- It is recommended to run inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using the default values (16, 32) may result in suboptimal performance due to underutilization of the Matrix + Multiplication Engine (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- To achieve maximum throughput on Llama 7B, it is recommended to use a batch size of 128 or 256 and a maximum context length of 2048 with HPU Graphs enabled. If you experience out-of-memory issues, + please refer to the Troubleshooting section below. + +## Environment Variables + +**Diagnostic and Profiling Knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true` - enables high level profiler. Resulting JSON traces can be viewed at [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true` - logs graph compilations for each vLLM engine step, but only if any compilation occurs. It is highly recommended to use this in conjunction with `PT_HPU_METRICS_GC_DETAILS=1`. + Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true` - logs graph compilations for every vLLM engine step, even if no compilation occurs. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true` - logs CPU fallbacks for each vLLM engine step, but only if any fallback occurs. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true` - logs CPU fallbacks for each vLLM engine step, even if no fallback occur. Disabled by default. + +**Performance Tuning Knobs:** + +- `VLLM_SKIP_WARMUP`: if `true` - warmup is skipped. `false` by default. 
+- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default.
+- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default.
+- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default.
+- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default.
+- `VLLM_{phase}_{dim}_BUCKET_{param}` - a collection of 12 environment variables configuring the ranges of the bucketing mechanism.
+  - `{phase}` is either `PROMPT` or `DECODE`
+  - `{dim}` is either `BS`, `SEQ` or `BLOCK`
+  - `{param}` is either `MIN`, `STEP` or `MAX`
+  - Default values:
+    - Prompt:
+
+      - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)`
+      - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
+      - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
+      - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
+
+    - Decode:
+
+      - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
+      - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
+      - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
+      - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
+      - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
+      - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)`
+- `VLLM_HANDLE_TOPK_DUPLICATES`: if `true` - handles duplicates that are outside of top-k. `false` by default.
+- `VLLM_CONFIG_HIDDEN_LAYERS` - configures how many hidden layers to run in an HPUGraph when splitting the model among hidden layers; applies when TP is 1. The default is 1.
+  It helps improve throughput by reducing inter-token latency limitations in some models.
+
+Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
+
+- `PT_HPU_LAZY_MODE`: if `0`, the PyTorch Eager backend for Gaudi will be used; if `1`, the PyTorch Lazy backend for Gaudi will be used. `1` is the default.
+- `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs.
+
+# Quantization, FP8 Inference and Model Calibration Process
+
+> [!NOTE]
+> Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described
+ in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
+
+Once you have completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
+```bash
+export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
+vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor_parallel_size 8
+```
+
+`QUANT_CONFIG` is an environment variable that points to the measurement or quantization configuration file. The measurement configuration file is used during the calibration procedure to collect
+measurements for a given model. The quantization configuration is used during inference.
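+
+The same configuration can also be used for offline inference. The snippet below is a minimal sketch that mirrors the offline example in the INC documentation page added in this PR; the model name and config path are illustrative, and `QUANT_CONFIG` must point to your own measurement or quantization JSON file:
+
+```python
+import os
+os.environ["QUANT_CONFIG"] = "/path/to/quant/config.json"  # illustrative path
+
+from vllm import LLM, SamplingParams
+
+llm = LLM("meta-llama/Llama-3.1-405B-Instruct",   # illustrative model
+          quantization="inc",
+          kv_cache_dtype="fp8_inc",
+          weights_load_device="cpu",
+          tensor_parallel_size=8)
+outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.0, max_tokens=32))
+print(outputs[0].outputs[0].text)
+# INC requires an explicit shutdown of the model executor at the end of the run.
+llm.llm_engine.model_executor.shutdown()
+```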
+
+> [!TIP]
+> If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming.
+ However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease.
+
+> [!TIP]
+> When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables:
+> - `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
+> - `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes.
+
+# Troubleshooting
+
+If you encounter device out-of-memory issues or want to attempt inference with higher batch sizes, try tweaking HPU Graphs as follows:
+
+- Tweak the `gpu_memory_utilization` knob. This decreases the allocation of KV cache, leaving some headroom for capturing graphs with larger batch sizes. By default, `gpu_memory_utilization` is set to 0.9,
+  which attempts to allocate ~90% of the HBM left for the KV cache after a short profiling run. Note that this reduces the number of KV cache blocks available, and therefore reduces the effective maximum
+  number of tokens that can be handled at a given time.
+- If this method is not sufficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches.
+  You can do that by adding the `--enforce-eager` flag to the server (for online inference), or by passing the `enforce_eager=True` argument to the LLM constructor (for offline inference).
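+
+For offline experiments, both suggestions can be combined in a minimal sketch (the model name and values below are illustrative, not tuned recommendations):
+
+```python
+from vllm import LLM
+
+# Leave more headroom for HPU Graph capture by lowering the KV cache share,
+# or fall back to eager execution if the graphs still do not fit.
+llm = LLM(
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",  # illustrative model
+    gpu_memory_utilization=0.8,               # default is 0.9; lower values free memory for graphs
+    # enforce_eager=True,                     # uncomment to disable HPU Graphs entirely
+)
+```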
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c1b10b3cf8f58..7c8a93262ee37 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -63,6 +63,10 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str: model = model.lower() if "pixtral" in model: return f"[INST]{question}\n[IMG][/INST]" + elif "llava" in model: + return f"USER: \n{question}\nASSISTANT:" + elif "llama-3.2" in model: + return f"<|image|><|begin_of_text|>{question}" raise ValueError(f"Unsupported model {model}") @@ -221,6 +225,10 @@ async def run_vllm_async( n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + weights_load_device: str = None, + use_padding_aware_scheduling: bool = False, + max_num_seqs: int = 256, + max_num_prefill_seqs: int = None, ) -> float: from vllm import SamplingParams diff --git a/collect_env.py b/collect_env.py index 254c19b19a5ac..16261ce28ca4e 100644 --- a/collect_env.py +++ b/collect_env.py @@ -39,6 +39,8 @@ 'cuda_module_loading', 'nvidia_driver_version', 'nvidia_gpu_models', + 'habana_hpu_models', + 'habana_driver_version', 'cudnn_version', 'pip_version', # 'pip' or 'pip3' 'pip_packages', @@ -254,6 +256,37 @@ def get_nvidia_smi(): return smi +def get_hpu_info(): + try: + command = ["hl-smi", "-q", "-d", "PRODUCT"] + lines = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True).stdout.readlines() + lines = [l.strip('\t') for l in lines] + hpu_count = None + hpu_model = None + hpu_driver = None + model_re = re.compile(r'Product Name.+?: (.+)') + count_re = re.compile(r'Attached AIPs.+?: (\d+)') + driver_re = re.compile(r'Driver Version.+?: (.+)') + for line in lines: + if hpu_c := count_re.match(line): + hpu_count = hpu_c.group(1) + + if hpu_m := model_re.match(line): + hpu_model = hpu_m.group(1) + + if hpu_d := driver_re.match(line): + hpu_driver = hpu_d.group(1) + + if hpu_model and hpu_count and hpu_driver: + break + + if hpu_model is None: + return ('N/A', hpu_driver) + return (f'{hpu_count}x {hpu_model}', hpu_driver) + except: + return ('N/A', 'N/A') + + def get_rocm_version(run_lambda): """Returns the ROCm version if available, otherwise 'N/A'.""" return run_and_parse_first_match(run_lambda, 'hipcc --version', @@ -568,6 +601,7 @@ def get_version_or_na(cfg, prefix): vllm_version = get_vllm_version() vllm_build_flags = summarize_vllm_build_flags() gpu_topo = get_gpu_topo(run_lambda) + hpu_info = get_hpu_info() return SystemEnv( torch_version=version_str, @@ -583,6 +617,8 @@ def get_version_or_na(cfg, prefix): nvidia_gpu_models=get_gpu_info(run_lambda), nvidia_driver_version=get_nvidia_driver_version(run_lambda), cudnn_version=get_cudnn_version(run_lambda), + habana_hpu_models=hpu_info[0], + habana_driver_version=hpu_info[1], hip_compiled_version=hip_compiled_version, hip_runtime_version=hip_runtime_version, miopen_runtime_version=miopen_runtime_version, @@ -626,6 +662,8 @@ def get_version_or_na(cfg, prefix): GPU models and configuration: {nvidia_gpu_models} Nvidia driver version: {nvidia_driver_version} cuDNN version: {cudnn_version} +HPU devices: {habana_hpu_models} +HPU driver version: {habana_driver_version} HIP runtime version: {hip_runtime_version} MIOpen runtime version: {miopen_runtime_version} Is XNNPACK available: {is_xnnpack_available} diff --git a/docs/source/features/quantization/inc.md b/docs/source/features/quantization/inc.md new file mode 100644 index 0000000000000..41c032be53af6 --- /dev/null +++ 
b/docs/source/features/quantization/inc.md
@@ -0,0 +1,64 @@
+(inc)=
+
+# FP8 INC
+
+vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
+Currently, quantization is validated only on Llama models.
+
+Intel Gaudi supports quantization of various modules and functions, including, but not limited to, `Linear`, `KVCache`, `Matmul` and `Softmax`. For more information, please refer to:
+[Supported Modules\\Supported Functions\\Custom Patched Modules](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules).
+
+```{note}
+Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
+```
+
+```{note}
+`QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
+The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
+```
+
+## Run Online Inference Using FP8
+
+Once you've completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command:
+
+```bash
+export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json
+vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor_parallel_size 8
+```
+
+```{tip}
+If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
+```
+
+```{tip}
+When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the environment variables below:
+`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
+`VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes.
+```
+
+## Run Offline Inference Using FP8
+
+To run offline inference (after completing the model calibration process):
+
+- Set the `QUANT_CONFIG` environment variable to point to a JSON configuration file with QUANTIZE mode.
+- Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
+- Call the `shutdown` method of the `model_executor` at the end of the run.
+
+```python
+from vllm import LLM
+llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc")
+...
+# Call llm.generate on the required prompts and sampling params.
+...
+llm.llm_engine.model_executor.shutdown()
+```
+
+## Specifying Device for the Model's Weights Uploading
+
+It is possible to load the unquantized weights on a different device before quantizing them, and then move them to the device on which the model will run.
+This reduces the device memory footprint of model weights, as only quantized weights are stored in device memory. +To set the device to upload weights, use the `weights_load_device` parameter for the `LLM` object, or `--weights-load-device` command line parameter when running online inference: + +```python +from vllm import LLM +llm = LLM("llama3.1/Meta-Llama-3.1-8B-Instruct", quantization="inc", kv_cache_dtype="fp8_inc", weights_load_device="cpu") +``` diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 861cb165c11c2..58f9c4d42947f 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -12,6 +12,7 @@ supported_hardware auto_awq bnb gguf +inc int8 fp8 fp8_e5m2_kvcache diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index f5c0a95ea426e..c375d044dd64b 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -6,7 +6,7 @@ The table below shows the compatibility of various quantization implementations ```{list-table} :header-rows: 1 -:widths: 20 8 8 8 8 8 8 8 8 8 8 +:widths: 20 8 8 8 8 8 8 8 8 8 8 8 * - Implementation - Volta @@ -16,6 +16,7 @@ The table below shows the compatibility of various quantization implementations - Hopper - AMD GPU - Intel GPU + - Intel Gaudi - x86 CPU - AWS Inferentia - Google TPU @@ -27,6 +28,7 @@ The table below shows the compatibility of various quantization implementations - āœ…ļøŽ - āœ— - āœ…ļøŽ + - āœ— - āœ…ļøŽ - āœ— - āœ— @@ -38,6 +40,7 @@ The table below shows the compatibility of various quantization implementations - āœ…ļøŽ - āœ— - āœ…ļøŽ + - āœ— - āœ…ļøŽ - āœ— - āœ— @@ -52,6 +55,7 @@ The table below shows the compatibility of various quantization implementations - āœ— - āœ— - āœ— + - āœ— * - INT8 (W8A8) - āœ— - āœ…ļøŽ @@ -60,6 +64,7 @@ The table below shows the compatibility of various quantization implementations - āœ…ļøŽ - āœ— - āœ— + - āœ— - āœ…ļøŽ - āœ— - āœ— @@ -74,6 +79,7 @@ The table below shows the compatibility of various quantization implementations - āœ— - āœ— - āœ— + - āœ— * - AQLM - āœ…ļøŽ - āœ…ļøŽ @@ -85,6 +91,7 @@ The table below shows the compatibility of various quantization implementations - āœ— - āœ— - āœ— + - āœ— * - bitsandbytes - āœ…ļøŽ - āœ…ļøŽ @@ -96,6 +103,7 @@ The table below shows the compatibility of various quantization implementations - āœ— - āœ— - āœ— + - āœ— * - DeepSpeedFP - āœ…ļøŽ - āœ…ļøŽ @@ -107,17 +115,31 @@ The table below shows the compatibility of various quantization implementations - āœ— - āœ— - āœ— + - āœ— * - GGUF - āœ…ļøŽ - āœ…ļøŽ - āœ…ļøŽ - āœ…ļøŽ - āœ…ļøŽ - - āœ…ļøŽ - āœ— - āœ— - āœ— - āœ— + - āœ— + - āœ— +* - INC (W8A8) + - āœ— + - āœ— + - āœ— + - āœ— + - āœ— + - āœ— + - āœ— + - āœ…ļøŽ + - āœ— + - āœ— + - āœ— ``` - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index b4695d504b601..8ac6e7045f780 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,49 +1,40 @@ # Installation -This tab provides instructions on running vLLM with Intel Gaudi devices. +This tab provides instructions on how to run vLLM with Intel Gaudi devices. 
## Requirements -- OS: Ubuntu 22.04 LTS -- Python: 3.10 +- Ubuntu 22.04 LTS OS +- Python 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 +- Intel Gaudi software version 1.19.0 and above -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Configure a new environment -### Environment verification +### Environment Verification -To verify that the Intel Gaudi software was correctly installed, run: +To verify that the Intel Gaudi software was correctly installed, run the following: ```console hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -pip list | grep neural # verify that neural_compressor is installed +pip list | grep neural # verify that neural-compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. +Refer to [System Verification and Final Tests](https://docs.habana.ai/en/latest/Installation_Guide/System_Verification_and_Final_Tests.html) for more details. ### Run Docker Image -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. +It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. -Use the following commands to run a Docker image: +Use the following commands to run a Docker image. 
Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):

```console
-docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
```

## Set up using Python

@@ -54,20 +45,40 @@ Currently, there are no pre-built Intel Gaudi wheels.

### Build wheel from source

-To build and install vLLM from source, run:
+Currently, multiple ways are provided to install vLLM with Intel® Gaudi®; pick **one** option:
+
+#### 1. Build and Install the stable version
+
+vLLM releases are performed periodically to align with Intel® Gaudi® software releases. The stable version is released with a tag and supports fully validated features and performance optimizations in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork). To install the stable release from [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:

```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork
+git checkout v0.6.4.post2+Gaudi-1.19.0
+pip install -r requirements-hpu.txt
python setup.py develop
```

-Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
+#### 2. Build and Install the latest from vLLM-fork
+
+Currently, the latest features and performance optimizations are being developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and periodically upstreamed to the vLLM main repository. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:

```console
git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork
git checkout habana_main
+pip install -r requirements-hpu.txt
+python setup.py develop
+```
+
+#### 3. Build and Install from vLLM main source
+
+If you prefer to build and install directly from the main vLLM source, to which we periodically upstream new features, run the following:
+
+```console
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-hpu.txt
python setup.py develop
```

@@ -79,114 +90,86 @@ Currently, there are no pre-built Intel Gaudi images.

### Build image from source

+Set up the container with the latest release of the Gaudi Software Suite using the Dockerfile:
+
```console
docker build -f Dockerfile.hpu -t vllm-hpu-env .
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` ```{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. +If you are facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Optional Packages" section of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime).. Make sure you have `habanalabs-container-runtime` package installed and that `habana` container runtime is registered. ``` ## Extra information -## Supported features - -- [Offline inference](#offline-inference) -- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) - -## Unsupported features +## Supported Features + +| **Feature** | **Description** | **References** | +| ------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | +| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | +| HPU autodetection | HPU users do not need to specify the target platform, it will be detected automatically upon vLLM startup | N/A | +| Paged KV cache with algorithms enabled for Intel Gaudi accelerators | vLLM HPU backend contains a custom 
Paged Attention and cache operator implementations optimized for Gaudi devices. | N/A |
+| Custom Intel Gaudi operator implementations | vLLM HPU backend provides optimized implementations of operators such as prefill attention, Root Mean Square Layer Normalization, and Rotary Positional Encoding. | N/A |
+| Tensor parallel inference (single-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across a single node using tensor parallelism with Ray and HCCL. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) [Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html) [HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) |
+| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time, to be later replayed during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) [Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) |
+| Inference with torch.compile (experimental) | vLLM HPU backend experimentally supports inference with torch.compile. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) |
+| Attention with Linear Biases (ALiBi) | vLLM HPU backend supports models utilizing Attention with Linear Biases (ALiBi), such as mpt-7b. | [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) |
+| INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) |
+| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html) [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) |
+| Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable by the standard `--num-scheduler-steps` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) |
+| Automatic prefix caching (experimental) | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by the standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html) [Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) |
+| Speculative decoding (experimental) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via the standard `--speculative_model` and `--num_speculative_tokens` parameters.
| [Documentation](https://docs.vllm.ai/en/latest/models/spec_decode.html) [Example](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_mlpspeculator.html) | + +## Unsupported Features - Beam search -- LoRA adapters -- Quantization +- AWQ quantization - Prefill chunking (mixed-batch inferencing) -## Supported configurations - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. - -- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - -## Performance tuning - -### Execution modes +## Supported Configurations + +The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. 
+ +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling + +## Performance Tuning + +### Execution Modes Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -```{list-table} vLLM execution modes -:widths: 25 25 50 -:header-rows: 1 - -* - `PT_HPU_LAZY_MODE` - - `enforce_eager` - - execution mode -* - 0 - - 0 - - torch.compile -* - 0 - - 1 - - PyTorch eager mode -* - 1 - - 0 - - HPU Graphs -* - 1 - - 1 - - PyTorch lazy mode -``` +| `PT_HPU_LAZY_MODE` | `enforce_eager` | Execution Mode | +| ------------------ | --------------- | ------------------ | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | ```{warning} -In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. 
For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. +All modes using PT_HPU_LAZY_MODE=0 are experimental and should only be used for validating functional correctness. To achieve the best performance, use HPU Graphs or PyTorch Lazy Mode. Performance improvements are planned for future releases. ``` -(gaudi-bucketing-mechanism)= - -### Bucketing mechanism +### Bucketing Mechanism -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. +Intel Gaudi accelerators perform best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) generates optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be highly dependent on input and output tensor shapes, requiring graph recompilation when encountering tensors with different shapes within the same topology. While these binaries efficiently utilize Gaudi, the compilation process itself can introduce noticeable overhead in end-to-end execution. In dynamic inference serving scenarios, it is important to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently, this is achieved by "bucketing" the model's forward pass across two dimensions: `batch_size` and `sequence_length`. ```{note} -Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. +Bucketing helps significantly reduce the number of required graphs, but it does not handle graph compilation or device code generation. These tasks are performed during the warmup and HPUGraph capture phase. ``` Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: @@ -198,9 +181,9 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. -Example (with ramp-up) +#### Example with ramp-up ```text min = 2, step = 32, max = 64 @@ -209,7 +192,7 @@ min = 2, step = 32, max = 64 => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) ``` -Example (without ramp-up) +#### Example without ramp-up ```text min = 128, step = 128, max = 512 @@ -221,18 +204,20 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. ```{warning} -If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. +If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. ``` -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). 
After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. +For example, if a request with 3 sequences, each having a maximum sequence length of 412, is sent to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket. This is because the `batch_size` (number of sequences) will be padded to 4 (the nearest batch size dimension higher than 3), and the maximum sequence length will be padded to 512 (the nearest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will remain in this bucket until either the batch dimension changes (e.g., due to a request being completed), in which case it will become a `(2, 512)` bucket, or the context length increases beyond 512 tokens, at which point it will become a `(4, 640)` bucket. ```{note} -Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +Bucketing is transparent to the user ā€“ padding in the sequence length dimension is never returned, and padding in the batch dimension does not create new requests. ``` ### Warmup -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: +Warmup is an optional but highly recommended step that occurs before the vLLM server starts listening. It executes a forward pass for each bucket using dummy data. The goal is to pre-compile all graphs and avoid any graph compilation overhead within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup. + +This example uses the same buckets as those in the Bucketing Mechanism section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later, avoiding further graph compilations. ```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB @@ -248,40 +233,33 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - ```{tip} -Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. +Compiling all the buckets may take some time and can be disabled by setting the VLLM_SKIP_WARMUP=true environment variable. 
Keep in mind that if you do this, you may encounter graph compilations when executing a given bucket for the first time. Disabling warmup is fine for development, but it is highly recommended to enable it in deployment. ``` -### HPU Graph capture +### HPU Graph Capture [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. +When HPU Graphs are used, they share the common memory pool ("usable memory") with the KV cache, as determined by the `gpu_memory_utilization` flag (default value is `0.9`). Before the KV cache is allocated, the model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. Only after that, the `gpu_memory_utilization` flag is applied. At its default value, it marks 90% of the free device memory at that point as usable. Next, the KV cache is allocated, the model is warmed up, and HPU Graphs are captured. The `VLLM_GRAPH_RESERVED_MEM` environment variable defines the ratio of memory reserved for HPU Graph capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of the usable memory will be reserved for graph capture (referred to as "usable graph memory"), and the remaining 90% will be used for the KV cache. The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and +decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages share equal memory constraints. A lower value corresponds to less usable graph memory reserved for the prefill stage. 
For example, setting `VLLM_GRAPH_PROMPT_RATIO=0.2` reserves 20% of usable graph memory for prefill graphs, while 80% is allocated for decode graphs. ```{note} -`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. +`gpu_memory_utilization` does not represent the absolute memory usage across the HPU. Instead, it specifies the memory margin after loading the model and running a profile. For example, if a device has 100 GiB of total memory and 50 GiB of free memory after loading the model weights and executing the profiling run, the default value of `gpu_memory_utilization` will mark 90% of the 50 GiB as usable, leaving 5 GiB as a margin, regardless of the total device memory. ``` -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -\- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -\- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt +You can also configure the strategy for capturing HPU graphs separately for the prompt and decode stages. The strategy affects the order in which graphs are captured. Two strategies are implemented: + +- `max_bs` - The graph capture queue is sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in an ascending order (e.g., `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), which is the default strategy for decode. +- `min_tokens` - The graph capture queue is sorted in an ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), which is the default strategy for prompt. -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. +When a large number of requests are pending, the vLLM scheduler attempts to fill the maximum batch size for decoding as quickly as possible. Once a request is finished, the decode batch size decreases. When this happens, vLLM attempts to schedule a prefill iteration for requests in the waiting queue to restore the decode batch size to its previous state. 
In a fully loaded scenario, the decode batch size is often at its maximum, making large-batch HPU graphs critical to capture, as indicated by the `max_bs` strategy. Conversely, prefill iterations will typically be executed with very low batch sizes (1-4), as reflected in the `min_tokens` strategy. ```{note} -`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. +`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on the memory allocated for graphs in each stage (prefill and decode). vLLM first attempts to use the entire usable prefill graph memory (usable graph memory * VLLM_GRAPH_PROMPT_RATIO) for capturing prefill HPU Graphs. It will then attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured and there is unused memory remaining in the usable graph memory pool, vLLM will attempt to capture more graphs for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism is illustrated in the example below. ``` -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): +Each step outlined is logged by the vLLM server, with negative values indicating memory release: ```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] @@ -298,7 +276,7 @@ INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB @@ -317,38 +295,32 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi ### Recommended vLLM Parameters -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). 
-- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. +- It is recommended to run inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using the default values (16, 32) may result in suboptimal performance due to underutilization of the Matrix Multiplication Engine (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- To achieve maximum throughput on Llama 7B, it is recommended to use a batch size of 128 or 256 and a maximum context length of 2048 with HPU Graphs enabled. If you experience out-of-memory issues, please refer to the Troubleshooting section below. -### Environment variables +### Environment Variables -**Diagnostic and profiling knobs:** +**Diagnostic and Profiling Knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: if `true` - enables high level profiler. Resulting JSON traces can be viewed at [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true` - logs graph compilations for each vLLM engine step, but only if any compilation occurs. It is highly recommended to use this in conjunction with `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true` - logs graph compilations for every vLLM engine step, even if no compilation occurs. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true` - logs CPU fallbacks for each vLLM engine step, but only if any fallback occurs. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true` - logs CPU fallbacks for each vLLM engine step, even if no fallback occurs. Disabled by default. -**Performance tuning knobs:** +**Performance Tuning Knobs:** -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default +- `VLLM_SKIP_WARMUP`: if `true` - warmup is skipped. `false` by default. -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default. -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default.
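The list of tuning knobs continues below. As a brief usage illustration, a hypothetical offline configuration combining the recommended parameters above with two of these knobs might look as follows; the model name and values are placeholders rather than tuned recommendations:

```python
import os

# Illustrative only: pair the recommended Gaudi 2 settings with two tuning knobs.
os.environ["VLLM_SKIP_WARMUP"] = "false"       # keep warmup enabled outside of development
os.environ["VLLM_GRAPH_RESERVED_MEM"] = "0.1"  # default share of usable memory for graph capture

from vllm import LLM, SamplingParams  # noqa: E402

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder model
    dtype="bfloat16",
    block_size=128,      # recommended block size for BF16 on Gaudi 2
    max_model_len=2048,  # recommended max context length for Llama 7B throughput runs
    max_num_seqs=128,    # a batch size of 128 or 256 is recommended for max throughput
)

outputs = llm.generate(["The future of AI is"], SamplingParams(max_tokens=32))
```

The same environment variables can equally be exported in the shell before launching an online server instead of being set from Python.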
-- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default. -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default. -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism. - `{phase}` is either `PROMPT` or `DECODE` @@ -359,41 +331,175 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Default values: - Prompt: + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - Decode: + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +- `VLLM_HANDLE_TOPK_DUPLICATES`: if `true` - handles duplicates that are outside of top-k. `false` by default. + +- `VLLM_CONFIG_HIDDEN_LAYERS` - configures how many hidden layers to run in a single HPUGraph when splitting the model among its hidden layers (applies when TP is 1). The default is 1. It helps improve throughput by reducing inter-token latency limitations in some models. Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs - -## Troubleshooting: tweaking HPU graphs - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default `gpu_memory_utilization` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely.
With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online serving), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is the default. +- `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs. + +## Quantization, FP8 Inference and Model Calibration Process + +```{note} +Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package. +``` + +Once you have completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command: + +```bash +export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json +vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor_parallel_size 8 +``` + +`QUANT_CONFIG` is an environment variable that points to the measurement or quantization configuration file. The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference. + +```{tip} +If you are prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which is time-consuming. However, disabling this feature in production environments is not recommended, as it can lead to a significant performance decrease. +``` + +```{tip} +When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this, set the following environment variables: + +- `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes. +- `VLLM_RPC_TIMEOUT` - to adjust the RPC protocol timeout used by the OpenAI-compatible API. This value is in milliseconds, e.g., 600000 equals 10 minutes. +``` + +## Troubleshooting + +If you encounter device out-of-memory issues or want to attempt inference with higher batch sizes, try tweaking HPU Graphs as follows: + +- Tweak the `gpu_memory_utilization` knob. This will decrease the allocation of KV cache, leaving some headroom for capturing graphs with larger batch size. By default, `gpu_memory_utilization` is set to 0.9. It attempts to allocate ~90% of HBM left for KV cache after a short profiling run. Note that this reduces the number of KV cache blocks you have available, and therefore reduces the effective maximum number of tokens handled at a given time. +- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding the `--enforce-eager` flag to the server (for online inference), or by passing `enforce_eager=True` argument to the LLM constructor (for offline inference). + +## Changelog + +### 1.19.0 + +#### New features + +- Added fake HPU mode to Habana components with dummy habana_frameworks module.
([#250](https://github.com/HabanaAI/vllm-fork/pull/250)) +- Enabled HPU Graph capture even when warmup is skipped ([#320](https://github.com/HabanaAI/vllm-fork/pull/320)) +- Introduced vllm-hpu-extension, removed vllm.hpu directory and changed relevant imports ([#291](https://github.com/HabanaAI/vllm-fork/pull/291), [#323](https://github.com/HabanaAI/vllm-fork/pull/323)) +- Enabled async output processing for HPU ([#342](https://github.com/HabanaAI/vllm-fork/pull/342)) +- Enabled automatic BF16 usage on HPU instead of FP16 ([#361](https://github.com/HabanaAI/vllm-fork/pull/361)) +- Added padding-aware scheduling and option to limit prefill batch size ([#394](https://github.com/HabanaAI/vllm-fork/pull/394)) +- Overhauled HPU support of RotaryEmbedding ([#404](https://github.com/HabanaAI/vllm-fork/pull/404)) +- Added HPU specific arguments to benchmark_throughput ([#406](https://github.com/HabanaAI/vllm-fork/pull/406)) +- Added support for long context lengths with LoRA ([#418](https://github.com/HabanaAI/vllm-fork/pull/418)) +- Added support for various softmax normalization options ([#378](https://github.com/HabanaAI/vllm-fork/pull/378), [#420](https://github.com/HabanaAI/vllm-fork/pull/420)) +- Added initial support for automatic prefix caching ([#162](https://github.com/HabanaAI/vllm-fork/pull/162)) +- Added multi step scheduling HPU support with tensor parallelism support ([#441](https://github.com/HabanaAI/vllm-fork/pull/441), [#457](https://github.com/HabanaAI/vllm-fork/pull/457)) +- Added HPU support for speculative_decoding ([#375](https://github.com/HabanaAI/vllm-fork/pull/375), [#461](https://github.com/HabanaAI/vllm-fork/pull/461)) +- Enabled asynchronous input preparation in HPU model runner ([#497](https://github.com/HabanaAI/vllm-fork/pull/497)) +- Aligned HPU fork with upstream code up to 01aae1c (v0.6.4.post2) ([#259](https://github.com/HabanaAI/vllm-fork/pull/259), [#311](https://github.com/HabanaAI/vllm-fork/pull/311), [#340](https://github.com/HabanaAI/vllm-fork/pull/340), [#353](https://github.com/HabanaAI/vllm-fork/pull/353), [#465](https://github.com/HabanaAI/vllm-fork/pull/465), [#468](https://github.com/HabanaAI/vllm-fork/pull/468), [#485](https://github.com/HabanaAI/vllm-fork/pull/485)) + +#### Performance optimizations + +- Reduced default value of VLLM_GRAPH_RESERVED_MEM to 0.1 ([#292](https://github.com/HabanaAI/vllm-fork/pull/292)) +- Added attention performance optimizations: prefill cache write chunking, div_i32 removal from insert_or_update_cache ([#289](https://github.com/HabanaAI/vllm-fork/pull/289)) +- Optimized Qwen2 model on Gaudi ([#233](https://github.com/HabanaAI/vllm-fork/pull/233)) +- Optimized performance of top_p and top_k calculations ([#449](https://github.com/HabanaAI/vllm-fork/pull/449)) +- Removed CPU sync before sampler ([#414](https://github.com/HabanaAI/vllm-fork/pull/414)) +- Enabled Contiguous Paged Attention optimization ([#424](https://github.com/HabanaAI/vllm-fork/pull/424), [#433](https://github.com/HabanaAI/vllm-fork/pull/433), [#519](https://github.com/HabanaAI/vllm-fork/pull/519)) +- Reduced block fragmentation ([#426](https://github.com/HabanaAI/vllm-fork/pull/426)) +- Enabled FusedSDPA prefill by default ([#447](https://github.com/HabanaAI/vllm-fork/pull/447), [#448](https://github.com/HabanaAI/vllm-fork/pull/448)) +- Offload logits processing to CPU when guided decoding is used ([#358](https://github.com/HabanaAI/vllm-fork/pull/358)) +- Enabled Dynamic MoE layer for Mixtral 
([#425](https://github.com/HabanaAI/vllm-fork/pull/425)) +- Enabled INC patching matmuls in paged attention's block2batch and batch2block ([#500](https://github.com/HabanaAI/vllm-fork/pull/500)) +- Optimized multi-step scheduling deepcopy overhead ([#452](https://github.com/HabanaAI/vllm-fork/pull/452)) +- Enabled FP8 patching of more matmul operations in Paged Attention ([#500](https://github.com/HabanaAI/vllm-fork/pull/500)) +- Enabled warmup for multi-step scheduling ([#501](https://github.com/HabanaAI/vllm-fork/pull/501)) +- Added regional compilation support for torch.compile mode ([#595](https://github.com/HabanaAI/vllm-fork/pull/595)) +- Enabled warmup of random sampler ([#506](https://github.com/HabanaAI/vllm-fork/pull/506)) + +#### Bugfixes + +- Fixed LLaVA-1.5 multi-modal model inference ([#283](https://github.com/HabanaAI/vllm-fork/pull/283)) +- Fixed blocks number calculation for Flat Paged Attention ([#269](https://github.com/HabanaAI/vllm-fork/pull/269)) +- Fixed initialize_ray_cluster device_str bug ([#297](https://github.com/HabanaAI/vllm-fork/pull/297)) +- Fixed calculating slots for warmup ([#310](https://github.com/HabanaAI/vllm-fork/pull/310)) +- Removed padding block from a list of available blocks in allocators ([#313](https://github.com/HabanaAI/vllm-fork/pull/313)) +- Fixed seq_len for padding sequences ([#318](https://github.com/HabanaAI/vllm-fork/pull/318)) +- Fixed LoRA specific conditions in profile_run ([#317](https://github.com/HabanaAI/vllm-fork/pull/317)) +- Removed throwing "Failed to imported from vllm.\_C" warning on HPU ([#326](https://github.com/HabanaAI/vllm-fork/pull/326)) +- Fixed documentation build warnings ([#330](https://github.com/HabanaAI/vllm-fork/pull/330)) +- Fixed INC FP8 inference after rebase ([#333](https://github.com/HabanaAI/vllm-fork/pull/333)) +- Refined INC shutdown code ([#335](https://github.com/HabanaAI/vllm-fork/pull/335)) +- Fixed torch.compile issue of dispatch key set mismatch ([#299](https://github.com/HabanaAI/vllm-fork/pull/299)) +- Fixed runtime errors reported when using long input sequence lengths with LoRA ([#339](https://github.com/HabanaAI/vllm-fork/pull/339)) +- Fixed hpu_set_env call in load_model in vllm ([#364](https://github.com/HabanaAI/vllm-fork/pull/364)) +- Fixed LoRA tests ([#376](https://github.com/HabanaAI/vllm-fork/pull/376)) +- Removed constraints for bucket creation during warmup in LoRA ([#382](https://github.com/HabanaAI/vllm-fork/pull/382)) +- Fixed lora_manager tests with hpu_model_runner ([#386](https://github.com/HabanaAI/vllm-fork/pull/386)) +- Removed workaround added to resolve multi-card stall issue ([#387](https://github.com/HabanaAI/vllm-fork/pull/387)) +- Added workaround for RuntimeError: "fill_cpu" not implemented for 'Float8_e4m3fn' ([#402](https://github.com/HabanaAI/vllm-fork/pull/402)) +- Fixed SchedulerConfig params ([#459](https://github.com/HabanaAI/vllm-fork/pull/459)) +- Fixed multistep deepcopy overhead ([#452](https://github.com/HabanaAI/vllm-fork/pull/452)) +- Added option to disable duplicates in topk ([#464](https://github.com/HabanaAI/vllm-fork/pull/464)) +- Enabled lazy import of HPU-dependent components ([#363](https://github.com/HabanaAI/vllm-fork/pull/363)) +- Fixed bug: seed_everything function doesn't handle HPU ([#384](https://github.com/HabanaAI/vllm-fork/pull/384)) +- Removed redundant set_active_loras call during warmup ([#413](https://github.com/HabanaAI/vllm-fork/pull/413)) +- Fixed number of blocks when profiling contiguous paged attention 
([#496](https://github.com/HabanaAI/vllm-fork/pull/496)) +- Fixed one_hot bug in torch compile mode ([#427](https://github.com/HabanaAI/vllm-fork/pull/427)) +- Fixed execution of empty steps in multi-step scheduling ([#526](https://github.com/HabanaAI/vllm-fork/pull/526)) + +#### Other + +- Updated SynapseAI version in README & Dockerfile ([#390](https://github.com/HabanaAI/vllm-fork/pull/390)) +- Updated documentation on support of FP8 ([#288](https://github.com/HabanaAI/vllm-fork/pull/288)) +- Added FP8 inference procedure ([#504](https://github.com/HabanaAI/vllm-fork/pull/504)) +- Fixed broken URLs in gaudi-installation ([#473](https://github.com/HabanaAI/vllm-fork/pull/473)) +- Renamed vLLM components from Habana to HPU ([#359](https://github.com/HabanaAI/vllm-fork/pull/359)) +- Introduced bucketing mechanism overhaul and moved bucketing logic to extension ([#394](https://github.com/HabanaAI/vllm-fork/pull/394), [#530](https://github.com/HabanaAI/vllm-fork/pull/530), [#534](https://github.com/HabanaAI/vllm-fork/pull/534)) + +(target-2)= + +### 1.18.0 + +(new-features-1)= + +#### New features + +- Added support for FP8 INC inference ([#144](https://github.com/HabanaAI/vllm-fork/pull/144)) +- Added support for FusedSDPA prefills ([#168](https://github.com/HabanaAI/vllm-fork/pull/168)) +- Enabled LoRA support for HPU ([#170](https://github.com/HabanaAI/vllm-fork/pull/170), [#247](https://github.com/HabanaAI/vllm-fork/pull/247)) +- Enabled buckets not warmed-up warnings ([#222](https://github.com/HabanaAI/vllm-fork/pull/222)) +- Enabled Flat Paged Attention optimization ([#169](https://github.com/HabanaAI/vllm-fork/pull/169)) +- Added disable_tensor_cache=True to HPUGraph capture ([#252](https://github.com/HabanaAI/vllm-fork/pull/252)) +- Added support for Mixtral quantization using INC ([#267](https://github.com/HabanaAI/vllm-fork/pull/267)) +- Added option to skip forward pass execution during warmup ([#227](https://github.com/HabanaAI/vllm-fork/pull/227)) +- Added PyTorch profiler integration ([#256](https://github.com/HabanaAI/vllm-fork/pull/256)) +- Added Dockerfile.hpu ([#200](https://github.com/HabanaAI/vllm-fork/pull/200)) +- Added topp/topk calculation sampler optimization ([#195](https://github.com/HabanaAI/vllm-fork/pull/195)) + +(bugfixes-1)= + +#### Bugfixes + +- HPU Buckets now don't exceed token budget ([#206](https://github.com/HabanaAI/vllm-fork/pull/206)) +- Fixed bug causing incorrect lower bucket boundary calculation ([#239](https://github.com/HabanaAI/vllm-fork/pull/239)) +- Fixed ALiBi support ([#254](https://github.com/HabanaAI/vllm-fork/pull/254)) +- Fixed HPU guided decoding crashes ([#236](https://github.com/HabanaAI/vllm-fork/pull/236)) +- Fixed incorrect handling of large bucket minimums ([#235](https://github.com/HabanaAI/vllm-fork/pull/235)) +- Issued Llama-405b workaround for memory allocation error ([#184](https://github.com/HabanaAI/vllm-fork/pull/184)) +- Enabled dispersed dummy cache slots for avoiding caching issues ([#243](https://github.com/HabanaAI/vllm-fork/pull/243)) +- Eliminated Llama and GPTBigCode graph breaks in torch.compile mode ([#202](https://github.com/HabanaAI/vllm-fork/pull/202)) diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 8ac80e5e5c553..a20e315f791e4 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -7,6 +7,8 @@ This guide will help you quickly get started with vLLM to perform: - [Offline batched
inference](#quickstart-offline) - [Online serving using OpenAI-compatible server](#quickstart-online) +Be sure to complete the [Gaudi installation instructions](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/gaudi-installation.rst#run-docker-image) before continuing with this guide. + ## Prerequisites - OS: Linux diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000000..5dfb4bb1c32f9 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,4 @@ +# Please refer to the Intel® Gaudi® README for Gaudi examples + +> [!NOTE] +> Not all examples in this folder are Intel Gaudi specific; they come from the original vllm-project repository from which this fork was created. For examples such as offline inference with OpenAI on Intel Gaudi, please refer to the [Intel® Gaudi® README supported features table](https://github.com/HabanaAI/vllm-fork/blob/v0.6.4.post2%2BGaudi-1.19.0/README_GAUDI.md#supported-features) and the [quantization section](https://github.com/HabanaAI/vllm-fork/blob/v0.6.4.post2%2BGaudi-1.19.0/README_GAUDI.md#quantization-fp8-inference-and-model-calibration-process) for FP8 examples. \ No newline at end of file diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py new file mode 100644 index 0000000000000..b8154a29a82bb --- /dev/null +++ b/examples/lora_inference_hpu.py @@ -0,0 +1,47 @@ +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + +llm = LLM(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_num_seqs=2, + dtype='bfloat16') + +sampling_params = SamplingParams(temperature=0, + max_tokens=1024, + stop=["[/assistant]"]) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /Ė©okiru/ [Ć²kĆ¬É½ÉÆĢ]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx?
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 +] + +expected_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /Ė©okiru/' [Ć²kĆ¬É½ÉÆĢ] AND accented_mora = 'low tone mora with a gloss of /Ė©okiru/' [Ć²kĆ¬É½ÉÆĢ] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + +outputs = llm.generate(prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, + sql_lora_path)) + +for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + match = expected_output[i] == generated_text + if not match: + print( + f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}" # noqa: E501 + ) diff --git a/examples/offline_inference_eaglespeculator.py b/examples/offline_inference_eaglespeculator.py new file mode 100644 index 0000000000000..e13965d77e6ea --- /dev/null +++ b/examples/offline_inference_eaglespeculator.py @@ -0,0 +1,68 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + latency_per_token = (end - start) / sum( + [len(o.outputs[0].token_ids) for o in outputs]) + # Print the outputs. 
+ ret = [] + for output in outputs: + generated_text = output.outputs[0].text + ret.append(generated_text) + return ret, latency_per_token + + +if __name__ == "__main__": + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=20) + + # Create an LLM without spec decoding + print("==============Without speculation==================") + llm = LLM(model="JackFram/llama-68m") + + ret_non_spec, latency_per_token_non_spec = time_generation( + llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + print("==============With speculation=====================") + llm = LLM( + model="JackFram/llama-68m", + speculative_model="abhigoyal/vllm-eagle-llama-68m-random", + num_speculative_tokens=5, + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + ) + + ret_spec, latency_per_token_spec = time_generation(llm, prompts, + sampling_params) + + del llm + gc.collect() + print("================= Summary =====================") + print("input is ", prompts, "\n") + print("Non Spec Decode - latency_per_token is ", + latency_per_token_non_spec) + print("Generated Text is :", ret_non_spec, "\n") + print("Spec Decode - latency_per_token is ", latency_per_token_spec) + print("Generated Text is :", ret_spec) diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py new file mode 100644 index 0000000000000..248b5740fa35e --- /dev/null +++ b/examples/offline_inference_fakehpu.py @@ -0,0 +1,38 @@ +import os + +from vllm import LLM, SamplingParams + +if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0': + from vllm.utils import migrate_to_cpu + migrate_to_cpu() + +# Sample prompts. +prompts = [ + "Berlin is the capital city of ", + "Louvre is located in the city of ", + "Barack Obama was the 44th president of ", + "Warsaw is the capital city of ", + "Gniezno is a city in ", + "San Francisco is located in the state of ", + "Llanfairpwllgwyngyll is located in country of ", +] +ref_answers = [ + "Germany", "Paris", "United States", "Poland", "Poland", "California", + "Wales" +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0, n=1) + +# Create an LLM. +llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output, answer in zip(outputs, ref_answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert answer in generated_text, ( + f"The generated text does not contain the correct answer: {answer}") +print('PASSED') diff --git a/examples/offline_inference_medusaspeculator.py b/examples/offline_inference_medusaspeculator.py new file mode 100644 index 0000000000000..100d452d1bc75 --- /dev/null +++ b/examples/offline_inference_medusaspeculator.py @@ -0,0 +1,67 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. 
+ # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + latency_per_token = (end - start) / sum( + [len(o.outputs[0].token_ids) for o in outputs]) + # Print the outputs. + ret = [] + for output in outputs: + generated_text = output.outputs[0].text + ret.append(generated_text) + return ret, latency_per_token + + +if __name__ == "__main__": + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=20) + + # Create an LLM without spec decoding + print("==============Without speculation==================") + llm = LLM(model="JackFram/llama-68m") + + ret_non_spec, latency_per_token_non_spec = time_generation( + llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + print("==============With speculation=====================") + llm = LLM( + model="JackFram/llama-68m", + speculative_model="abhigoyal/vllm-medusa-llama-68m-random", + num_speculative_tokens=5, + use_v2_block_manager=True, + ) + + ret_spec, latency_per_token_spec = time_generation(llm, prompts, + sampling_params) + + del llm + gc.collect() + print("================= Summary =====================") + print("input is ", prompts, "\n") + print("Non Spec Decode - latency_per_token is ", + latency_per_token_non_spec) + print("Generated Text is :", ret_non_spec, "\n") + print("Spec Decode - latency_per_token is ", latency_per_token_spec) + print("Generated Text is :", ret_spec) diff --git a/examples/offline_inference_spec_decode.py b/examples/offline_inference_spec_decode.py new file mode 100644 index 0000000000000..22daecfcca070 --- /dev/null +++ b/examples/offline_inference_spec_decode.py @@ -0,0 +1,67 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + latency_per_token = (end - start) / sum( + [len(o.outputs[0].token_ids) for o in outputs]) + # Print the outputs. + ret = [] + for output in outputs: + generated_text = output.outputs[0].text + ret.append(generated_text) + return ret, latency_per_token + + +if __name__ == "__main__": + + # Sample prompts. 
+ prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM without spec decoding + print("==============Without speculation==================") + llm = LLM(model="facebook/opt-6.7b") + + ret_non_spec, latency_per_token_non_spec = time_generation( + llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + print("==============With speculation=====================") + llm = LLM( + model="facebook/opt-6.7b", + speculative_model="facebook/opt-125m", + num_speculative_tokens=5, + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + ) + + ret_spec, latency_per_token_spec = time_generation(llm, prompts, + sampling_params) + + del llm + gc.collect() + print("================= Summary =====================") + print("input is ", prompts, "\n") + print("Non Spec Decode - latency_per_token is ", + latency_per_token_non_spec) + print("Generated Text is :", ret_non_spec, "\n") + print("Spec Decode - latency_per_token is ", latency_per_token_spec) + print("Generated Text is :", ret_spec) diff --git a/examples/other/fp8/README.md b/examples/other/fp8/README.md index 4e8031d954113..ee09f09dfdcd2 100644 --- a/examples/other/fp8/README.md +++ b/examples/other/fp8/README.md @@ -1,3 +1,6 @@ +> [!NOTE] +>The examples in this folder are **NOT** Intel Gaudi specific and come from the original vllm-project repository from where this fork was created. For FP8 examples on Intel Gaudi please refer to IntelĀ® GaudiĀ® README. + # FP8 KV Cache This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. 
diff --git a/requirements-hpu.txt b/requirements-hpu.txt index f4fb89ef42834..873d2db93f90d 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@87ab1b8 diff --git a/tests/conftest.py b/tests/conftest.py index 95af4ac1eb17b..55cae78e4d721 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,6 +32,7 @@ zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, identity, is_list_of) @@ -656,6 +657,80 @@ def hf_runner(): return HfRunner +class HfHPURunner(HfRunner): + + def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if device is None: + device = "cpu" if current_platform.is_cpu() else "hpu" + + if isinstance(x, dict): + return {k: self.wrap_device(v, device) for k, v in x.items()} + + if hasattr(x, "device") and x.device.type == device: + return x + + return x.to(device) + + def __init__( + self, + model_name: str, + dtype: str = "half", + *, + model_kwargs: Optional[Dict[str, Any]] = None, + is_embedding_model: bool = False, + auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, + postprocess_inputs: Callable[[BatchEncoding], + BatchEncoding] = identity, + ) -> None: + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] + + self.model_name = model_name + + model_kwargs = model_kwargs if model_kwargs is not None else {} + self.model = self.wrap_device( + auto_cls.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + **model_kwargs, + ).eval()) + + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + wrap_done = False + if hasattr(self.model, "language_model"): + self.model.language_model = wrap_in_hpu_graph( + self.model.language_model) + wrap_done = True + if hasattr(self.model, "vision_model"): + self.model.vision_model = wrap_in_hpu_graph( + self.model.vision_model) + wrap_done = True + if not wrap_done: + self.model = wrap_in_hpu_graph(self.model) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor # noqa: F401 + self.processor = AutoProcessor.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + self.dtype = dtype + self.postprocess_inputs = postprocess_inputs + + +@pytest.fixture(scope="session") +def hf_hpu_runner(): + return HfHPURunner + + class VllmRunner: def __init__( @@ -670,7 +745,7 @@ def __init__( dtype: str = "half", disable_log_stats: bool = True, tensor_parallel_size: int = 1, - block_size: int = 16, + block_size: int = 16 if not current_platform.is_hpu() else 128, enable_chunked_prefill: bool = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, @@ -973,11 +1048,19 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): yield caplog +def is_hpu(): + from importlib import util + return util.find_spec('habana_frameworks') is not None + + @pytest.fixture(scope="session") def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" + if is_hpu(): + return 
torch.hpu.device_count() + return cuda_device_count_stateless() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index eee77c22ab81a..d6f79bc1bb3f8 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -20,6 +20,9 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + CUDA_DEVICES = ['hpu'] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -65,6 +68,8 @@ def test_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key) # Compare the results. torch.testing.assert_close(out_query, @@ -120,6 +125,8 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, @@ -193,6 +200,8 @@ def test_batched_rotary_embedding_multi_lora( # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key, query_offsets) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index e7378d00765f0..151cfa767d891 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -65,11 +65,14 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): + import habana_frameworks.torch.hpu # noqa: F401 temp_file = tempfile.mkstemp()[1] backend = "nccl" if current_platform.is_cpu(): backend = "gloo" + elif current_platform.is_hpu(): + backend = "hccl" init_distributed_environment(world_size=1, rank=0, @@ -293,8 +296,14 @@ def get_model_patched(**kwargs): max_lora_rank=8) return get_model_old(**kwargs) - with patch("vllm.worker.model_runner.get_model", get_model_patched): - engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + if current_platform.is_hpu(): + with patch("vllm.worker.hpu_model_runner.get_model", + get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + else: + with patch("vllm.worker.model_runner.get_model", get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + yield engine.llm_engine del engine cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/lora/test_layers_hpu.py b/tests/lora/test_layers_hpu.py new file mode 100644 index 0000000000000..1df4ab9616672 --- /dev/null +++ b/tests/lora/test_layers_hpu.py @@ -0,0 +1,1438 @@ +import random +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from unittest.mock import patch + +import habana_frameworks.torch.core as htcore +import pytest +import torch +import torch.nn.functional as F +from vllm_hpu_extension.ops import LoraMask + +from tests.utils import fork_new_process_for_each_test +from vllm.config import LoRAConfig +from vllm.lora.fully_sharded_layers import ( + ColumnParallelLinearWithShardedLoRA, + MergedColumnParallelLinearWithShardedLoRA, + 
MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) +# yapf conflicts with isort for this block +# yapf: disable +from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LogitsProcessorWithLoRA, LoRAMapping, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLora, + ReplicatedLinearWithLoRA, + RowParallelLinearWithLoRA, + VocabParallelEmbeddingWithLoRA) +# yapf: enable +from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, + PackedLoRALayerWeights) +from vllm.lora.punica_wrapper import get_punica_wrapper +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) +from vllm.model_executor.utils import set_random_seed +from vllm.platforms import current_platform + +from .utils import DummyLoRAManager + +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.float32: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), +} +# TODO: Modify this based on platform +if current_platform.is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +#For GPU, we will launch different triton kernels between the prefill and decode +# stages, so we need to verify this. prefill stage(True) or decode stage(False) +STAGES = [True, False] + + +def get_random_id_to_index(num_loras: int, + num_slots: int, + log: bool = True) -> List[Optional[int]]: + """Creates a random lora_id_to_index mapping. + + Args: + num_loras: The number of active loras in the mapping. + num_slots: The number of slots in the mapping. Must be larger + than num_loras. + log: Whether to log the output. + """ + + if num_loras > num_slots: + raise ValueError( + f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " + "num_loras must be less than or equal to num_slots.") + + slots: List[Optional[int]] = [None] * num_slots + random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() + for lora_id, slot_idx in enumerate(random_slot_selections, start=1): + slots[slot_idx] = lora_id + + if log: + print(f"Created lora_id_to_index mapping: {slots}.") + + return slots + + +def populate_loras( + id_to_index: List[Optional[int]], + layer: BaseLayerWithLoRA, + layer_weights: torch.Tensor, + generate_embeddings_tensor: int = 0, + repeats: int = 1, +) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]: + """This method populates the lora layers with lora weights. + + Args: + id_to_index: a list of lora ids. The index of the lora id + represents which memory slot the lora matrices are + stored in. A None value indicates a free slot. + layer: the LoRAlayer to populate. + layer_weights: the PyTorch tensor containing the layer's + weights. + generate_embeddings_tensor: whether to generate an + embeddings tensor for each LoRA. + repeats: must only be set for column parallel packed + layers. Indicates the number of loras to compose + together to create a single lora layer. + """ + + # Dictionary that maps the lora ID to the + # corresponding lora weights. 
+ lora_dict: Dict[int, LoRALayerWeights] = dict() + + # Dictionary that maps the lora ID to the + # corresponding subloras. + sublora_dict: Dict[int, List[LoRALayerWeights]] = dict() + + for slot_idx, lora_id in enumerate(id_to_index): + if lora_id is not None: + subloras: List[LoRALayerWeights] = [] + sublora_len = layer_weights.shape[0] // repeats + for i in range(repeats): + sublora = DummyLoRAManager( + layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) + sublora.lora_b = sublora.lora_b[:, (sublora_len * + i):(sublora_len * (i + 1))] + sublora.optimize() + subloras.append(sublora) + + lora = PackedLoRALayerWeights.pack( + subloras) if repeats > 1 else subloras[0] + + layer.set_lora( + slot_idx, + lora_a=lora.lora_a, + lora_b=lora.lora_b, + embeddings_tensor=lora.embeddings_tensor, + ) + + lora_dict[lora_id] = lora + sublora_dict[lora_id] = subloras + + return lora_dict, sublora_dict + + +def create_random_inputs( + active_lora_ids: List[int], + num_inputs: int, + input_size: Tuple[int, ...], + input_range: Tuple[float, float], + input_type: torch.dtype = torch.int, + device: torch.device = "cuda" +) -> Tuple[List[torch.Tensor], List[int], List[int]]: + """Creates random inputs. + + Args: + active_lora_ids: lora IDs of active lora weights. + num_inputs: the number of inputs to create. + input_size: the size of each individual input. + input_range: the range of values to include in the input. + input_range[0] <= possible input values < input_range[1] + input_type: the type of values in the input. + """ + + low, high = input_range + + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + + for _ in range(num_inputs): + if input_type == torch.int: + inputs.append( + torch.randint(low=int(low), + high=int(high), + size=input_size, + device=device)) + else: + inputs.append( + torch.rand(size=input_size, dtype=input_type, device=device) * + high + low) + + lora_id = random.choice(active_lora_ids) + index_mapping += [lora_id] * input_size[0] + prompt_mapping += [lora_id] + + return inputs, index_mapping, prompt_mapping + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask + + +def check_punica_wrapper(punica_wrapper) -> bool: + if current_platform.is_cuda_alike(): + from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU + + return type(punica_wrapper) is PunicaWrapperGPU + elif current_platform.is_hpu(): + # Lazy import to avoid ImportError + from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU + return type(punica_wrapper) is PunicaWrapperHPU + else: + return False + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA + # device, see: 
https://github.com/triton-lang/triton/issues/2925 + # Same below. + if current_platform.is_cuda(): + torch.cuda.set_device(device) + + torch.set_default_device(device) + max_loras = 8 + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def create_random_embedding_layer(): + embedding = VocabParallelEmbedding(vocab_size, 256) + embedding.weight.data = torch.rand_like(embedding.weight.data) + embedding.weight.data[vocab_size:, :] = 0 + lora_embedding = VocabParallelEmbeddingWithLoRA(embedding) + lora_embedding.create_lora_weights(max_loras, lora_config) + + return embedding, lora_embedding + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + embedding, lora_embedding = create_random_embedding_layer() + lora_embedding.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_embedding, + layer_weights=embedding.weight.T, + ) + + if current_platform.is_hpu(): + htcore.mark_step() + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + device=device) + + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + + lora_result = lora_embedding(torch.cat(inputs)) + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] + result = embedding(input_) + after_a = F.embedding( + input_, + lora.lora_a, + ) + result += (after_a @ lora.lora_b) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_embedding.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs) * len(inputs[0]), ), + 0, + device=device) + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + + lora_result = lora_embedding(torch.cat(inputs)) + expected_result = embedding(torch.cat(inputs)) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", DEVICES) 
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings_with_new_embeddings(dist_init, num_loras, device, + vocab_size, stage) -> None: + + torch.set_default_device(device) + max_loras = 8 + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def create_random_embedding_layer(): + embedding = VocabParallelEmbedding(vocab_size, 256) + embedding_data = torch.rand_like(embedding.weight.data) + embedding.weight.data = embedding_data + embedding.weight.data[vocab_size:, :] = 0 + expanded_embedding = VocabParallelEmbedding( + vocab_size + lora_config.lora_extra_vocab_size * max_loras, + 256, + org_num_embeddings=vocab_size) + expanded_embedding.weight.data[:vocab_size, :] = embedding_data + # We need to deepcopy the embedding as it will be modified + # in place + lora_embedding = VocabParallelEmbeddingWithLoRA( + deepcopy(expanded_embedding)) + lora_embedding.create_lora_weights(max_loras, lora_config) + + return expanded_embedding, lora_embedding + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + expanded_embedding, lora_embedding = create_random_embedding_layer() + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_embedding, + layer_weights=torch.zeros( + (256, vocab_size + lora_config.lora_extra_vocab_size)), + generate_embeddings_tensor=256, + ) + + lora_embedding.set_mapping(punica_wrapper) + # All embeddings tensors have the same shape. + embeddings_tensors = [ + lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) + ] + embeddings_tensor_len = embeddings_tensors[0].shape[0] + + # Add empty embeddings_tensors for unoccupied lora slots. + for _ in range(max_loras - len(embeddings_tensors)): + embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + device=device) + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + original_inputs = deepcopy(inputs) + + # Force some of the inputs to be in the extended embeddings range + # to guarantee that their behavior is tested. 
+ for input_, original_input_, lora_id in zip(inputs, original_inputs, + prompt_mapping): + embedding_id = lora_id - 1 + input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) + original_input_[-1] = vocab_size + input_[-2] = vocab_size + ( + (embedding_id + 1) * embeddings_tensor_len - 1) + original_input_[-2] = vocab_size + embeddings_tensor_len - 1 + + expanded_embedding.weight[vocab_size:vocab_size + + (embeddings_tensor_len * + max_loras)] = torch.cat(embeddings_tensors) + + lora_result = lora_embedding(torch.cat(original_inputs)) + + expected_results: List[torch.Tensor] = [] + for input_, original_input_, lora_id in zip(inputs, original_inputs, + prompt_mapping): + lora = lora_dict[lora_id] + result = expanded_embedding(input_) + after_a = F.embedding( + original_input_, + lora.lora_a, + ) + result += (after_a @ lora.lora_b) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_embedding.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs) * len(inputs[0]), ), + 0, + device=device) + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + original_inputs = deepcopy(inputs) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + lora_result = lora_embedding(torch.cat(original_inputs)) + expected_result = expanded_embedding(torch.cat(inputs)) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) +@pytest.mark.parametrize("stage", STAGES) +def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, + stage) -> None: + + if current_platform.is_cuda(): + torch.cuda.set_device(device) + torch.set_default_device(device) + max_loras = 8 + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def _pretest(): + linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size, + 1024, + vocab_size, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + linear.weight.data[:, vocab_size:] = 0 + logits_processor = LogitsProcessor( + vocab_size + lora_config.lora_extra_vocab_size, vocab_size) + lora_logits_processor = LogitsProcessorWithLoRA( + logits_processor, 1024, linear.weight.dtype, linear.weight.device, + None) + lora_logits_processor.create_lora_weights(max_loras, lora_config) + + return linear, logits_processor, lora_logits_processor + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, logits_processor, lora_logits_processor = 
_pretest() + lora_logits_processor.set_mapping(punica_wrapper) + # NOTE: all the generated loras share the same embeddings tensor. + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_logits_processor, + layer_weights=linear.weight, + generate_embeddings_tensor=1024, + ) + if current_platform.is_hpu(): + htcore.mark_step() + embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor + embeddings_tensor_len = embeddings_tensor.shape[0] + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=8 * num_loras, # * 3, + input_size=(1, 1024), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) + input_ = torch.rand(20, 1024) + + lora_result = lora_logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=linear, + embedding_bias=None) + + original_lm_head = deepcopy(linear) + + linear.weight[logits_processor. + org_vocab_size:logits_processor.org_vocab_size + + embeddings_tensor_len] = embeddings_tensor + + logits_processor.org_vocab_size = (vocab_size + + lora_config.lora_extra_vocab_size) + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] + result = logits_processor._get_logits(hidden_states=input_, + lm_head=linear, + embedding_bias=None) + result[:, vocab_size + embeddings_tensor_len:] = float("-inf") + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result = torch.cat(expected_results) + logits_processor.org_vocab_size = vocab_size + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_logits_processor.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=8 * num_loras * 3, + input_size=(1, 1024), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs) * len(inputs[0]), ), + 0, + device=device) + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=original_lm_head, + embedding_bias=None)[:, :vocab_size] + expected_result = logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=original_lm_head, + embedding_bias=None) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) +def 
test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: + + if current_platform.is_hpu() and bias_enabled: + pytest.skip("Bias support in LoRA is not enabled in HPU yet.") + if current_platform.is_cuda(): + torch.cuda.set_device(device) + + torch.set_default_device(device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16, + bias_enabled=bias_enabled) + + def create_random_linear_replicated_layer(): + + linear = ReplicatedLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = ReplicatedLinearWithLoRA(linear) + + lora_linear.create_lora_weights(max_loras, lora_config) + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, lora_linear = create_random_linear_replicated_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + ) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + if current_platform.is_hpu(): + htcore.mark_step() + lora = lora_dict[lora_id] + result = linear(input_)[0] + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs), ), 0, device=device) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + 512, lora_config.lora_extra_vocab_size) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol =
TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("orientation", ["row", "column"]) +@pytest.mark.parametrize("fully_shard", [True, False]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, + device, stage, bias_enabled) -> None: + + if current_platform.is_cuda(): + torch.cuda.set_device(device) + + if current_platform.is_hpu(): + if fully_shard: + pytest.skip("Fully sharded LoRAs are not enabled in HPU yet") + if bias_enabled: + pytest.skip("Bias support in LoRA is not enabled in HPU yet.") + + torch.set_default_device(device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.bfloat16, + bias_enabled=bias_enabled) + + def create_random_linear_parallel_layer(): + if orientation == "row": + linear = RowParallelLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard + else RowParallelLinearWithShardedLoRA(linear)) + else: + linear = ColumnParallelLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (ColumnParallelLinearWithLoRA(linear) + if not fully_shard else + ColumnParallelLinearWithShardedLoRA(linear)) + lora_linear.create_lora_weights(max_loras, lora_config) + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, lora_linear = create_random_linear_parallel_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + ) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + if current_platform.is_hpu(): + htcore.mark_step() + lora = lora_dict[lora_id] + result = linear(input_)[0] + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result =
torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs), ), 0, device=device) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + 512, lora_config.lora_extra_vocab_size) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("repeats", [1, 2, 3]) +@pytest.mark.parametrize("fully_shard", [True, False]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, + device, stage, bias_enabled) -> None: + + if current_platform.is_cuda(): + torch.cuda.set_device(device) + + if current_platform.is_hpu(): + if fully_shard: + pytest.skip("Fully sharded LoRAs are not enabled in HPU yet") + if bias_enabled: + pytest.skip("Bias support in LoRA is not enabled in HPU yet.") + + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.bfloat16, + bias_enabled=bias_enabled) + + def create_column_parallel_packed_layer(): + if repeats == 2: + linear = MergedColumnParallelLinear(4096, [4096] * repeats, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (MergedColumnParallelLinearWithLoRA(linear) + if not fully_shard else + MergedColumnParallelLinearWithShardedLoRA(linear)) + elif repeats == 3: + linear = QKVParallelLinear(4096, + 64, + 32, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (MergedQKVParallelLinearWithLora(linear) + if not fully_shard else + MergedQKVParallelLinearWithShardedLora(linear)) + else: + linear = QKVParallelLinear(4096, + 64, + 32, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = QKVParallelLinearWithLora( + linear + ) if not fully_shard else QKVParallelLinearWithShardedLora(linear) + + @dataclass + class FakeConfig: + hidden_size = 4096 + num_key_value_heads = 32 + num_attention_heads = 32 + + n_slices = repeats + lora_linear.create_lora_weights(max_loras, + lora_config, + model_config=FakeConfig()) + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == n_slices) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert
lora_linear.lora_bias_stacked is None + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + + linear, lora_linear = create_column_parallel_packed_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, sublora_dict = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + repeats=repeats, + ) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + if current_platform.is_hpu(): + indices_list = [ + id_to_index.index(value) for value in index_mapping + ] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + if current_platform.is_hpu(): + htcore.mark_step() + result = linear(input_)[0] + subloras = sublora_dict[lora_id] + for i, sublora in enumerate(subloras): + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + device=device) + + if current_platform.is_hpu(): + indices = torch.full((len(inputs), ), 0, device=device) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@fork_new_process_for_each_test +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 8]) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0), + (6.0, 1.0)]) +@pytest.mark.parametrize("max_position", [11, 4096, 32768]) +@pytest.mark.parametrize("is_neox_style", [True, False]) +@pytest.mark.parametrize("rotary_dim", [None, 32]) +@pytest.mark.parametrize("head_size", [32, 108]) +@pytest.mark.parametrize("seq_len", [11, 1024]) +def _test_rotary_embedding_long_context(dist_init, num_loras, device, + scaling_factors, max_position, + is_neox_style, rotary_dim, head_size, + seq_len) -> None: + dtype = torch.bfloat16 + seed = 0 + current_platform.seed_everything(seed) + torch.set_default_device(device) + punica_wrapper = get_punica_wrapper(8192, 256, 
device) + assert check_punica_wrapper(punica_wrapper) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + long_lora_scaling_factors=scaling_factors, + lora_dtype=dtype) + + if rotary_dim is None: + rotary_dim = head_size + base = 10000 + batch_size = 5 * num_loras + num_heads = 7 + + # Verify lora is equivalent to linear scaling rotary embedding. + rope = get_rope(head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype=torch.bfloat16) + lora_rope = LinearScalingRotaryEmbeddingWithLora(rope) + lora_rope.set_mapping(punica_wrapper) + lora_rope.create_lora_weights(max_loras, lora_config) + linear_rope = get_rope(head_size, + rotary_dim, + max_position, + base, + is_neox_style, { + "rope_type": "linear", + "factor": scaling_factors + }, + dtype=torch.bfloat16) + #linear_rope = linear_rope.to(dtype=dtype) + id_to_index = get_random_id_to_index(num_loras, max_loras) + _, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=batch_size, + input_size=(1, max_position), + input_range=(0, lora_config.lora_extra_vocab_size), + input_type=torch.bfloat16, + device=device) + + lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + long_lora_context = LongContextLoRAContext(list(scaling_factors), + rotary_dim) + + next_expected_offset = 0 + # Make sure the offset is correct. + scaling_factor_to_offset = lora_rope.scaling_factor_to_offset + for scaling_factor, offset in scaling_factor_to_offset.items(): + assert offset == next_expected_offset + next_expected_offset += scaling_factor * max_position + + for i in range(len(scaling_factors)): + long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( + scaling_factors[i], 0) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + long_lora_context=long_lora_context, + ) + # lora_rope.set_mapping(*mapping_info) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + ref_q, ref_k = linear_rope(positions, query, key) + if current_platform.is_hpu(): + htcore.mark_step() + actual_q, actual_k = lora_rope(positions, query, key) + + torch.allclose(ref_q, actual_q) + torch.allclose(ref_k, actual_k) + + +@pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) +@pytest.mark.parametrize("seed", list(range(256))) +def test_vocab_parallel_embedding_indices(tp_size, seed): + random.seed(seed) + vocab_size = random.randint(4000, 64000) + added_vocab_size = random.randint(0, 1024) + org_vocab_size = vocab_size - added_vocab_size + last_org_vocab_end_index = 0 + last_added_vocab_end_index = org_vocab_size + computed_vocab_size = 0 + computed_org_vocab_size = 0 + computed_added_vocab_size = 0 + vocab_size_padded = -1 + + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] + + for tp_rank in range(tp_size): + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", + return_value=tp_rank + ), patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", + return_value=tp_size): + vocab_embedding = VocabParallelEmbedding( + vocab_size, 1, org_num_embeddings=org_vocab_size) + vocab_size_padded = vocab_embedding.num_embeddings_padded + shard_indices = vocab_embedding.shard_indices + # Assert that the ranges are contiguous + assert 
shard_indices.org_vocab_start_index == last_org_vocab_end_index + assert (shard_indices.added_vocab_start_index == + last_added_vocab_end_index) + + # Ensure that we are not exceeding the vocab size + computed_vocab_size += shard_indices.num_elements_padded + computed_org_vocab_size += shard_indices.num_org_elements + computed_added_vocab_size += shard_indices.num_added_elements + + # Ensure that the ranges are not overlapping + all_org_tokens.extend( + range(shard_indices.org_vocab_start_index, + shard_indices.org_vocab_end_index)) + all_added_tokens.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + + token_ids.extend( + range(shard_indices.org_vocab_start_index, + shard_indices.org_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_org_elements_padded - + shard_indices.num_org_elements)) + token_ids.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_added_elements_padded - + shard_indices.num_added_elements)) + + last_org_vocab_end_index = shard_indices.org_vocab_end_index + last_added_vocab_end_index = shard_indices.added_vocab_end_index + + assert computed_vocab_size == vocab_size_padded + assert computed_org_vocab_size == org_vocab_size + assert computed_added_vocab_size == added_vocab_size + + # Ensure that the ranges are not overlapping + assert len(all_org_tokens) == len(set(all_org_tokens)) + assert len(all_added_tokens) == len(set(all_added_tokens)) + assert not set(all_org_tokens).intersection(set(all_added_tokens)) + + token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) + reindex_mapping = vocab_embedding.get_sharded_to_full_mapping() + assert reindex_mapping is not None or tp_size == 1 + if reindex_mapping is not None: + reindexed_token_ids = token_ids_tensor[reindex_mapping] + expected = torch.tensor(list(range(0, vocab_size))) + assert reindexed_token_ids[:vocab_size].equal(expected) + assert torch.all(reindexed_token_ids[vocab_size:] == -1) + + +def test_get_masked_input_and_mask(): + x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + + # base tp 1 case, no padding + modified_x, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(x, modified_x) + + # tp 2 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])) + + # tp 4 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + 
added_vocab_end_index=11, + num_org_vocab_padding=0) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])) + + # base tp 1 case, with padding + modified_x, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x, + torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])) + + # tp 2 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])) + + # tp 4 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + added_vocab_end_index=11, + num_org_vocab_padding=2) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])) diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py new file mode 100644 index 0000000000000..1262d5166be7b --- /dev/null +++ b/tests/lora/test_llama_hpu.py @@ -0,0 +1,99 @@ +from multiprocessing import Process +from typing import List + +import vllm +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.lora.request import LoRARequest + +MODEL_PATH = "/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table 
schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /Ė©okiru/ [Ć²kĆ¬É½ÉÆĢ]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def _test_llama_lora(sql_lora_files, tp_size): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + dtype='float32', + tensor_parallel_size=tp_size) + + expected_no_lora_output = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /Ė§kot/ [kĆ²t]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /Ė§kot/ [kĆ²t]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 + ] + expected_lora_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /Ė©okiru/' [Ć²kĆ¬É½ÉÆĢ] AND accented_mora = 'low tone mora with a gloss of /Ė©okiru/' [Ć²kĆ¬É½ÉÆĢ] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 + ] + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output + + print("removing lora") + cleanup_dist_env_and_memory() + + +def test_llama_lora_1x(sql_lora_files): + p = 
Process(target=_test_llama_lora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_lora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_lora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 \ No newline at end of file diff --git a/tests/lora/test_long_context_hpu.py b/tests/lora/test_long_context_hpu.py new file mode 100644 index 0000000000000..d329ccbd3c446 --- /dev/null +++ b/tests/lora/test_long_context_hpu.py @@ -0,0 +1,304 @@ +import ast +from typing import List, Optional, Tuple + +import numpy as np +import pytest + +import vllm +from vllm import SamplingParams +from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora +from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.rotary_embedding import ( + LinearScalingRotaryEmbedding) + +from .data.long_context_test_data import prompts_and_responses + +context_len_to_scaling_factor = { + "16k": 4, + "32k": 8, +} + +# We use the same sampling params for all requests +sampling_params = SamplingParams( + temperature=0, + max_tokens=100, +) + + +def _create_lora_request(lora_id, long_context_infos): + context_len = long_context_infos[lora_id]["context_length"] + scaling_factor = context_len_to_scaling_factor[context_len] + return LoRARequest( + # There are 2 LoRAs for 16K, so we need to add lora_id to indicate + # they are different LoRAs. + context_len + str(lora_id), + lora_id, + long_context_infos[lora_id]["lora"], + None, + 4096 * scaling_factor, + ) + + +def evaluate_json_response(model_response, golden_response): + """Evaluates the model response against the golden response. + + Returns a score between 0 and 1, where 1 is a perfect match and 0 is no + match. The score quantifies how well the model is able to extract the + golden JSON from the long context. + """ + try: + model_response = ast.literal_eval(model_response) + except Exception as e: + raise ValueError( + f"Model response is not a valid JSON. Expected {golden_response}, " + f"got {model_response}") from e + + # Normally, we would flatten the dictionary and compare the values, but in + # this case, we know that the dictionary is only 2 levels deep + positive_values = 0 + total_values = 0 + # We look at all the attributes of the person that we are extracting a + # biography of and compare them to the golden response + for person_attribute, person_attribute_value in golden_response.items(): + if person_attribute in model_response: + if isinstance(person_attribute_value, dict): + for (sub_attribute, + sub_attribute_value) in person_attribute_value.items(): + total_values += 1 + if sub_attribute in model_response[ + person_attribute] and model_response[ + person_attribute][ + sub_attribute] == sub_attribute_value: + positive_values += 1 + else: + total_values += 1 + if model_response[person_attribute] == person_attribute_value: + positive_values += 1 + else: + # We count a missing sub-dict as a single missed value.
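+ # Illustrative example (not taken from the test data): for a golden
+ # response {"name": "A", "job": {"title": "B"}} and a model response
+ # {"name": "A"}, "name" scores 1/1 and the missing "job" dict counts
+ # as one missed value, so the score is 1/2 = 0.5.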
+ total_values += 1 + + # Return a score between 0 and 1 + return positive_values / total_values + + +def generate( + llm: vllm.LLM, + inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], +): + prompts, sampling_param, lora_request = inputs + outputs = llm.generate(prompts, sampling_param, lora_request=lora_request) + return outputs[0].outputs[0].text.strip() + + +def batched_generate( + llm: vllm.LLM, + inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], +): + for input in inputs: + prompt, sampling_param, lora_req = input + # Add requests to the engine and run the engine + llm._validate_and_add_requests(prompt, + sampling_param, + lora_request=lora_req, + prompt_adapter_request=None) + + outputs = llm._run_engine(use_tqdm=True) + return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))] + + +@pytest.fixture(scope="module") +def lora_llm(long_context_infos): + scaling_factors = [ + context_len_to_scaling_factor[info["context_length"]] + for info in long_context_infos.values() + ] + + llm = vllm.LLM( + "/mnt/weka/data/pytorch/llama2/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=1, + dtype="bfloat16", + # FIXME enable async output processor + disable_async_output_proc=True, + distributed_executor_backend="mp") + yield llm + del llm + + +def test_rotary_emb_replaced(dist_init): + """Verify rotary emb in all the layers is replaced""" + from vllm.engine.arg_utils import EngineArgs + from vllm.platforms import current_platform + if current_platform.is_hpu(): + from vllm.worker.hpu_model_runner import HPUModelRunner as ModelRunner + else: + from vllm.worker.model_runner import ModelRunner + engine_args = EngineArgs("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", + long_lora_scaling_factors=(4.0, ), + enable_lora=True) + engine_config = engine_args.create_engine_config() + model_runner = ModelRunner( + vllm_config=engine_config, + is_driver_worker=True, + ) + model_runner.load_model() + rotary_emb_count = 0 + model = model_runner.model.model if current_platform.is_hpu( + ) else model_runner.model + for module_name, module in model.named_modules(remove_duplicate=False): + if "rotary_emb" in module_name: + if "base_layer" not in module_name: + rotary_emb_count += 1 + assert isinstance(module, LinearScalingRotaryEmbeddingWithLora) + else: + assert isinstance(module, LinearScalingRotaryEmbedding) + # Llama 2 has 32 layers. + assert rotary_emb_count == 32 + + +@pytest.mark.skip_global_cleanup +def test_batched_rope_kernel(lora_llm, long_context_infos): + """We test the batched kernel by comparing the results of batched and + non-batched generation.
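+ With greedy sampling, both paths must produce identical generations for
+ every long-context LoRA.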
+ """ + # Create non batched results first to compare against batched results + non_batched_results: List[str] = [] + + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + lora_prompt = (prompts_and_responses[context_len][0]["prompt"], + sampling_params, + _create_lora_request(lora_id, long_context_infos)) + lora_output = generate(lora_llm, lora_prompt) + non_batched_results.append(lora_output) + + # Create batched results + # Each element of the batch must be + # (prompt, prompt_sampling_params, prompt_lora_request) + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + batched_results = batched_generate(lora_llm, batched_prompts) + + # Results should be the same + for non_batched, batched in zip(non_batched_results, batched_results): + assert non_batched == batched, ( + "Non batched and batched results should be the " + f"same:\n{batched}\n{non_batched}") + + +@pytest.mark.skip_global_cleanup +def test_self_consistency(lora_llm, long_context_infos): + """We test consistency of the batched kernel by permuting batched + inputs and comparing the results to the non-permuted batched results. + """ + num_loras = len(long_context_infos) + + # Create results in order of long_context_infos + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + + batched_results = batched_generate(lora_llm, batched_prompts) + + permutation = np.random.default_rng(seed=42).permutation(num_loras) + + # Create results in random order of permutation + batched_prompts = [] + for i in permutation: + lora_id, info = list(long_context_infos.items())[i] + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + + permutated_batched_results = batched_generate(lora_llm, batched_prompts) + + # Results should be the same + for i in range(num_loras): + assert batched_results[i] == permutated_batched_results[ + permutation[i]], ( + f"Results should be the same:\n{batched_results[i]}" + f"\n{permutated_batched_results[permutation[i]]}") + + +@pytest.mark.skip_global_cleanup +def test_quality(lora_llm, long_context_infos): + """We test the quality of the answers given by the LoRA model by + comparing the generated text to the merged model's outputs. + + This is effectively a mini-benchmark over four prompts. + If this test fails, this indicates that the quality of the LoRA model + is suboptimal compared to the merged model. For example, if the model + does not output valid dictionaries, this test will fail. + + If needed for testing, the merged versions of the models are available + as part of the `conftest`. + + The test is expected to run for about 1 minute on a p4de.24xlarge + instance. 
+ """ + scores: List[float] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + for prompt_and_response in prompts_and_responses[context_len]: + lora_prompt = (prompt_and_response["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + response = generate(lora_llm, lora_prompt) + golden_answer = prompt_and_response["golden_answer"] + score = evaluate_json_response(response, golden_answer) + scores.append(score) + assert score > 0.3, ("Quality of the answer is not good enough. " + f"Expected {golden_answer}, got {response}") + assert np.mean(scores) > 0.5 + + +@pytest.mark.skip_global_cleanup +def test_max_len(lora_llm, long_context_infos): + """Test that we raise an ValueError when the input of a given LoRA + model exceeds the maximum length.""" + # Since each LoRA model has a different maximum length, we need to + # test each one separately + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + lora_request = _create_lora_request(lora_id, long_context_infos) + # Good prompt should be fine + good_prompt = prompts_and_responses[context_len][0]["prompt"] + generate(lora_llm, (good_prompt, sampling_params, lora_request)) + # Bad prompt should raise an error + bad_prompt = good_prompt * 2 + with pytest.raises(ValueError): + generate(lora_llm, (bad_prompt, sampling_params, lora_request)) + + # Also test batched + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id_with_bad_inputs in long_context_infos: + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"] * + (2 if lora_id == lora_id_with_bad_inputs else 1), + sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + # Turn good prompt into bad prompt inside of batched prompts + + with pytest.raises(ValueError): + batched_generate(lora_llm, batched_prompts) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py new file mode 100644 index 0000000000000..f3a3d42dffb7c --- /dev/null +++ b/tests/lora/test_lora_hpu.py @@ -0,0 +1,270 @@ +import pytest +import torch +from vllm_hpu_extension.ops import LoraMask + +from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU + +from .utils import DummyLoRAManager + +TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] +QKV_TENSOR_SIZES = [ + (8192, 1024, 1024), + (8192 // 8, 1024 // 8, 1024 // 8), + (4096, 4096, 4096), + (4096 // 2, 4096 // 2, 4096 // 2), +] +BATCH_SIZES = [8, 32, 256] +RANKS = [8] +DTYPES = [torch.bfloat16] +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), +} + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora(m, n, k, rank, dtype) -> None: + manager = DummyLoRAManager(device="hpu") + + 
module_name = "module" + weight = torch.rand([m, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name, weight, rank=rank) + lora = manager.get_module_lora(module_name) + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = input @ lora.lora_a @ lora.lora_b * lora.scaling + + lora_a_stack = [ + torch.zeros(8, + 1, + lora.lora_a.shape[1], + lora.lora_a.shape[0], + device="hpu", + dtype=dtype) + ] + lora_b_stack = [ + torch.zeros(8, + 1, + lora.lora_b.shape[1], + lora.lora_b.shape[0], + device="hpu", + dtype=dtype) + ] + for i in range(lora_a_stack[0].shape[0]): + lora_a_stack[0][i][0] = lora.lora_a.T + lora_b_stack[0][i][0] = (lora.lora_b * lora.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + indices = torch.randint(0, + lora_a_stack[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + punica_wrapper = PunicaWrapperHPU(4096, max_batches=256, device="hpu") + + lora_bias_stacked = None + output_slices = (lora_b_stack[0].shape[2], ) + punica_wrapper.add_lora_linear(output, input, lora_a_stack, lora_b_stack, + lora_bias_stacked, 1.0, output_slices) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_linear(output, input, lora_a_stack, lora_b_stack, + lora_bias_stacked, 1.0, output_slices) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: + if m % 2 != 0: + pytest.skip("m must be divisible by 2") + if m // 2 not in TENSOR_SIZES: + pytest.skip("m//2 must be in TENSOR_SIZES") + + manager = DummyLoRAManager(device="hpu") + + module_name = "module" + weight = torch.rand([m // 2, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "1", weight, rank=rank) + lora_1 = manager.get_module_lora(module_name + "1") + manager.init_random_lora(module_name + "2", weight, rank=rank) + lora_2 = manager.get_module_lora(module_name + "2") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, + input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(8, + 1, + lora_1.lora_a.shape[1], + lora_1.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(8, + 1, + lora_1.lora_b.shape[1], + lora_1.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(lora_a_stacks[0].shape[0]): + lora_a_stacks[0][i][0] = lora_1.lora_a.T + lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T + lora_a_stacks[1][i][0] = lora_2.lora_a.T + lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + lora_bias_stacked = None + punica_wrapper = PunicaWrapperHPU(4096, max_batches=256, device="hpu") + 
punica_wrapper.add_lora_linear(output, input, lora_a_stacks, lora_b_stacks, + lora_bias_stacked, 1.0, (m // 2, m // 2)) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_linear(output, input, lora_a_stacks, lora_b_stacks, + lora_bias_stacked, 1.0, (m // 2, m // 2)) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: + manager = DummyLoRAManager(device="hpu") + + module_name = "module" + weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype) + weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "q", weight_q, rank=rank) + lora_q = manager.get_module_lora(module_name + "q") + manager.init_random_lora(module_name + "k", weight_kv, rank=rank) + lora_k = manager.get_module_lora(module_name + "k") + manager.init_random_lora(module_name + "v", weight_kv, rank=rank) + lora_v = manager.get_module_lora(module_name + "v") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, + input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, + input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(8, + 1, + lora_q.lora_a.shape[1], + lora_q.lora_a.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(8, + 1, + lora_k.lora_a.shape[1], + lora_k.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(8, + 1, + lora_q.lora_b.shape[1], + lora_q.lora_b.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(8, + 1, + lora_k.lora_b.shape[1], + lora_k.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(lora_a_stacks[0].shape[0]): + lora_a_stacks[0][i][0] = lora_q.lora_a.T + lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T + lora_a_stacks[1][i][0] = lora_k.lora_a.T + lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T + lora_a_stacks[2][i][0] = lora_v.lora_a.T + lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T + + output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + lora_bias_stacked = None + punica_wrapper = PunicaWrapperHPU(4096, max_batches=256, device="hpu") + qkvs = (qkv[0], qkv[1], qkv[2]) + punica_wrapper.add_lora_linear(output, input, lora_a_stacks, lora_b_stacks, + lora_bias_stacked, 1.0, qkvs) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + qkvs = (qkv[0], qkv[1], qkv[2]) + punica_wrapper.add_lora_linear(output, input, lora_a_stacks, lora_b_stacks, + lora_bias_stacked, 1.0, qkvs) + assert torch.allclose(torch.zeros_like(output), output) + + 
manager.reset_lora() diff --git a/tests/lora/test_lora_manager_hpu.py b/tests/lora/test_lora_manager_hpu.py new file mode 100644 index 0000000000000..ede3b11e431f5 --- /dev/null +++ b/tests/lora/test_lora_manager_hpu.py @@ -0,0 +1,559 @@ +import os +from typing import Dict, List + +import habana_frameworks.torch # noqa: F401 +import pytest +import torch +from safetensors.torch import load_file +from torch import nn + +from vllm.config import LoRAConfig +from vllm.lora.layers import (ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + RowParallelLinearWithLoRA) +from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, + LRUCacheLoRAModelManager) +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, + WorkerLoRAManager) +from vllm.model_executor.layers.linear import RowParallelLinear + +EMBEDDING_MODULES = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", +} + +EMBEDDING_PADDING_MODULES = ["lm_head"] + + +def test_from_lora_tensors(sql_lora_files): + tensors = load_file( + os.path.join(sql_lora_files, "adapter_model.safetensors")) + new_embeddings = load_file( + os.path.join(sql_lora_files, "new_embeddings.safetensors")) + lora_model = LoRAModel.from_lora_tensors( + 1, + 8, + 16, + tensors, + torch.device("hpu"), + embeddings=new_embeddings, + embedding_modules=EMBEDDING_MODULES, + embedding_padding_modules=EMBEDDING_PADDING_MODULES) + for module_name, lora in lora_model.loras.items(): + assert lora.module_name == module_name + assert lora.rank == 8 + assert lora.lora_alpha == 16 + assert lora.lora_a is not None + assert lora.lora_b is not None + assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] + ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" + assert lora.lora_a.shape[1] == 8 + embeddings_module = next( + (k for k in EMBEDDING_MODULES if k in module_name), None) + if embeddings_module: + assert torch.equal( + lora.embeddings_tensor, + new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( + device=lora.embeddings_tensor.device)) + else: + assert lora.embeddings_tensor is None + + +def create_lora(lora_id: int, model: nn.Module, + sub_modules: List[str]) -> LoRAModel: + loras: Dict[str, LoRALayerWeights] = {} + for name in sub_modules: + w = model.get_submodule(name).weight + loras[name] = LoRALayerWeights( + name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0]], device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def create_packed_lora( + lora_id: int, + model: nn.Module, + module_name, + replaced_module_names, + empty_replaced_module_name=None, +) -> LoRAModel: + w = model.get_submodule(module_name).weight + loras: Dict[str, LoRALayerWeights] = {} + for replaced_module_name in replaced_module_names: + if replaced_module_name == empty_replaced_module_name: + continue + loras[replaced_module_name] = LoRALayerWeights( + replaced_module_name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0] // len(replaced_module_names)], + device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def test_replace_submodules(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "layer1.dense2"] + model.packed_modules_mapping = {} + manager = LoRAModelManager( + model, 1, 1, 1, + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), + torch.device("hpu")) + model = manager.model + + assert 
isinstance(model.get_submodule("dense1"), + ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("layer1.dense1"), + ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("dense2"), RowParallelLinear) + assert isinstance(model.get_submodule("layer1.dense2"), + RowParallelLinearWithLoRA) + + +def test_lora_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), + torch.device("hpu")) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + with pytest.raises(ValueError): + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] is None + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] is None + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + +def test_lora_lru_cache_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), + torch.device("hpu")) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.activate_adapter(3) + assert 
manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.add_adapter(model_lora2) + assert manager.deactivate_adapter(3) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_adapter(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_adapter(3) + assert manager.pin_adapter(1) + with pytest.raises(RuntimeError): + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_adapter(2) + + assert manager.deactivate_adapter(3) + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_adapter(3) + with pytest.raises(ValueError): + assert manager.pin_adapter(3) + + +def test_lru_lora_model_manager(dist_init, dummy_model): + # This tests just the LRU cache functionality, everything else is + # tested in test_lora_model_manager + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), + torch.device("hpu")) + + assert all(x is None for x in manager.lora_index_to_id) + + # Add up to capacity + assert manager.add_adapter(model_lora1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(1) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + # Add over capacity + assert manager.add_adapter(model_lora3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(3) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + # Add 3 again to move it to the top and then add 2 + # should return false since it's in already + assert not manager.add_adapter(model_lora3) + assert not manager.activate_adapter(3) + assert 
manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {3, 2} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + # Remove manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {2} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 2 + + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + assert not manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + # pinning + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + assert set(manager.list_adapters()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_adapter(1) + assert manager.pin_adapter(3) + # Remove manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_adapter(model_lora1) + assert manager.pin_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_adapter() + + assert set(manager.list_adapters()) == {1} + + +def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = LRUCacheWorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert 
worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 6 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + # Should remove every LoRA not specified in the request. 
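For readers comparing this test with the LRU-cache variant above, the behavioral difference being asserted is: `LRUCacheWorkerLoRAManager` keeps previously loaded adapters (up to `max_cpu_loras`) and only evicts the least recently used ones, while the plain `WorkerLoRAManager` below drops every adapter that is not named in the current request. A minimal sketch of that contract, reusing the fixture and constructor conventions from these tests (illustrative only, not part of the patch; the `active_after` helper is hypothetical):

```python
# Hypothetical helper, not part of the patch: drives either worker manager
# the same way the tests above do and returns the resulting adapter id set.
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest

def active_after(manager, ids, lora_path):
    mapping = LoRAMapping([], [])
    manager.set_active_adapters(
        [LoRARequest(str(i), i, lora_path) for i in ids], mapping)
    return manager.list_adapters()

# With max_loras=max_cpu_loras=4, as configured in these tests:
#   LRUCacheWorkerLoRAManager: active_after(m, [1, 2], sql_lora_files) then
#       active_after(m, [3], sql_lora_files) -> {1, 2, 3}
#       (previously loaded adapters stay cached until LRU eviction)
#   WorkerLoRAManager:         same two calls -> {3}
#       (adapters not named in the current request are removed)
```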
+ lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = WorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 7 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_packed_loras(dist_init, dummy_model_gate_up): + model = dummy_model_gate_up + model.supported_lora_modules = ["gate_up_proj"] + model.packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + model_lora = create_packed_lora( + 1, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"]) + model_lora1 = create_packed_lora( + 2, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"], + empty_replaced_module_name="gate_proj", + ) + + manager = LoRAModelManager( + model, 2, 2, 2, + 
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), + torch.device("hpu")) + model = manager.model + + assert isinstance(model.get_submodule("gate_up_proj"), + MergedColumnParallelLinearWithLoRA) + assert manager.add_adapter(model_lora) + assert manager.add_adapter(model_lora1) + + packed_lora = model_lora.get_lora("gate_up_proj") + assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) + + assert torch.allclose(packed_lora.lora_a[0], + model_lora.get_lora("gate_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[0], + model_lora.get_lora("gate_proj").lora_b) + assert torch.allclose(packed_lora.lora_a[1], + model_lora.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[1], + model_lora.get_lora("up_proj").lora_b) + + packed_lora1 = model_lora1.get_lora("gate_up_proj") + assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) + + assert packed_lora1.lora_a[0] is None + assert packed_lora1.lora_b[0] is None + assert torch.allclose(packed_lora1.lora_a[1], + model_lora1.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora1.lora_b[1], + model_lora1.get_lora("up_proj").lora_b) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py new file mode 100644 index 0000000000000..e2ae1051ade17 --- /dev/null +++ b/tests/lora/test_multilora_hpu.py @@ -0,0 +1,131 @@ +from multiprocessing import Process +from typing import List, Optional, Tuple + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + """Create a list of test prompts with their sampling parameters. + + 2 requests for base model, 4 requests for the LoRA. We define 2 + different LoRA adapters (using the same model for demo purposes). + """ + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + ("To be or not to be,", + SamplingParams(temperature=0.8, + top_k=5, + presence_penalty=0.2, + max_tokens=128), None), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora2", 2, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + result = {} + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + result[ + request_output.request_id] = request_output.outputs[0].text + return result + + +expected_output = [ + " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 + " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' " # noqa: E501 +] + + +def _test_llama_multilora(sql_lora_files, tp_size): + """Main function that sets up and runs the prompt processing.""" + engine_args = EngineArgs( + model="/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", + enable_lora=True, + max_loras=2, + max_lora_rank=8, + max_num_seqs=256, + dtype='float32', + tensor_parallel_size=tp_size) + engine = LLMEngine.from_engine_args(engine_args) + test_prompts = create_test_prompts(sql_lora_files) + results = process_requests(engine, test_prompts) + generated_texts = [results[key] for key in sorted(results)] + assert generated_texts == expected_output + + +def test_llama_multilora_1x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 diff --git a/tests/lora/utils.py b/tests/lora/utils.py index ce47546f2154b..a35aa47f1b4b8 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -60,8 +60,8 @@ def init_lora( module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), + lora_a=torch.rand([input_dim, rank], device=self._device), + lora_b=torch.rand([rank, output_dim], device=self._device), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 636a3eedff31b..84ed415c136cc 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -278,6 +278,28 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, ) +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("sizes", [ + [(512, 512), (512, 512), (512, 512)], +]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_hpu_models(hf_hpu_runner, vllm_runner, image_assets, model, sizes, + dtype, max_tokens, num_logprobs) -> None: + run_test( + hf_hpu_runner, + vllm_runner, + image_assets, + model, + sizes=sizes, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + @large_gpu_test(min_gb=48) @pytest.mark.core_model @pytest.mark.parametrize("model", models) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc85821..34cfb9c5bbd71 100644 --- 
a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -84,7 +84,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, dtype=torch.int64) rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) + rejection_sampler.init_tensors(device=device) output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access accepted, recovered_token_ids, @@ -133,7 +133,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, device: str, use_flashinfer: bool): torch.set_default_device(device) rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) + rejection_sampler.init_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) target_probs = torch.rand(batch_size, @@ -166,7 +166,7 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, use_flashinfer: bool): torch.set_default_device(device) rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) + rejection_sampler.init_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) target_probs = torch.rand(batch_size, @@ -302,7 +302,7 @@ def get_seeded_seqs(): for use_flashinfer in [True, False]: rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) + rejection_sampler.init_tensors(device=device) # We use seeded sequences to ensure the same tokens are accepted # for both flashinfer and nonflashinfer backends. seeded_seqs = get_seeded_seqs() @@ -333,7 +333,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer, strict_mode=True) - rejection_sampler.init_gpu_tensors(device=device) + rejection_sampler.init_tensors(device=device) draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) target_probs = torch.rand(batch_size, @@ -464,7 +464,7 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): self.vocab_size = vocab_size self.vocab_range = (0, vocab_size) - self.rejection_sampler.init_gpu_tensors(device=0) + self.rejection_sampler.init_tensors(device=0) # Keep test simple, use k=1 self.k = 1 diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 28c34064f670c..f6b7313863199 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -9,7 +9,7 @@ from transformers import GenerationConfig, GenerationMixin import vllm.envs as envs -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import ApplyToppTopkScalar, Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -754,3 +754,63 @@ def test_sampler_include_gpu_probs_tensor(device: str): assert sampler_output.sampled_token_probs is not None assert sampler_output.logprobs is not None assert sampler_output.sampled_token_ids is not None + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_topk_topk_scalar(): + obj1 = ApplyToppTopkScalar(2) + assert ApplyToppTopkScalar._padded_k == 0 + x = torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0], + [10, 10, 9, 9, 9, 8, 5, 5, 5]]) + + retval1 = obj1(x, 
p=0.9, k=5) + ninf = -float("inf") + expected1 = torch.tensor([[9., 9., 8., 8., 8., 8., ninf, ninf, ninf], + [10., 10., 9., 9., 9., ninf, ninf, ninf, ninf]]) + assert torch.all(retval1 == expected1).item() + assert ApplyToppTopkScalar._padded_k == 9 + + obj2 = ApplyToppTopkScalar(2) + assert obj2._padded_k == 9 + + x = torch.tensor([[2, 2, 9, 9, 2, 2, 1, 1, 1.0], + [10, 9, 9, 5, 9, 9, 5, 9, 10]]) + retval2 = obj2(x, p=0.9, k=5) + expected2 = torch.tensor( + [[ninf, ninf, 9., 9., ninf, ninf, ninf, ninf, ninf], + [10., ninf, 9., ninf, 9., 9., ninf, 9., 10.]]) + assert torch.all(retval2 == expected2).item() + assert obj2._padded_k == 9 + + retval3 = obj2(x, p=1.0, k=5) + expected3 = torch.tensor([[2., 2., 9., 9., 2., 2., ninf, ninf, ninf], + [10., 9., 9., ninf, 9., 9., ninf, 9., 10.]]) + + assert torch.all(retval3 == expected3).item() + + # this should not be done in general, doing it here for testing purposes + ApplyToppTopkScalar._padded_k = 0 + x = torch.tensor([[1, 1, 1, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + obj3 = ApplyToppTopkScalar(2) + retval4 = obj3(x, p=0.9, k=2) + expected4 = torch.tensor( + [[ninf, ninf, ninf, 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, ninf]]) + assert torch.all(retval4 == expected4).item() + assert obj3._padded_k == 4 + y = torch.tensor([[8, 8, 8, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + retval5 = obj3(y, p=0.9, k=2) + assert obj3._padded_k == 8 + expected5 = torch.tensor([[8., 8., 8., 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, + ninf]]) + assert torch.all(retval5 == expected5).item() + y = torch.tensor([[8, 8, 8, 9, 8, 8, 1, 1, 1.0], + [2, 1, 2, 2, 3, 1, 1, 1, 1]]) + retval6 = obj3(y, p=0.9, k=2) + expected6 = torch.tensor([[8., 8., 8., 9., 8., 8., ninf, ninf, ninf], + [2., ninf, 2., 2., 3., ninf, ninf, ninf, ninf]]) + assert torch.all(retval6 == expected6).item() + assert obj3._padded_k == 8 diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 4ddad66dce1fb..9d2b05f353dd2 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -77,7 +77,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, """ torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler() - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) target_with_bonus_probs = torch.rand(batch_size, k + 1, vocab_size, @@ -113,7 +113,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) target_with_bonus_probs = torch.rand(batch_size, k + 1, vocab_size, @@ -172,7 +172,7 @@ def test_uniform_target_distribution_accepts_all_tokens( vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) target_with_bonus_probs = torch.rand(batch_size, k + 1, vocab_size, @@ -222,7 +222,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str): torch.set_default_device(device) typical_acceptance_sampler = 
get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) # Simulate temperature 0 probability distribution for target probabilities # and create target probabilities such that only 1 token id has # probability 1.0 @@ -278,7 +278,7 @@ def test_mixed_target_distribution(seed: int, device: str): vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) # For sequences 0 and 2 set the distribution to a temperature # zero distribution. For sequences 1 and 3 set it to a uniform # distribution. @@ -341,7 +341,7 @@ def test_accept_tokens_partially(seed: int, device: str): vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) # Create a temperature zero target probability distribution and ensure # all draft token ids correspond to the tokens with 1.0 probability. # Verify that all of them are accepted. @@ -399,7 +399,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int, device: str): vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) # Simulate temperature 0 probability distribution for target # probabilities and create target probabilities such that only 1 token # id has probability 1.0 and others have a very low probability of @@ -430,7 +430,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int, device: str): # target distribution. Simulate and verify the same. 
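The recurring change in this and the neighbouring sampler/metrics tests is a rename from `init_gpu_tensors` to `init_tensors`, making the initializer device-agnostic so it also covers HPU. A short sketch of the renamed call sites as they appear in this diff (illustrative only; the mock setup is assumed to follow the tests above, and the device string is a placeholder):

```python
# Illustrative only. Mirrors the renamed initializers used in this diff:
# samplers take a device, the spec-decode metrics collector takes a rank.
import torch
from unittest.mock import MagicMock

from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.spec_decode.metrics import AsyncMetricsCollector

device = "hpu"  # placeholder; any torch device string is handled the same way

sampler = RejectionSampler(use_flashinfer=False)
sampler.init_tensors(device=device)   # previously init_gpu_tensors(device=...)

spec_decode_sampler = MagicMock(spec=RejectionSampler)
spec_decode_sampler.num_accepted_tokens = torch.tensor(0)  # assumed mock fields
spec_decode_sampler.num_emitted_tokens = torch.tensor(0)
spec_decode_sampler.num_draft_tokens = 0
collector = AsyncMetricsCollector(spec_decode_sampler)
collector.init_tensors(rank=0)        # previously init_gpu_tensors(rank=0)
```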
typical_acceptance_sampler = TypicalAcceptanceSampler( strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) output_token_ids = typical_acceptance_sampler( target_probs, bonus_token_ids, @@ -462,7 +462,7 @@ def test_get_recovered_token_ids(seed: int, device: str): vocab_size = 30_000 torch.set_default_device(device) typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) + typical_acceptance_sampler.init_tensors(device=device) target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) expected_replacement_tokens = torch.argmax(target_probs, dim=-1) actual_replacement_tokens = ( diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 7477486a3388d..59f9986f9c7d7 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -20,7 +20,7 @@ def test_initial_call_returns_none(): spec_decode_sampler.num_draft_tokens = 0 collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=0) + collector.init_tensors(rank=0) maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) assert maybe_metrics is None @@ -46,7 +46,7 @@ def test_second_call_returns_metrics(): collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, timer=timer, collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) + collector.init_tensors(rank=0) _ = collector.maybe_collect_rejsample_metrics(k=5) metrics = collector.maybe_collect_rejsample_metrics(k=5) assert metrics is not None @@ -66,7 +66,7 @@ def test_nonzero_rank_noop(rank): spec_decode_sampler.num_draft_tokens = 0 collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=rank) + collector.init_tensors(rank=rank) _ = collector.maybe_collect_rejsample_metrics(k=5) metrics = collector.maybe_collect_rejsample_metrics(k=5) assert metrics is None @@ -94,7 +94,7 @@ def test_noop_until_time(): collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, timer=timer, collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) + collector.init_tensors(rank=0) _ = collector.maybe_collect_rejsample_metrics(k=5) metrics = collector.maybe_collect_rejsample_metrics(k=5) @@ -133,7 +133,7 @@ def test_timer_is_reset(): collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, timer=timer, collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) + collector.init_tensors(rank=0) _ = collector.maybe_collect_rejsample_metrics(k=5) metrics = collector.maybe_collect_rejsample_metrics(k=5) @@ -183,7 +183,7 @@ def test_initial_metrics_has_correct_values(has_data: bool): collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, timer=timer, collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) + collector.init_tensors(rank=0) _ = collector.maybe_collect_rejsample_metrics(k) metrics = collector.maybe_collect_rejsample_metrics(k) diff --git a/vllm/__init__.py b/vllm/__init__.py index a533dba561c00..0d38a96ed8337 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,8 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +from vllm.utils import is_fake_hpu, migrate_to_cpu + +if is_fake_hpu(): + migrate_to_cpu() from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from 
vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 94a461e0c8c29..1893f98d8af77 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -2,13 +2,15 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -import os from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type import torch +import vllm_hpu_extension.kernels as kernels import vllm_hpu_extension.ops as ops -from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache +from vllm_hpu_extension.flags import enabled_flags +from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) @@ -52,16 +54,16 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dsts: torch.Tensor, ) -> None: - HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + src_to_dsts: torch.Tensor, ) -> None: - HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) + HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts) @dataclass @@ -72,6 +74,19 @@ class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): is_prompt: bool attn_bias: Optional[torch.Tensor] seq_lens_tensor: Optional[torch.Tensor] + context_lens_tensor: Optional[torch.Tensor] + seq_lens: Optional[List[int]] = None + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + cross_block_indices: Optional[torch.Tensor] = None + cross_block_offsets: Optional[torch.Tensor] = None + cross_block_list: Optional[torch.Tensor] = None + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_mapping: Optional[torch.Tensor] = None + cross_block_groups: Optional[torch.Tensor] = None + cross_block_scales: Optional[torch.Tensor] = None + cross_block_usage: Optional[torch.Tensor] = None + cross_attn_bias: Optional[torch.Tensor] = None class HPUAttentionImpl(AttentionImpl, torch.nn.Module): @@ -114,14 +129,11 @@ def __init__( self.matmul_av = Matmul() self.batch2block_matmul = Matmul() self.block2batch_matmul = Matmul() - # NOTE(kzawora): Contiguous PA is off until model runner supports it self.k_cache = VLLMKVCache() - self.k_cache.use_contiguous_pa = False self.v_cache = VLLMKVCache() - self.v_cache.use_contiguous_pa = False - # NOTE(kzawora): Pipelined PA is off until model runner supports it - ops.pa_impl = ops.pa - + HPUFusedSDPA = kernels.fsdpa() + self.fused_scaled_dot_product_attention = None if HPUFusedSDPA is None \ + else ModuleFusedSDPA(HPUFusedSDPA) self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.alibi_slopes = alibi_slopes @@ -132,9 +144,8 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] - if self.prefill_usefusedsdpa: + self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() + if self.prefill_use_fusedsdpa: assert alibi_slopes is None, \ 
'Prefill with FusedSDPA not supported with alibi slopes!' @@ -144,10 +155,11 @@ def __init__( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " + self.attn_type = attn_type + if (self.attn_type != AttentionType.DECODER + and self.attn_type != AttentionType.ENCODER_DECODER): + raise NotImplementedError("Encoder self-attention " + "is not implemented for " "HPUAttentionImpl") def forward( @@ -172,6 +184,17 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + if self.attn_type == AttentionType.ENCODER_DECODER: + return self.forward_encoder_decoder( + query=query, + key=key, + value=value, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + k_scale=k_scale, + v_scale=v_scale, + ) + batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape @@ -183,7 +206,7 @@ def forward( if attn_metadata.is_prompt: key = key.unflatten(0, (block_indices.size(0), -1)) value = value.unflatten(0, (block_indices.size(0), -1)) - if kv_cache is not None: + if kv_cache is not None and isinstance(kv_cache, tuple): key_cache, value_cache = HPUPagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -197,24 +220,147 @@ def forward( if attn_metadata.is_prompt: # Prompt run. - if not self.prefill_usefusedsdpa: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ - 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None: - position_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, - attn_bias.dtype, - attn_bias.shape[-1]) - attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) - attn_bias.add_(position_bias) - else: - attn_bias = None - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + if attn_metadata is None or attn_metadata.block_list is None: + if not self.prefill_use_fusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward' + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, + attn_bias.dtype, attn_bias.shape[-1]) + attn_bias = attn_bias.tile( + (1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) + else: + attn_bias = None + + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + valid_seq_lengths=attn_metadata.seq_lens_tensor, + fsdpa_op=self.fused_scaled_dot_product_attention, + ) + else: + # TODO: enable FusedSDPA + out = HPUPagedAttention.forward_prefix( + query=query.view(query_shape), + key=key.view(kv_shape), + value=value.view(kv_shape), + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + attn_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + softmax_op=self.softmax, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + 
# Decoding run. + output = HPUPagedAttention.forward_decode( + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + block_scales=attn_metadata.block_scales, + block_groups=attn_metadata.block_groups, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + batch2block_matmul_op=self.batch2block_matmul, + block2batch_matmul_op=self.block2batch_matmul, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) + + def forward_encoder_decoder( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: HPUAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + batch_size, hidden_size = query.shape + + if attn_metadata.is_prompt: + batch_size = attn_metadata.num_prefills + batched_tokens, _ = query.shape + batched_kv_tokens, _, _ = key.shape + assert batch_size > 0, ( + "In prefill stage the num_prefills should be > 0") + assert batched_tokens % batch_size == 0 + assert batched_kv_tokens % batch_size == 0 + seq_len = batched_tokens // batch_size + + query = query.view(-1, self.num_heads, self.head_size) + if key is not None: + assert value is not None + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + else: + assert value is None + + block_indices = attn_metadata.cross_block_indices + block_offsets = attn_metadata.cross_block_offsets + if kv_cache is not None and isinstance(kv_cache, tuple): + key_cache, value_cache = HPUPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + if (key is not None) and (value is not None): + # During cross-attention decode, key & value will be None, + # we don't need to cache them. + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) + + if attn_metadata.is_prompt: + # Prompt run. + batch_size = attn_metadata.num_prefills + + query_shape = (batch_size, -1, self.num_heads, self.head_size) + kv_shape = (batch_size, -1, self.num_kv_heads, self.head_size) + # Just a workaround, to make ops.prompt_attention go into the + # torch ops assembly path. + # TODO: add new prompt_attention op in vllm_hpu_extension + # which calls FusedSDPA with causal = False. 
+ attn_bias = torch.zeros((batch_size, 1, 1, 1), + device=query.device, + dtype=torch.bool) out = ops.prompt_attention( query.view(query_shape), key.view(kv_shape), @@ -228,16 +374,23 @@ def forward( ) output = out.reshape(batch_size, seq_len, hidden_size) else: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + block_list = attn_metadata.cross_block_list + block_mapping = attn_metadata.cross_block_mapping + block_scales = attn_metadata.cross_block_scales + block_groups = attn_metadata.cross_block_groups + attn_bias = attn_metadata.cross_attn_bias # Decoding run. output = HPUPagedAttention.forward_decode( query=query, key_cache=key_cache, value_cache=value_cache, - block_list=attn_metadata.block_list, - block_mapping=attn_metadata.block_mapping, - block_bias=attn_metadata.attn_bias, - block_scales=attn_metadata.block_scales, - block_groups=None, + block_list=block_list, + block_mapping=block_mapping, + block_bias=attn_bias, + block_scales=block_scales, + block_groups=block_groups, scale=self.scale, matmul_qk_op=self.matmul_qk, matmul_av_op=self.matmul_av, @@ -246,7 +399,7 @@ def forward( keys_fetch_func=self.k_cache.fetch_from_cache, values_fetch_func=self.v_cache.fetch_from_cache) # Reshape the output tensor. - return output.view(batch_size, seq_len, hidden_size) + return output.view(batch_size, -1, hidden_size) def _make_alibi_bias( diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 4c0fb2a628361..e55a4de11fd6c 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -3,7 +3,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple import torch from vllm_hpu_extension import cache_ops, ops @@ -21,6 +21,7 @@ class HPUPagedAttentionMetadata: block_indices: Optional[torch.Tensor] block_offsets: Optional[torch.Tensor] block_scales: Optional[torch.Tensor] + block_groups: Optional[torch.Tensor] class HPUPagedAttention: @@ -62,42 +63,28 @@ def forward_decode(**kwargs) -> torch.Tensor: return ops.flat_pa(**kwargs) @staticmethod - def forward_prefix( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - subquery_start_loc: torch.Tensor, - seq_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, - max_query_len: int, - alibi_slopes: Optional[torch.Tensor], - sliding_window: Optional[int], - ) -> torch.Tensor: - raise NotImplementedError( - "forward_prefix is not implemented for HPUPagedAttention") + def forward_prefix(**kwargs) -> torch.Tensor: + return ops.prompt_attention_with_context(**kwargs) @staticmethod def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_kv_cache: Tuple[torch.Tensor, torch.Tensor], + dst_kv_cache: Tuple[torch.Tensor, torch.Tensor], + src_to_dsts: torch.Tensor, ) -> None: src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts) src_value_cache = src_kv_cache[1] dst_value_cache = dst_kv_cache[1] - cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts) @staticmethod def copy_blocks( - 
kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + src_to_dsts: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] - cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) + cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) diff --git a/vllm/config.py b/vllm/config.py index ac5a4c91b1738..29cc6887177ed 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -608,8 +608,9 @@ def _verify_cuda_graph(self) -> None: self.max_model_len) MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama'] + from vllm.platforms import current_platform if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH - and not self.enforce_eager): + and not self.enforce_eager and not current_platform.is_hpu()): logger.warning( "CUDA graph is not supported for %s yet, fallback to the eager " "mode.", self.hf_config.model_type) @@ -1027,12 +1028,13 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"): logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor") + "scaling factor. " + "Intel Gaudi (HPU) supports fp8 (using fp8_inc).") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -1172,10 +1174,13 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. + device: Device to which model weights will be loaded, default to + device_config.device """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None + device: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None @@ -1285,7 +1290,7 @@ def __post_init__(self) -> None: raise ValueError(f"worker-use-ray can't be used with " f"distributed executor backend " f"'{self.distributed_executor_backend}'.") - ray_only_devices = ["tpu", "hpu"] + ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices and self.world_size > 1): @@ -1426,6 +1431,15 @@ class SchedulerConfig: chunked_prefill_enabled: bool = field(init=False) + # Maximum number of prefill sequences to be + # processed in a single iteration. Used only with padding-aware + # scheduling. + max_num_prefill_seqs: Optional[int] = None + + # If True, scheduler will consider padded + # tokens in prefill. + use_padding_aware_scheduling: bool = False + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -1514,6 +1528,13 @@ def _verify_args(self) -> None: "num_scheduler_steps " f"({self.num_scheduler_steps}) must be greater than or " "equal to 1.") + if self.max_num_prefill_seqs is not None \ + and not self.use_padding_aware_scheduling: + raise ValueError("max_num_prefill_seqs can be only " + "used with padding-aware-scheduling. 
") + if self.use_padding_aware_scheduling and self.chunked_prefill_enabled: + raise ValueError("Padding-aware scheduling currently " + "does not work with chunked prefill ") @property def is_multi_step(self) -> bool: diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 9b94918ab38ef..695870742da50 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,5 @@ -from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple +import heapq +from typing import FrozenSet, Iterable, List, Optional, Tuple from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -36,7 +36,9 @@ def __init__( if block_ids is None: block_ids = range(num_blocks) - self._free_block_indices: Deque[BlockId] = deque(block_ids) + self._free_block_indices: List[ + BlockId] = block_ids[:] # type: ignore[index] + heapq.heapify(self._free_block_indices) self._all_block_indices = frozenset(block_ids) assert len(self._all_block_indices) == num_blocks @@ -132,7 +134,7 @@ def _allocate_block_id(self) -> BlockId: if not self._free_block_indices: raise BlockAllocator.NoFreeBlocksError() - block_id = self._free_block_indices.popleft() + block_id = heapq.heappop(self._free_block_indices) self._refcounter.incr(block_id) return block_id @@ -142,7 +144,7 @@ def _free_block_id(self, block: Block) -> None: refcount = self._refcounter.decr(block_id) if refcount == 0: - self._free_block_indices.appendleft(block_id) + heapq.heappush(self._free_block_indices, block_id) block.block_id = None diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b3d396f9cedda..200098e3828da 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -12,6 +12,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceGroupMetadataDelta, @@ -117,6 +118,94 @@ def num_cached_tokens(self): return self._num_cached_tokens +@dataclass +class PaddingAwareSchedulingBudget(SchedulingBudget): + max_num_prefill_seqs: Optional[int] = None + _prefill_request_ids_max_seq_lens: Dict[str, + int] = field(default_factory=dict) + _max_seq_len: int = 0 + _num_curr_prefill_seqs: int = 0 + + def _generic_padding_fn(self, batch_size, max_seq_len) -> int: + return batch_size * max_seq_len + + def _hpu_padding_fn(self, batch_size, max_seq_len): + from vllm_hpu_extension.bucketing import (HPUBucketingGlobalState, + find_bucket) + padded_bs = batch_size + padded_seq = max_seq_len + + hpu_bucketing_global_state = HPUBucketingGlobalState() + + bs_cfg = hpu_bucketing_global_state.prompt_bs_bucket_cfg + if bs_cfg is not None: + padded_bs = find_bucket(batch_size, bs_cfg) + else: + logger.warning( + "prompt_bs_bucket_cfg was not set! Using unpadded batch size.") + seq_cfg = hpu_bucketing_global_state.prompt_seq_bucket_cfg + if seq_cfg is not None: + padded_seq = find_bucket(max_seq_len, seq_cfg) + else: + logger.warning("prompt_seq_bucket_cfg was not set! 
" + "Using unpadded sequence length.") + return padded_bs * padded_seq + + def _padding_fn_selector(self): + if current_platform.is_hpu(): + return self._hpu_padding_fn + return self._generic_padding_fn + + def _maybe_update_max_seq_len(self, + new_seq_max_seq_len: Optional[int] = None): + if new_seq_max_seq_len is not None \ + and new_seq_max_seq_len > self._max_seq_len: + self._max_seq_len = new_seq_max_seq_len + return + self._max_seq_len = max( + self._prefill_request_ids_max_seq_lens.values()) + + def add_prefill_seqs(self, req_id, num_curr_prefill_seqs, max_seq_len): + self._prefill_request_ids_max_seq_lens[req_id] = max_seq_len + self._num_curr_prefill_seqs += num_curr_prefill_seqs + self._maybe_update_max_seq_len(max_seq_len) + + def subtract_prefill_seqs(self, req_id, num_curr_prefill_seqs): + if req_id in self._prefill_request_ids_max_seq_lens: + popped_seq_len = self._prefill_request_ids_max_seq_lens.pop(req_id) + self._num_curr_prefill_seqs -= num_curr_prefill_seqs + if popped_seq_len == self._max_seq_len: + self._maybe_update_max_seq_len() + + def can_schedule(self, + *args, + num_new_tokens: int, + num_new_seqs: int, + is_prefill: bool = False, + max_seq_len: int = 0): + can_parent_schedule = super().can_schedule( + *args, num_new_tokens=num_new_tokens, num_new_seqs=num_new_seqs) + if not can_parent_schedule or not is_prefill: + return can_parent_schedule + new_batch_size = self._num_curr_prefill_seqs + num_new_seqs + new_max_seq_len = max(max(self._max_seq_len, max_seq_len), 1) + padding_fn = self._padding_fn_selector() + num_new_padded_tokens = padding_fn(new_batch_size, new_max_seq_len) + result = num_new_padded_tokens <= self.token_budget + if self.max_num_prefill_seqs is not None and result: + result = self._num_curr_prefill_seqs + num_new_seqs \ + <= self.max_num_prefill_seqs + return result + + @property + def max_seq_len(self): + return self._max_seq_len + + @property + def num_curr_prefill_seqs(self): + return self._num_curr_prefill_seqs + + @dataclass class ScheduledSequenceGroup: # A sequence group that's scheduled. @@ -993,10 +1082,18 @@ def _schedule_prefills( break num_new_seqs = seq_group.get_max_num_running_seqs() - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): + max_prefill_seq_len = None + can_schedule_kwargs = { + 'num_new_tokens': num_new_tokens_uncached, + 'num_new_seqs': num_new_seqs + } + if self.scheduler_config.use_padding_aware_scheduling: + max_prefill_seq_len = max( + [seq.get_num_new_tokens() for seq in seq_group.get_seqs()]) + can_schedule_kwargs['is_prefill'] = True + can_schedule_kwargs['max_seq_len'] = max_prefill_seq_len + if (num_new_tokens_uncached == 0 + or not budget.can_schedule(**can_schedule_kwargs)): break # Can schedule this request. @@ -1031,6 +1128,10 @@ def _schedule_prefills( num_cached_tokens=num_new_tokens_cached, ) budget.add_num_seqs(seq_group.request_id, num_new_seqs) + if self.scheduler_config.use_padding_aware_scheduling: + assert isinstance(budget, PaddingAwareSchedulingBudget) + budget.add_prefill_seqs(seq_group.request_id, num_new_seqs, + max_prefill_seq_len) # Queue requests that couldn't be scheduled. waiting_queue.extendleft(leftover_waiting_sequences) @@ -1052,10 +1153,18 @@ def _schedule_default(self) -> SchedulerOutputs: be swapped or preempted. """ # Include running requests to the budget. 
- budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) + budget: SchedulingBudget + if self.scheduler_config.use_padding_aware_scheduling: + budget = PaddingAwareSchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs + ) + else: + budget = SchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + ) # Make sure we include num running seqs before scheduling prefill, # so that we don't schedule beyond max_num_seqs for prefill. for seq_group in self.running: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4f4c9558d056..09f89242cbdec 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,6 +18,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -95,6 +96,7 @@ class EngineArgs: allowed_local_media_path: str = "" download_dir: Optional[str] = None load_format: str = 'auto' + weights_load_device: Optional[str] = None config_format: ConfigFormat = ConfigFormat.AUTO dtype: str = 'auto' kv_cache_dtype: str = 'auto' @@ -115,11 +117,13 @@ class EngineArgs: enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True + use_padding_aware_scheduling: bool = current_platform.is_hpu() swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None max_num_seqs: Optional[int] = None + max_num_prefill_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -319,6 +323,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'Model Streamer \n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') + parser.add_argument("--weights-load-device", + type=str, + default=EngineArgs.weights_load_device, + choices=DEVICE_OPTIONS, + help=('Device to which model weights ' + 'will be loaded.')) parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -344,11 +354,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3). ' + 'Intel Gaudi (HPU) supports fp8 (using fp8_inc).') parser.add_argument( '--quantization-param-path', type=nullable_str, @@ -453,6 +464,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'block manager v2) is now the default. 
' 'Setting this flag to True or False' ' has no effect on vLLM behavior.') + parser.add_argument( + '--use-padding-aware-scheduling', + default=EngineArgs.use_padding_aware_scheduling, + action='store_true', + help=('Use padding-aware scheduling. If True, the scheduler ' + 'will consider padded tokens in prefill. ' + 'By default this is set to False on non-HPU devices. ')) parser.add_argument( '--num-lookahead-slots', type=int, @@ -512,6 +530,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_num_seqs, help='Maximum number of sequences per iteration.') + parser.add_argument( + '--max-num-prefill-seqs', + type=int, + default=EngineArgs.max_num_prefill_seqs, + help=('Maximum number of prefill sequences per ' + 'iteration. Can be used only with padding-aware ' + 'scheduling. Must be <= max_num_seqs.')) parser.add_argument( '--max-logprobs', type=int, @@ -1005,6 +1030,7 @@ def create_load_config(self) -> LoadConfig: return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, + device=self.weights_load_device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) @@ -1186,6 +1212,7 @@ def create_engine_config(self, runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, + max_num_prefill_seqs=self.max_num_prefill_seqs, max_model_len=model_config.max_model_len, num_lookahead_slots=num_lookahead_slots, delay_factor=self.scheduler_delay_factor, @@ -1196,7 +1223,8 @@ def create_engine_config(self, multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), - policy=self.scheduling_policy) + policy=self.scheduling_policy, + use_padding_aware_scheduling=self.use_padding_aware_scheduling) lora_config = LoRAConfig( bias_enabled=self.enable_lora_bias, max_lora_rank=self.max_lora_rank, diff --git a/vllm/executor/multiproc_hpu_executor.py b/vllm/executor/multiproc_hpu_executor.py new file mode 100644 index 0000000000000..a82fff956738f --- /dev/null +++ b/vllm/executor/multiproc_hpu_executor.py @@ -0,0 +1,57 @@ +from typing import Callable, Optional, Tuple, Type + +import habana_frameworks.torch # noqa: F401 +import torch + +from vllm.executor.multiproc_gpu_executor import ( + MultiprocessingGPUExecutor, MultiprocessingGPUExecutorAsync) +from vllm.logger import init_logger +from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase + +logger = init_logger(__name__) + + +class MultiprocessingHPUExecutor(MultiprocessingGPUExecutor): + """Python multiprocessing-based multi-HPU executor""" + + def _get_worker_module_and_class( + self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: + worker_class_fn = None + if self.scheduler_config.is_multi_step: + module_name = "vllm.worker.multi_step_hpu_worker" + class_name = "MultiStepHPUWorker" + elif self.speculative_config is not None: + module_name = "vllm.spec_decode.spec_decode_worker" + class_name = "create_spec_worker" + else: + module_name = "vllm.worker.hpu_worker" + class_name = "HPUWorker" + return (module_name, class_name, worker_class_fn) + + def _check_executor_parameters(self): + world_size = self.parallel_config.world_size + tensor_parallel_size = self.parallel_config.tensor_parallel_size + + hpu_device_count = torch.hpu.device_count() + assert tensor_parallel_size <= hpu_device_count, ( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + 
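# Hypothetical usage sketch (not from the patch) combining the new HPU-oriented
# engine arguments introduced above; the model name and values are placeholders.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="my-org/my-model",                # placeholder model identifier
    kv_cache_dtype="fp8_inc",               # Intel Gaudi fp8 KV cache via INC
    weights_load_device="cpu",              # stage weights on CPU before HPU
    use_padding_aware_scheduling=True,      # budget padded prefill tokens
    max_num_prefill_seqs=4,                 # cap prefill batch size per step
)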
f"to less than max local hpu count ({hpu_device_count})") + + assert world_size <= hpu_device_count, ( + f"please ensure that world_size ({world_size}) " + f"is less than than max local hpu count ({hpu_device_count})") + + def shutdown_inc(self): + self._run_workers("shutdown_inc") + + def __del__(self): + self.shutdown() + + +class MultiprocessingHPUExecutorAsync(MultiprocessingHPUExecutor, + MultiprocessingGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_model = make_async(self.driver_worker.execute_model) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 539b6ae2d3572..302a509ccdfb9 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -12,8 +12,10 @@ import torch +from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import _check_multiproc_method, get_mp_context, run_method @@ -285,6 +287,22 @@ def set_multiprocessing_worker_envs(parallel_config): _check_multiproc_method() + if (current_platform.is_hpu() + and parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might " + "cause application hangs on exit. Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might " + "cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + # Configure thread parallelism if OMP_NUM_THREADS isn't set # # Helps to avoid CPU contention. The default of spawning a thread per diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b74b2f..c39926110c375 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1080,7 +1080,7 @@ def _get_logits( posinf=float("inf"), neginf=float("-inf"))) - # HPU needs special handling to prune out dummy samples. 
+ # HPU needs special handling to prune out dummy samples if current_platform.is_hpu(): lora_logits = lora_logits[:logits.shape[0], :] diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 7360c8c09e3ac..4504e19b20816 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -2,6 +2,8 @@ import torch +from vllm.platforms import current_platform + if TYPE_CHECKING: # avoid circuit import from vllm.lora.layers import LoRAMapping @@ -86,10 +88,14 @@ def convert_mapping( embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None + if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=device, - dtype=torch.long) + if current_platform.is_hpu(): + long_lora_offsets_list: List[int] = [] + else: + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device=device, + dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping @@ -102,10 +108,18 @@ def convert_mapping( embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 lora_indices[i] = lora_idx if long_lora_context: - assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset + if current_platform.is_hpu(): + long_lora_offsets_list.append(lora_offset) + else: + assert long_lora_offsets is not None + long_lora_offsets[i] = lora_offset + + if long_lora_context and current_platform.is_hpu(): + long_lora_offsets = torch.tensor(long_lora_offsets_list, + device=device, + dtype=torch.long) indices_list: List[Union[List[int], torch.Tensor]] = [ index_mapping_indices, diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e4eb3f16e56cf..62f40355cabed 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -15,11 +15,11 @@ # limitations under the License. 
import copy import json +import math from collections import defaultdict from functools import lru_cache -from typing import Callable, DefaultDict, Dict, List, Union +from typing import Any, Callable, DefaultDict, Dict, List, Union -import numpy as np import torch from outlines import grammars from outlines.caching import cache @@ -31,6 +31,22 @@ from transformers import PreTrainedTokenizerBase +# Unfortunately we cannot use lru_cache as it breaks pickling +# so we use a simpler implementation +def _cached(fn): + cache: Dict[Any, Any] = {} + + def cached_fn(*args): + if args in cache: + result = cache[args] + else: + result = fn(*args) + cache[args] = result + return result + + return cached_fn + + class BaseLogitsProcessor: def __init__(self, guide: Guide): @@ -38,6 +54,27 @@ def __init__(self, guide: Guide): # CFGState is used for the FSM state for CFGGuide self._fsm_state: DefaultDict[int, Union[int, CFGState]] = defaultdict(int) + self._cached_get_mask_tensor = _cached(self._get_mask_tensor) + + @staticmethod + @lru_cache(maxsize=128) + def _create_mask_tensor(allowed_tokens, vocab_size, device): + mask = torch.full((vocab_size, ), -math.inf, device=device) + mask[list(allowed_tokens)] = 0 + return mask + + def _get_mask_tensor(self, state_id, vocab_size, device): + instruction = self._guide.get_next_instruction(state=state_id) + if type(instruction) == Generate: # noqa: E721 + allowed_tokens = instruction.tokens + elif type(instruction) == Write: # noqa: E721 + # TODO: support fast forward tokens + allowed_tokens = [instruction.tokens[0]] + else: + raise TypeError( + f"Unsupported instruction type {type(instruction)}") + return BaseLogitsProcessor._create_mask_tensor(tuple(allowed_tokens), + vocab_size, device) def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor: @@ -65,30 +102,9 @@ def __call__(self, input_ids: List[int], self._fsm_state[seq_id] = CFGState( parser_state=self._guide.parser.parse(""), prev_token=None) - instruction = self._guide.get_next_instruction( - state=self._fsm_state[seq_id]) - - if type(instruction) == Generate: # noqa: E721 - allowed_tokens = instruction.tokens - elif type(instruction) == Write: # noqa: E721 - # TODO: support fast forward tokens - allowed_tokens = [instruction.tokens[0]] - else: - raise TypeError( - f"Unsupported instruction type {type(instruction)}") - - mask = torch.full((scores.shape[-1], ), - -torch.inf, - device=scores.device) - # The tokenizer may support more token ids than the model can generate, - # eg. 
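# Illustrative sketch (not part of the patch): the mask cached per FSM state
# above simply adds 0 to every allowed token's logit and -inf to everything
# else, so one cached tensor replaces per-step index filtering.
import math
import torch

def build_allowed_token_mask(allowed_tokens: tuple, vocab_size: int) -> torch.Tensor:
    mask = torch.full((vocab_size, ), -math.inf)
    mask[list(allowed_tokens)] = 0.0
    return mask

scores = torch.zeros(8)
scores += build_allowed_token_mask((1, 4, 6), vocab_size=8)
# Only token ids 1, 4 and 6 keep finite scores and remain sampleable.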
Llama 3.2 Vision models have an `<|image|>` token with id 128256 - # but scores.shape == torch.Size([128256]) - # Using NumPy is faster for filtering token ids - allowed_tokens = np.array(allowed_tokens, dtype=np.int64) - allowed_tokens = torch.tensor(allowed_tokens, device=scores.device) - allowed_tokens = allowed_tokens.masked_select( - allowed_tokens < scores.shape[-1]) - mask.index_fill_(0, allowed_tokens, 0) + state_id = self._fsm_state[seq_id] + mask = self._cached_get_mask_tensor(state_id, scores.size(-1), + scores.device) scores.add_(mask) return scores diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index f10a8fb8e03cf..2d8594cb8aafa 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int], # token_bitmask is a CPU tensor for use with accept_token and # fill_next_token_bitmask so we move it to the device of scores device_type = scores.device.type + dtype = scores.dtype if device_type != "cuda": - scores = scores.to("cpu").unsqueeze(0) + # xgrammar on cpu only supports float32 scores + # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 + scores = scores.to("cpu").float().unsqueeze(0) # Note: In this method, if the tensors have different dimensions # on CPU device fails, but on GPU it runs without error. Hence the @@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int], xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 308c1d6ac6db1..48f5987b2e585 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -16,6 +16,10 @@ from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3d822fc0c7f99..634e57dafa4de 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -15,6 +15,8 @@ from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum +is_hpu = current_platform.is_hpu() + if current_platform.is_cuda_alike(): from .fused_moe import fused_experts else: @@ -158,6 +160,28 @@ def forward_cuda( topk_ids=topk_ids, inplace=True) + def forward_hpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + **kwargs, + ): + assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' + assert num_expert_group is None, ('num_expert_group is ' + 'not supported on HPU') + assert topk_group is None, 'topk_group is not supported on HPU' + if layer is not None: + return layer.hpu_static_fused_moe(x, layer.w13_weight, + 
layer.w2_weight, router_logits, + top_k) + def forward_cpu( self, layer: torch.nn.Module, @@ -275,6 +299,13 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group self.custom_routing_function = custom_routing_function + if is_hpu: + from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE + + from vllm.model_executor.layers.quantization.inc import INCConfig + selected_fused_moe = (StaticFusedMOE if isinstance( + quant_config, INCConfig) else DynamicFusedMOE) + self.hpu_static_fused_moe = selected_fused_moe(self.num_experts) self.scoring_func = scoring_func self.e_score_correction_bias = e_score_correction_bias @@ -315,8 +346,8 @@ def _load_per_tensor_weight_scale(self, shard_id: str, def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int): + loaded_weight: torch.tensor, + tp_rank: int, expert_id: int): # Load grouped weight scales for group quantization # or model weights if shard_id == "w2": @@ -324,13 +355,15 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, @@ -346,9 +379,15 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, expert_data=expert_data, tp_rank=tp_rank) - def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): + def _load_w13(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): + orig_exp_data = expert_data.view(expert_data.size()) # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim shard_size = expert_data.shape[shard_dim] // 2 @@ -364,8 +403,19 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) expert_data.copy_(loaded_weight) - def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): + if is_hpu: + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + orig_exp_data) + + def _load_w2(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim @@ -375,6 +425,11 @@ def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, shard_size) # w2, down_proj: Load into only logical weight of w2. 
expert_data.copy_(loaded_weight) + if is_hpu: + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int): @@ -480,7 +535,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, @@ -506,7 +562,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) return @staticmethod @@ -591,29 +648,3 @@ def make_expert_params_mapping( ("w3", ckpt_up_proj_name), ] ] - - def _load_fp8_scale(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: str, expert_id: int) -> None: - param_data = param.data - - # Input scales can be loaded directly and should be equal. - if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param_data[expert_id]} " - f"vs. {loaded_weight}") - param_data[expert_id] = loaded_weight - # Weight scales - elif "weight_scale" in weight_name: - # If we are in merged column case (gate_up_proj) - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. - idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - else: - param_data[expert_id] = loaded_weight diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 43ea4eb5a4d1a..58e82884df7a1 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -104,7 +104,8 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm_hpu_extension.ops import HPUFusedRMSNorm + from vllm_hpu_extension.kernels import rms_norm + HPUFusedRMSNorm = rms_norm() if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 00ae64bbe6388..616a53df2f020 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -292,6 +292,7 @@ def __init__(self, quant_config, prefix) self.gather_output = gather_output + self.collective_func = tensor_model_parallel_all_gather # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() @@ -378,7 +379,7 @@ def forward(self, input_): output_parallel = self.quant_method.apply(self, input_, bias) if self.gather_output: # All-gather across the partitions. 
- output = tensor_model_parallel_all_gather(output_parallel) + output = self.collective_func(output_parallel) else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None @@ -1034,6 +1035,7 @@ def __init__(self, self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results + self.collective_func = tensor_model_parallel_all_reduce # Divide the weight matrix along the last dimension. self.tp_rank = get_tensor_model_parallel_rank() @@ -1112,7 +1114,7 @@ def weight_loader_v2(self, param: BasevLLMParameter, param.load_row_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def resolve_input(self, input_): if self.input_is_parallel: input_parallel = input_ else: @@ -1120,6 +1122,10 @@ def forward(self, input_): splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() + return input_parallel + + def forward(self, input_): + input_parallel = self.resolve_input(input_) # Matrix multiply. assert self.quant_method is not None diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 42decde1d0f79..9f7fe759aec11 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -128,12 +128,28 @@ def _prune_hidden_states( return hidden_states +def get_num_parameters(logits_processor): + """Extracts the number of parameters from the + signature and stores it for further use""" + if hasattr(logits_processor, 'num_parameters'): + return logits_processor.num_parameters + logits_processor.num_parameters = len( + inspect.signature(logits_processor).parameters) + return logits_processor.num_parameters + + def _apply_logits_processors( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - found_logits_processors = False logits_processed = 0 + found_logits_processors = any( + seq_group.sampling_params.logits_processors + for seq_group in sampling_metadata.seq_groups) + offload_to_cpu = current_platform.is_hpu() and found_logits_processors + if offload_to_cpu: + logits_device = logits.device + logits = logits.cpu() for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -148,8 +164,7 @@ def _apply_logits_processors( prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: + if get_num_parameters(logits_processor) == 3: logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, logits_row) @@ -165,4 +180,6 @@ def _apply_logits_processors( if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] + if offload_to_cpu: + logits = logits.to(logits_device) return logits diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index caeb8b95e02f2..c2387638e360d 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -26,6 +26,7 @@ "experts_int8", "neuron_quant", "ipex", + "inc", "quark" ] @@ -52,6 +53,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: from .gptq_marlin import GPTQMarlinConfig from .gptq_marlin_24 import GPTQMarlin24Config from .hqq_marlin import HQQMarlinConfig + from .inc import INCConfig 
from .ipex_quant import IPEXConfig from .marlin import MarlinConfig from .modelopt import ModelOptFp8Config @@ -82,6 +84,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, + "inc": INCConfig, "quark": QuarkConfig } diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b2fc2360f47f1..a9351147a6c30 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -303,8 +303,10 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + is_fp8_w8a8_supported = current_platform.is_hpu() or \ + self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), + error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -409,7 +411,9 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + if not current_platform.is_hpu(): + self._check_scheme_supported(scheme.get_min_capability()) + return scheme def get_cache_scale(self, name: str) -> Optional[str]: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 1d4e4bd52adaa..03a0866d2eb83 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -22,7 +22,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() + self.cutlass_fp8_supported = not current_platform.is_hpu() and \ + cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4969ee559522e..09e542d848950 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -29,6 +29,10 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -132,12 +136,16 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - self.use_marlin = (not current_platform.has_device_capability(89) - or envs.VLLM_TEST_FORCE_FP8_MARLIN) + self.cutlass_fp8_supported = False + if current_platform.is_cuda_alike(): + 
self.cutlass_fp8_supported = cutlass_fp8_supported() + + self.use_marlin = False + if not current_platform.is_hpu(): + # For GPUs that lack FP8 hardware support, we can leverage the + # Marlin kernel for fast weight-only FP8 quantization + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) # Disable marlin for rocm if current_platform.is_rocm(): self.use_marlin = False diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py new file mode 100644 index 0000000000000..3b2e6880db724 --- /dev/null +++ b/vllm/model_executor/layers/quantization/inc.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List, Optional + +import torch + +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, UnquantizedFusedMoEMethod) +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) + + +class INCConfig(QuantizationConfig): + """Config class for FP8 using Intel Neural Compressor.""" + + @classmethod + def get_name(cls) -> str: + return "inc" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "INCConfig": + raise AssertionError + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod() + return None + + @classmethod + def get_min_capability(cls) -> int: + raise AssertionError + + @staticmethod + def get_config_filenames() -> List[str]: + return [] diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67cf1677..e4a86544b6c20 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -9,6 +9,11 @@ # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) +if current_platform.is_hpu(): + import habana_frameworks.torch.utils.experimental as htexp + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + def sparse_cutlass_supported() -> bool: if not current_platform.is_cuda(): @@ -33,7 +38,15 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - fake_qweight = tensor.to(torch.float16) + dtype = torch.float16 + device = tensor.device + if current_platform.is_hpu(): + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + #dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to('cpu') + + fake_qweight = tensor.to(dtype).to(device) dq_weight = fake_qweight * inv_scale return dq_weight @@ -66,7 +79,10 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. 
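# Illustrative note (not part of the patch): the Gaudi2 branch below stretches
# the per-tensor scale by the ratio of the two fp8 ranges, since checkpoint
# scales calibrated for OCP e4m3fn (max 448) must be widened for Gaudi2's
# narrower e4m3fnuz-style range (max 240).
import torch

ratio = (torch.finfo(torch.float8_e4m3fn).max /
         torch.finfo(torch.float8_e4m3fnuz).max)
print(f"scale multiplier on Gaudi2: {ratio:.3f}")  # 448 / 240 ~= 1.867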
max_w_scale = weight_scale.max() - + if current_platform.is_hpu() and htexp._get_device_type( + ) == htexp.synDeviceType.synDeviceGaudi2: + max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max / + torch.finfo(torch.float8_e4m3fnuz).max) # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize # N weight scales for N shards but we only load 1 weight scale @@ -134,7 +150,7 @@ def apply_fp8_linear( qinput, x_scale = ops.scaled_fp8_quant( input_2d, input_scale, - num_token_padding=17, + batch_dim_padding=17, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) @@ -148,6 +164,7 @@ def apply_fp8_linear( scale_a=x_scale, scale_b=weight_scale, bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3fcd81a3c4213..614906b13f58b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,6 +27,9 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform + +is_hpu = current_platform.is_hpu() def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -97,6 +100,34 @@ def __init__( self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) + def prepare_cos_sin(self, + positions: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + recompute_cos_sin: bool = False): + self.recompute_cos_sin = recompute_cos_sin + if offsets is not None: + offsets = offsets.view(positions.shape[0], -1) + positions = positions + offsets + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions).view( + num_tokens, 1, -1) + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + else: + sin = torch.repeat_interleave(sin, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + cos = torch.repeat_interleave(cos, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + self.register_buffer("cos", cos, persistent=False) + self.register_buffer("sin", sin, persistent=False) + def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): To exactly match the HF implementation, we need to @@ -203,13 +234,14 @@ def forward_hpu( ) -> Tuple[torch.Tensor, torch.Tensor]: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingMode, apply_rotary_pos_emb) - positions = positions.flatten() - if offsets is not None: - positions = positions + offsets - num_tokens = positions.shape[0] - cos_sin = self.cos_sin_cache.index_select(0, positions).view( - num_tokens, 1, -1) - cos, sin = cos_sin.chunk(2, dim=-1) + + # Prepare cos-sin caches for long-context + LoRA with offsets for every + # forward, since the offset information wasn't available previously + if hasattr(self, "scaling_factors") or self.sin is None: + self.prepare_cos_sin(positions, offsets) + if self.recompute_cos_sin: + self.prepare_cos_sin(positions, offsets, recompute_cos_sin=True) + num_tokens = positions.shape[0] * positions.shape[1] # HPU RoPE kernel requires hidden dimension for cos and sin to be equal # to query hidden dimension, so the original tensors need to 
be # expanded @@ -220,19 +252,10 @@ def forward_hpu( rope_mode: RotaryPosEmbeddingMode if self.is_neox_style: rope_mode = RotaryPosEmbeddingMode.BLOCKWISE - cos = torch.cat((cos, cos), dim=-1) - sin = torch.cat((sin, sin), dim=-1) else: rope_mode = RotaryPosEmbeddingMode.PAIRWISE - sin = torch.repeat_interleave(sin, - 2, - dim=-1, - output_size=cos_sin.shape[-1]) - cos = torch.repeat_interleave(cos, - 2, - dim=-1, - output_size=cos_sin.shape[-1]) - + sin = self.sin + cos = self.cos query_shape = query.shape query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] @@ -642,9 +665,12 @@ def __init__( is_neox_style, dtype) def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / - self.rotary_dim) + pos_freqs = self.base**( + torch.arange(0, + self.rotary_dim, + 2, + dtype=torch.float, + device="hpu" if is_hpu else "cuda") / self.rotary_dim) inv_freq_extrapolation = 1.0 / pos_freqs inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) @@ -662,7 +688,7 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.scaling_factor) t = torch.arange(self.max_position_embeddings * self.scaling_factor, - device="cuda", + device="hpu" if is_hpu else "cuda", dtype=torch.float32) freqs = torch.einsum("i,j -> ij", t, inv_freq) cos = (freqs.cos() * self.mscale) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py old mode 100644 new mode 100755 index c2d12c466ba45..6b32a52071860 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,5 +1,7 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import math +import os import warnings from dataclasses import dataclass from importlib.util import find_spec @@ -204,14 +206,18 @@ def _init_sampling_tensors( self._sampling_tensors = None # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, - do_min_p) = SamplingTensors.from_sampling_metadata( + (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, + top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata( sampling_metadata, vocab_size, logits.device, logits.dtype) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p + self._top_k_scalar = top_k_scalar + self._top_p_scalar = top_p_scalar + + self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) def forward( self, @@ -271,8 +277,14 @@ def forward( logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) + # If we have a scalar p and k, we can use the optimized version. 
+ if self._top_k_scalar and self._top_p_scalar: + logits = self._apply_top_k_top_p_opt(logits, + self._top_p_scalar, + self._top_k_scalar) + else: + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -337,6 +349,135 @@ def _should_modify_greedy_probs_inplace(self) -> bool: return self.should_modify_greedy_probs_inplace +def _get_bin_counts_and_mask( + tokens: torch.Tensor, + vocab_size: int, + num_seqs: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + # Compute the bin counts for the tokens. + # vocab_size + 1 for padding. + bin_counts = torch.zeros((num_seqs, vocab_size + 1), + dtype=torch.long, + device=tokens.device) + bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) + bin_counts = bin_counts[:, :vocab_size] + mask = bin_counts > 0 + + return bin_counts, mask + + +class ApplyToppTopkScalar: + """ + The original implementation of _apply_top_k_top_p is more general + as it uses vector topp, topk + However in a lot of cases, topp and topk is same for all batch elements + For such "scalar" topp, topk cases, we can use this class + + The main optimizations in this class is: + Use topk instead of sort, which is much faster especially for small k. + However just using topk might not suffice in cases as shown below + Consider a tensor: 9 9 8 8 8 8 7 7 7 + Topk, with k=5, on this yields 9 9 8 8 8 + The value "8" is on the boundary, hence the last "8" gets snipped off + However the original implementation accepts all the "8"s, + so it should output: + 9 9 8 8 8 8 (6 values, even though k=5) + To ensure these semantics, we perform topk with _padded_k elements + If we find more boundary elements left over, + then we keep incrementing _padded_k + and in future calls use the expanded value of __padded_k + + The increments to _padded_k should be done + with value > 1 to prevent excessive recompilations + due to dynamic shapes (the output shape of the topk) + + The main logic of this is in __call__ + This is a class instead of a function, just to keep track of + the monotonic non-decreasing state _padded_k + + To enable the duplicates that are outside of kth border, + set VLLM_HANDLE_TOPK_DUPLICATES to 1 or true. + """ + _padded_k = 0 + _handle_duplicates = os.getenv('VLLM_HANDLE_TOPK_DUPLICATES', + '0').lower() in ['1', 'true'] + + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, logits: torch.Tensor, p: float, k: int): + if k == 1 and not ApplyToppTopkScalar._handle_duplicates: + new_logits = torch.full(logits.shape, + -float("inf"), + device=logits.device) + vals, idx = torch.max(logits, keepdim=True, dim=1) + new_logits.scatter_(1, idx, vals.to(new_logits.dtype)) + return new_logits + + if k > ApplyToppTopkScalar._padded_k: + ApplyToppTopkScalar._padded_k = min(k + self._increment, + logits.shape[1]) + + vals, idx = torch.topk(logits, + k=ApplyToppTopkScalar._padded_k, + dim=1, + sorted=True) + + # this "if" checks if we have bucketed so much that + # we have padded k upto shape of logits + if self._handle_duplicates and \ + ApplyToppTopkScalar._padded_k != logits.shape[1]: + smallest_of_top_k = vals[:, k - 1] + num_duplicates_of_smallest_of_topk = torch.sum( + logits == smallest_of_top_k.unsqueeze(1), 1) + max_num_duplicates_of_smallest_of_topk = torch.max( + num_duplicates_of_smallest_of_topk).item() + + # there are n repeats for a border + # (border meaning the smallest value of the top k). 
+ # we do not know if only 1 or 2 or (n-1) + # of them lie outside the kth border, + # so we choose to conservatively increase by n-1 + # when num_duplicates > _padded_k - k + if max_num_duplicates_of_smallest_of_topk - 1 > ( + ApplyToppTopkScalar._padded_k - k): + incr = int( + math.ceil((max_num_duplicates_of_smallest_of_topk - 1) / + self._increment) * self._increment) + # this while loop should be traversed at most twice, + # because we dont increment by self._increment and retry + # instead we compute incr in one go + ApplyToppTopkScalar._padded_k = min( + ApplyToppTopkScalar._padded_k + incr, logits.shape[1]) + + # recompute topk with expanded padded_k + vals, idx = torch.topk(logits, + k=ApplyToppTopkScalar._padded_k, + dim=1, + sorted=True) + + idx = torch.fliplr(idx) + vals = torch.fliplr(vals) + + top_k_smallest_val_idx = vals.size(1) - k + top_k_mask = vals[:, top_k_smallest_val_idx].unsqueeze(1) + top_k_mask = vals < top_k_mask + vals.masked_fill_(top_k_mask, -float("inf")) + + probs_sort = vals.softmax(dim=-1) + probs_sum = probs_sort.cumsum(dim=-1) + top_p_mask = probs_sum <= (1 - p) + top_p_mask[:, -1] = False + vals.masked_fill_(top_p_mask, -float("inf")) + + new_logits = torch.full(logits.shape, + -float("inf"), + device=logits.device) + new_logits.scatter_(1, idx, vals.to(new_logits.dtype)) + + return new_logits + + def _apply_min_tokens_penalty( logits: torch.Tensor, sampling_metadata: SamplingMetadata, @@ -384,6 +525,29 @@ def _apply_min_tokens_penalty( return logits +def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, + output_tokens_tensor: torch.Tensor, + presence_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + repetition_penalties: torch.Tensor) -> torch.Tensor: + num_seqs, vocab_size = logits.shape + _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, + num_seqs) + output_bin_counts, output_mask = _get_bin_counts_and_mask( + output_tokens_tensor, vocab_size, num_seqs) + + repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size) + repetition_penalties.masked_fill_(~(prompt_mask | output_mask), 1.0) + logits = torch.where(logits > 0, logits / repetition_penalties, + logits * repetition_penalties) + + # We follow the definition in OpenAI API. 
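# Simplified sketch (not the patch's implementation): ApplyToppTopkScalar above
# boils down to taking the top-k logits, keeping the smallest nucleus whose
# cumulative probability reaches p, and scattering the survivors back over a
# -inf background. The bucketed _padded_k growth and duplicate handling are
# omitted here for clarity.
import torch

def scalar_top_k_top_p(logits: torch.Tensor, p: float, k: int) -> torch.Tensor:
    vals, idx = torch.topk(logits, k=k, dim=-1)     # sorted descending
    probs = vals.softmax(dim=-1)
    cum_before = probs.cumsum(dim=-1) - probs       # mass before each token
    vals = vals.masked_fill(cum_before >= p, -float("inf"))
    return torch.full_like(logits, -float("inf")).scatter(-1, idx, vals)

filtered = scalar_top_k_top_p(torch.randn(2, 32000), p=0.9, k=50)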
+ # Refer to https://platform.openai.com/docs/api-reference/parameter-details + logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts + logits -= presence_penalties.unsqueeze_(dim=1) * output_mask + return logits + + def _apply_top_k_top_p( logits: torch.Tensor, p: torch.Tensor, diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 6aa4b8bd34cde..f71837c482d9f 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -30,19 +30,6 @@ def __init__(self, strict_mode: bool = False): self.num_emitted_tokens: Optional[torch.Tensor] = None self.num_draft_tokens: int = 0 - def init_gpu_tensors(self, device: Union[int, str]) -> None: - assert self.num_accepted_tokens is None - if isinstance(device, int): - device = f"cuda:{device}" - elif not isinstance(device, str): - raise ValueError(f"Device must be int or str, get {type(device)}") - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - def init_tensors(self, device: Union[int, str], device_type: Union[torch.device, str] = 'cuda') -> None: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 9fe0db62435a0..96f042df49d69 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -67,7 +67,7 @@ def device_loading_context(module: torch.nn.Module, # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): - if p.device.type == "cpu": + if p.device.type == "cpu" and target_device.type != 'hpu': original_device_states[name] = p.device p.data = p.data.to(target_device) # Parameters already on target device are not touched @@ -325,6 +325,17 @@ def _xla_weights_iterator(iterator: Generator): weights_iterator = _xla_weights_iterator(weights_iterator) + if current_platform.is_hpu(): + + import habana_frameworks.torch.core as htcore + + def _hpu_weights_iterator(iterator: Generator): + for weights in iterator: + yield weights + htcore.mark_step() + + weights_iterator = _hpu_weights_iterator(weights_iterator) + # Apply the prefix. 
return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator) @@ -357,13 +368,17 @@ def download_model(self, model_config: ModelConfig) -> None: def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config + load_config = vllm_config.load_config model_config = vllm_config.model_config - target_device = torch.device(device_config.device) + load_device = device_config.device if load_config.device is None else \ + load_config.device + target_device = torch.device(load_device) with set_default_torch_dtype(model_config.dtype): with target_device: model = _initialize_model(vllm_config=vllm_config) + logger.info("Loading weights on %s...", load_device) weights_to_load = {name for name, _ in model.named_parameters()} loaded_weights = model.load_weights( self._get_all_weights(model_config, model)) @@ -372,9 +387,10 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: if model_config.quantization is None and loaded_weights is not None: weights_not_loaded = weights_to_load - loaded_weights if weights_not_loaded: - raise ValueError( - "Following weights were not initialized from " - f"checkpoint: {weights_not_loaded}") + warning_msg = f"Following weights were not initialized \ + from checkpoint: {weights_not_loaded}" + + logger.warning(warning_msg) for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 3f923d2f6632a..bac773053482a 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -29,7 +29,7 @@ def get_model_architecture( # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. mixtral_supported = [ - "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin" + "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "inc" ] if (model_config.quantization is not None diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9cfcdbf620d2b..8a3a29765c5fa 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,8 +135,8 @@ def get_quant_config(model_config: ModelConfig, quant_cls = get_quantization_config(model_config.quantization) # GGUF doesn't have config file - if model_config.quantization == "gguf": - return quant_cls.from_config({}) + if model_config.quantization in ("gguf", "inc"): + return quant_cls() # Read the quantization config from the HF model config, if available. 
hf_quant_config = getattr(model_config.hf_config, "quantization_config", diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd6b5659df5d1..548a122e176c6 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -137,13 +137,11 @@ def __init__(self, torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_size, - device="cuda", dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty(self.num_experts, self.hidden_size, self.intermediate_size, - device="cuda", dtype=self.params_dtype)) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index af6810a140b43..50592cb80333d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP @@ -55,6 +56,8 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +is_hpu = current_platform.is_hpu() + class DeepseekV2MLP(nn.Module): @@ -112,18 +115,30 @@ def __init__( if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - - self.experts = FusedMoE(num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts") + if is_hpu: + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=False, + prefix=f"{prefix}.experts") + else: + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts") self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, @@ -277,9 +292,22 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: + if is_hpu: + # need reshape from tensor(x0, y0) to tensor(x1) for hpu + _batch_size = positions.shape[0] + positions = positions.reshape(positions.shape[0] * + positions.shape[1]) + hidden_states = hidden_states.reshape( + hidden_states.shape[0] * hidden_states.shape[1], + hidden_states.shape[2]) if self.q_lora_rank is not None: - q = self.q_a_proj(hidden_states)[0] - q = self.q_a_layernorm(q) + if is_hpu: + # w/a of SW-208144 + q = self.q_a_proj(hidden_states)[0].unsqueeze(0) + q = self.q_a_layernorm(q).squeeze(0) + else: + q = self.q_a_proj(hidden_states)[0] + q = self.q_a_layernorm(q) q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim) else: @@ 
-291,7 +319,11 @@ def forward( kv_a, _ = latent_cache.split( [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) latent_cache = latent_cache.unsqueeze(1) - kv_a = self.kv_a_layernorm(kv_a.contiguous()) + if is_hpu: + kv_a = self.kv_a_layernorm(kv_a.contiguous().unsqueeze(0)).squeeze( + 0) # w/a of SW-208144 + else: + kv_a = self.kv_a_layernorm(kv_a.contiguous()) kv = self.kv_b_proj(kv_a)[0] kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim) @@ -317,12 +349,26 @@ def forward( v = torch.nn.functional.pad( v, [0, self.qk_head_dim - self.v_head_dim], value=0).view(-1, self.num_local_heads * self.qk_head_dim) + if is_hpu: + # need restore from tensor(x0, y0) to tensor(x1, y1, z1) for hpu + q = q.reshape(_batch_size, q.shape[0] // _batch_size, q.shape[1]) + k = k.reshape(_batch_size, k.shape[0] // _batch_size, k.shape[1]) + v = v.reshape(_batch_size, v.shape[0] // _batch_size, v.shape[1]) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + if is_hpu: + # need restore from tensor(x0, y0, z0) to tensor(x1, y1) for hpu + attn_output = attn_output.reshape( + attn_output.shape[0] * attn_output.shape[1], + attn_output.shape[2]) attn_output = attn_output.view( -1, self.num_local_heads, self.qk_head_dim)[..., :self.v_head_dim].reshape( -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) + if is_hpu: + output = output.reshape(_batch_size, + output.shape[0] // _batch_size, + output.shape[1]) return output @@ -391,6 +437,8 @@ def forward( attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> torch.Tensor: + if is_hpu: + _batch_size = positions.shape[0] # Self Attention if residual is None: residual = hidden_states @@ -408,7 +456,16 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) + if is_hpu: + # need reshape from tensor(x0, y0) to tensor(x1) for hpu + hidden_states = hidden_states.reshape( + hidden_states.shape[0] * hidden_states.shape[1], + hidden_states.shape[2]) hidden_states = self.mlp(hidden_states) + if is_hpu: + hidden_states = hidden_states.reshape( + _batch_size, hidden_states.shape[0] // _batch_size, + hidden_states.shape[1]) return hidden_states, residual diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c64bc70688806..730d4b42b2a09 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -38,12 +38,15 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers) +is_hpu = current_platform.is_hpu() + class GPTBigCodeAttention(nn.Module): @@ -244,12 +247,14 @@ def forward( else: hidden_states = intermediate_tensors["hidden_states"] + if is_hpu: + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i - self.start_layer], attn_metadata) - if not get_pp_group().is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) hidden_states = self.ln_f(hidden_states) diff --git a/vllm/model_executor/models/internlm2.py 
b/vllm/model_executor/models/internlm2.py index 28c23edd4c8e8..5a9de56ec2307 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -146,7 +146,9 @@ def __init__( ) def split_qkv(self, qkv: torch.Tensor): - seq_len = qkv.shape[0] + # Unpack all dimensions except the last one + *batch_dims, _ = qkv.shape + if self.tp_size > 1: qkv_map = [self.q_size, self.kv_size, self.kv_size] * self.tp_size qkv = tensor_model_parallel_all_gather(qkv) @@ -154,12 +156,15 @@ def split_qkv(self, qkv: torch.Tensor): qkv = qkv[::3] + qkv[1::3] + qkv[2::3] qkv = torch.cat(qkv, dim=-1) - qkv = qkv.view(seq_len, self.total_num_kv_heads, + qkv = qkv.contiguous() + + # Dynamically reshape based on the number of batch dimensions + qkv = qkv.view(*batch_dims, self.total_num_kv_heads, self.key_value_groups + 2, self.head_dim) q, k, v = torch.split(qkv, [self.key_value_groups, 1, 1], dim=-2) - q = q.reshape(seq_len, self.q_size * self.tp_size) - k = k.reshape(seq_len, self.kv_size * self.tp_size) - v = v.reshape(seq_len, self.kv_size * self.tp_size) + q = q.view(*batch_dims, self.q_size * self.tp_size) + k = k.view(*batch_dims, self.kv_size * self.tp_size) + v = v.view(*batch_dims, self.kv_size * self.tp_size) if self.tp_size > 1: splitter = partial(split_tensor_along_last_dim, @@ -167,6 +172,7 @@ def split_qkv(self, qkv: torch.Tensor): q = splitter(q)[self.tp_rank] k = splitter(k)[self.tp_rank] v = splitter(v)[self.tp_rank] + return q, k, v def forward( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a5bd418801f2c..00ff2351efe34 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -2,6 +2,7 @@ # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Habana Labs, Ltd. an Intel Company # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. 
It has been modified from its @@ -54,6 +55,8 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +is_hpu = current_platform.is_hpu() + class LlamaMLP(nn.Module): @@ -360,12 +363,15 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + if is_hpu: + import habana_frameworks.torch as htorch + htorch.core.mark_step() + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer(positions, hidden_states, kv_caches[i - self.start_layer], attn_metadata, residual) - if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 2554281610a30..b8c40582b629e 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -51,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.platforms import current_platform from vllm.sequence import SequenceData from vllm.utils import is_list_of @@ -63,6 +64,8 @@ MLLAMA_IMAGE_TOKEN_ID = 128256 MLLAMA_IMAGE_TOKEN = "<|image|>" +is_hpu = current_platform.is_hpu() + class MllamaImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -418,11 +421,12 @@ def forward( self.head_dim).transpose(1, 2) # TODO: remove padding in image encoder - attn_output = F.scaled_dot_product_attention(q, - k, - v, - attn_mask=attention_mask, - dropout_p=0.0) + if current_platform.is_hpu(): + from habana_frameworks.torch.hpex.kernels import FusedSDPA + attn_output = FusedSDPA.apply(q, k, v, attention_mask, 0.0) + else: + attn_output = F.scaled_dot_product_attention( + q, k, v, attn_mask=attention_mask, dropout_p=0.0) attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(attn_output.shape[0], @@ -951,6 +955,14 @@ def forward( kv_cache=kv_cache, attn_metadata=attn_metadata, ) + # the rank of full_text_row_masked_out_mask is 2, not match with + # the hidden_states, so expand its rank to 3. + # TODO: Change input_tokens tensor at the beginning of model execution + # to 2D tensor to align with public vllm input_tokens shape. But this + # will face the graph building failure issue, still need to investigate. 
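The comment above spells out why the 2-D `full_text_row_masked_out_mask` has to be expanded before it can gate the 3-D hidden states. As a minimal standalone sketch of that broadcasting step (the tensor shapes below are illustrative assumptions, not values taken from Mllama):

    import torch

    # Illustrative shapes only: (batch, seq_len, hidden_size) for the hidden
    # states, (batch, seq_len) for the row mask produced earlier in forward().
    hidden_states = torch.randn(2, 5, 8)
    full_text_row_masked_out_mask = torch.ones(2, 5)

    # A rank-2 mask cannot broadcast against rank-3 activations directly,
    # so give it a trailing singleton dimension first.
    mask_3d = full_text_row_masked_out_mask.view(hidden_states.size(0), -1, 1)
    gated = mask_3d * hidden_states  # broadcasts over hidden_size
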
+ if len(hidden_states.shape) == 3: + full_text_row_masked_out_mask = full_text_row_masked_out_mask.view( + hidden_states.size(0), -1, 1) hidden_states = full_text_row_masked_out_mask * hidden_states hidden_states = residual + self.cross_attn_attn_gate.tanh( ) * hidden_states @@ -1020,6 +1032,11 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) hidden_states = inputs_embeds + if is_hpu: + for idx, decoder_layer in enumerate(self.layers): + if isinstance(decoder_layer, LlamaDecoderLayer): + self.layers[idx].self_attn.rotary_emb.prepare_cos_sin( + positions) for idx, decoder_layer in enumerate(self.layers): if isinstance(decoder_layer, MllamaCrossAttentionDecoderLayer): if not skip_cross_attention: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index d015f60c6d065..51510eaeeb9f4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -48,6 +48,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP @@ -333,6 +334,9 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( @@ -347,6 +351,7 @@ def forward( "hidden_states": hidden_states, "residual": residual }) + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..837960fd8d598 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -4,11 +4,13 @@ import torch +from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, SequenceGroupMetadata) from vllm.utils import (PyObjectCache, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad) + is_pin_memory_available, make_tensor_with_pad, + make_tensor_with_pad_align) _SAMPLING_EPS = 1e-5 @@ -266,8 +268,14 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) + if current_platform.is_hpu(): + import habana_frameworks.torch.hpu.random as htrandom + generator = \ + htrandom.default_generators[ + 0].manual_seed(sampling_params.seed) + else: + generator = torch.Generator(device=device).manual_seed( + sampling_params.seed) if generators is not None: generators[seq_group_metadata.request_id] = generator @@ -382,7 +390,8 @@ def from_sampling_metadata( vocab_size: int, device: torch.device, dtype: torch.dtype, - ) -> Tuple["SamplingTensors", bool, bool, bool]: + ) -> Tuple["SamplingTensors", bool, bool, bool, Optional[int], + Optional[float]]: prompt_tokens: List[array] = [] output_tokens: List[array] = [] top_ks: List[int] = [] @@ -470,6 +479,11 @@ def from_sampling_metadata( prompt_tokens.append(seq_data.prompt_token_ids_array) output_tokens.append(seq_data.output_token_ids_array) + top_k_scalar = top_ks[0] if 
do_top_p_top_k and all( + k == top_ks[0] for k in top_ks) else None + top_p_scalar = top_ps[0] if do_top_p_top_k and all( + p == top_ps[0] for p in top_ps) else None + sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, @@ -484,7 +498,8 @@ def from_sampling_metadata( device, dtype, ) - return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) + return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p, + top_k_scalar, top_p_scalar) @classmethod def from_lists( @@ -509,20 +524,38 @@ def from_lists( do_penalties = prompt_tokens or output_tokens if do_penalties: - prompt_t = make_tensor_with_pad( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - output_t = make_tensor_with_pad( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) + if current_platform.is_hpu(): + prompt_t = make_tensor_with_pad_align( + prompt_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) + output_t = make_tensor_with_pad_align( + output_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + max_len_align=1024, + ) + else: + prompt_t = make_tensor_with_pad( + prompt_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + ) + output_t = make_tensor_with_pad( + output_tokens, + vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=pin_memory, + ) else: empty_tensor = torch.empty(0, device=device, dtype=torch.long) prompt_t = empty_tensor diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 6ca95b41dbb07..88e35e1de1ec5 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -65,10 +65,19 @@ def rocm_platform_plugin() -> Optional[str]: amdsmi.amdsmi_shut_down() except Exception: pass - return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None +is_hpu = False +try: + import os + from importlib import util + is_hpu = util.find_spec('habana_frameworks') is not None or os.environ.get( + 'VLLM_USE_FAKE_HPU', '0') != '0' +except Exception: + pass + + def hpu_platform_plugin() -> Optional[str]: is_hpu = False try: diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 242c2c127979a..69c445766b824 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -21,6 +21,7 @@ class HpuPlatform(Platform): dispatch_key: str = "HPU" ray_device_key: str = "HPU" device_control_env_var: str = "HABANA_VISIBLE_MODULES" + supported_quantization: list[str] = ["inc"] @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @@ -33,25 +34,23 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True - @staticmethod - def inference_mode(): - return torch.no_grad() - @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: scheduler_config = vllm_config.scheduler_config - if scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step execution is not implemented for HPU") - - if vllm_config.speculative_config is not None: - raise NotImplementedError( - "Speculative decoding is not implemented for HPU") parallel_config = vllm_config.parallel_config if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + 
"vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.hpu_worker.HPUWorker" + else: + parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" # NOTE(kzawora): default block size for Gaudi should be 128 # smaller sizes still work, but very inefficiently diff --git a/vllm/sequence.py b/vllm/sequence.py index 5857f656dfc10..372879f2952c0 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1214,7 +1214,8 @@ def update(self, second_last_token_hidden_states: Optional[torch.Tensor] = None): """Update hidden states from target model invocation. Only used for decode steps""" - assert len(seq_group_metadata_list) == len(hidden_states) + if len(seq_group_metadata_list) < len(hidden_states): + hidden_states = hidden_states[:len(seq_group_metadata_list)] self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) self.hidden_states = torch.cat([self.hidden_states, hidden_states]) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 01b9cdad963da..53219042afeaa 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -6,6 +6,7 @@ from vllm import SamplingParams from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import (VLLM_INVALID_TOKEN_ID, VLLM_TOKEN_ID_ARRAY_TYPE, ExecuteModelRequest, SequenceData, SequenceGroupMetadata, get_all_seq_ids) @@ -86,6 +87,7 @@ def score_proposals( contracted = self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, + num_scoring_tokens=num_scoring_tokens, ) else: # Batch has a mix of spec decode enabled and disabled seq groups @@ -158,11 +160,18 @@ def _contract_batch( target_sampler_output will be contracted to. """ contracted_bs = len(contracted_seq_group_metadata_list) - (target_token_ids, target_probs, target_logprobs, target_hidden_states, - non_spec_target_token_ids, non_spec_target_probs, - non_spec_target_logprobs, - non_spec_target_hidden_states) = self._split_scoring_output( - target_sampler_output, num_scoring_tokens) + if current_platform.is_hpu(): + (target_token_ids, target_probs, target_logprobs, + target_hidden_states, non_spec_target_token_ids, + non_spec_target_probs, non_spec_target_logprobs, + non_spec_target_hidden_states) = self._split_scoring_output_hpu( + target_sampler_output, num_scoring_tokens) + else: + (target_token_ids, target_probs, target_logprobs, + target_hidden_states, non_spec_target_token_ids, + non_spec_target_probs, non_spec_target_logprobs, + non_spec_target_hidden_states) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) # Map distinct sequences used to score each token # of shape [batch_size * k + 1] back to [batch_size, k + 1]. @@ -225,6 +234,7 @@ def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, + num_scoring_tokens: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Contract the expanded batch back into its original size. @@ -237,6 +247,30 @@ def _contract_batch_all_spec( # Map distinct sequences used to score each token # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
contracted_bs, k = proposals.proposal_token_ids.shape + if current_platform.is_hpu(): + ( + target_sampler_output.sampled_token_ids, + target_sampler_output.sampled_token_probs, + target_sampler_output.logprobs, + target_sampler_output.hidden_states, + _, + _, + _, + _, + ) = self._split_scoring_output_hpu(target_sampler_output, + num_scoring_tokens) + else: + ( + target_sampler_output.sampled_token_ids, + target_sampler_output.sampled_token_probs, + target_sampler_output.logprobs, + target_sampler_output.hidden_states, + _, + _, + _, + _, + ) = self._split_scoring_output(target_sampler_output, + num_scoring_tokens) # Reshape tensors to original batch size target_token_ids = target_sampler_output.sampled_token_ids.reshape( @@ -371,6 +405,47 @@ def _create_single_target_seq_group_metadata( token_chunk_size=1, ) + @staticmethod + def _split_scoring_output_hpu( + sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], torch.Tensor, torch.Tensor, + torch.Tensor, Optional[torch.Tensor]]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = (num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - + num_scoring_tokens) + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + ( + spec_logprobs, + non_spec_logprobs, + ) = sampler_output.logprobs.split(split_sizes) + + if sampler_output.hidden_states is not None: + ( + spec_hidden_states, + non_spec_hidden_states, + ) = sampler_output.hidden_states.split(split_sizes) + else: + spec_hidden_states, non_spec_hidden_states = None, None + + return (spec_sampled_tokens, spec_probs, spec_logprobs, + spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, + non_spec_logprobs, non_spec_hidden_states) + @staticmethod def _split_scoring_output( sampler_output: SamplerOutput, num_scoring_tokens: int diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index fe5fd39f42ac9..14b3945aa3fcb 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -10,12 +10,16 @@ from vllm.attention.backends.flash_attn import FlashAttentionMetadata except (ModuleNotFoundError, ImportError): # vllm_flash_attn is not installed, try the ROCm FA metadata - from vllm.attention.backends.rocm_flash_attn import ( - ROCmFlashAttentionMetadata as FlashAttentionMetadata) -except (ModuleNotFoundError, ImportError) as err: + try: + from vllm.attention.backends.rocm_flash_attn import ( + ROCmFlashAttentionMetadata as FlashAttentionMetadata) + except (ModuleNotFoundError, ImportError, AssertionError): + from vllm.attention.backends.hpu_attn import ( + HPUPagedAttentionMetadata as FlashAttentionMetadata) +except (ModuleNotFoundError, ImportError, AssertionError) as err: raise RuntimeError( "Draft model speculative decoding currently only supports" - "CUDA and ROCm flash attention backend.") from err + "CUDA and ROCm and HPU attention backend.") from err 
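For reference, the splitting performed by the new `_split_scoring_output_hpu` helper reduces to a single `torch.Tensor.split` along the token dimension: the first chunk holds the speculative scoring tokens and the remainder holds the non-speculative ones. A minimal sketch with made-up sizes (all shapes here are assumptions for illustration only):

    import torch

    num_scoring_tokens = 6                    # tokens used for speculative scoring
    sampled_token_ids = torch.arange(8)       # 8 sampled tokens total: 6 spec + 2 non-spec
    sampled_token_probs = torch.rand(8, 16)   # per-token probabilities (tiny fake vocab)

    split_sizes = (num_scoring_tokens,
                   sampled_token_ids.numel() - num_scoring_tokens)
    spec_ids, non_spec_ids = sampled_token_ids.split(split_sizes)
    spec_probs, non_spec_probs = sampled_token_probs.split(split_sizes)

    assert spec_ids.shape[0] == num_scoring_tokens
    assert non_spec_probs.shape == (2, 16)
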
from vllm.logger import init_logger from vllm.multimodal import MultiModalKwargs diff --git a/vllm/spec_decode/hpu_draft_model_runner.py b/vllm/spec_decode/hpu_draft_model_runner.py new file mode 100644 index 0000000000000..dc99233dcdb18 --- /dev/null +++ b/vllm/spec_decode/hpu_draft_model_runner.py @@ -0,0 +1,72 @@ +from typing import List, Optional + +import torch + +from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import IntermediateTensors +from vllm.worker.model_runner_base import (ModelRunnerInputBase, + ModelRunnerWrapperBase) + +logger = init_logger(__name__) + +# A flag to enable debug prints for the updated input tensors +# before each step. +debug_advance_input = False +# A flag to allow GPU advance step for draft model runner. +# Set to False for debugging. +allow_gpu_advance_step = True + + +class HPUTP1DraftModelRunner(ModelRunnerWrapperBase): + """Specialized model runner for speculative decoding draft model. + Since the draft model always execute k forward passes consecutively to + generate k speculative tokens in a single speculative decoding step, + we could get rid of most CPU-GPU synchronization and data transfer + overheads by keeping model input and output tensors on GPU all the time. + + TODOs: + 1. Support TP > 1 (this requires some designs because we do not expect + any broadcasting inside execute_model). + """ + + def __init__(self, *args, **kwargs): + if kwargs.get("return_hidden_states"): + raise ValueError( + "return_hidden_states is not supported for TP1DraftModelRunner." + ) + + super().__init__(*args, **kwargs) + + self.indices_of_seq_with_bonus_tokens = None + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelRunnerInputBase, + kv_caches: List[torch.Tensor], + previous_hidden_states: Optional[torch.Tensor] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[List[SamplerOutput]]: + if previous_hidden_states is not None: + batch_size, block_size = model_input.input_tokens.shape + previous_hidden_states = previous_hidden_states.unsqueeze( + dim=1).expand(-1, block_size, -1) + # because HPU will pad batch_size, + # we need to pad previous_hidden_states as well + batch_size_padding = batch_size - previous_hidden_states.shape[0] + if batch_size_padding > 0: + dummy_previous_hidden_states = torch.zeros_like( + previous_hidden_states[1:2]).expand( + batch_size_padding, -1, -1) + previous_hidden_states = torch.cat( + [previous_hidden_states, dummy_previous_hidden_states], + dim=0) + return self.model_runner.execute_model( + model_input=model_input, + kv_caches=kv_caches, + previous_hidden_states=previous_hidden_states, + intermediate_tensors=intermediate_tensors, + num_steps=num_steps, + ) diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index d678f4578499b..66cedce1911db 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -77,10 +77,6 @@ def __init__(self, self._rejsample_metrics_collect_interval_s = collect_interval_s self._last_metrics_collect_time = self._timer() - def init_gpu_tensors(self, rank: int) -> None: - self._rank = rank - self._copy_stream = torch.cuda.Stream() - def init_tensors(self, rank: int, device_type: Union[torch.device, str] = 'cuda') -> None: @@ -89,6 +85,9 @@ def init_tensors(self, device_type = device_type.type if device_type == 'cuda': self._copy_stream = torch.cuda.Stream() + elif device_type == 'hpu': + import habana_frameworks.torch as 
htorch + self._copy_stream = htorch.hpu.Stream() def maybe_collect_rejsample_metrics( self, k: int) -> Optional[SpecDecodeWorkerMetrics]: @@ -97,6 +96,9 @@ def maybe_collect_rejsample_metrics( if not current_platform.is_cuda_alike(): return None + if not current_platform.is_cuda_alike(): + return None + # If a copy was initiated in the previous call, collect and return. if self._in_flight_copy is not None: ready_event = self._in_flight_copy diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..6ee1ef6fb93bd 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -4,11 +4,29 @@ import torch from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer +if current_platform.is_cuda_alike(): + DEVICE_TYPE = "cuda" +elif current_platform.is_neuron(): + DEVICE_TYPE = "neuron" +elif current_platform.is_hpu(): + DEVICE_TYPE = "hpu" +elif current_platform.is_openvino(): + DEVICE_TYPE = "openvino" +elif current_platform.is_cpu(): + DEVICE_TYPE = "cpu" +elif current_platform.is_tpu(): + DEVICE_TYPE = "tpu" +elif current_platform.is_xpu(): + DEVICE_TYPE = "xpu" +else: + raise ValueError(f"Unsupported platform: {current_platform}") + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 540d118d65ecb..64f59ab3bf4ff 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -23,6 +23,8 @@ if current_platform.is_cuda_alike(): from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner +if current_platform.is_hpu(): + from vllm.spec_decode.hpu_draft_model_runner import HPUTP1DraftModelRunner from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -178,6 +180,9 @@ def create_worker( if current_platform.is_cuda_alike(): draft_worker_kwargs[ "model_runner_cls"] = TP1DraftModelRunner + elif current_platform.is_hpu(): + draft_worker_kwargs[ + "model_runner_cls"] = HPUTP1DraftModelRunner else: if draft_model_config.hf_config.model_type == "eagle": raise NotImplementedError( diff --git a/vllm/utils.py b/vllm/utils.py index 89ba119bb5e55..afdaa9e08171c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import importlib.util import inspect import ipaddress +import math import multiprocessing import os import re @@ -149,6 +150,7 @@ "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, + "fp8_inc": torch.float8_e4m3fn, } TORCH_DTYPE_TO_NUMPY_DTYPE = { @@ -352,6 +354,23 @@ def reset(self): self._index = 0 +@lru_cache(maxsize=None) +def is_fake_hpu() -> bool: + return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' + + +@lru_cache(maxsize=None) +def hpu_device_string(): + device_string = 'hpu' if not is_fake_hpu() else 'cpu' + return device_string + + +@lru_cache(maxsize=None) +def hpu_backend_string(): + backend_string = 'hccl' if not is_fake_hpu() else 'gloo' + return backend_string + + @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -751,6 +770,30 @@ def make_ndarray_with_pad( return 
padded_x +def make_ndarray_with_pad_align( + x: List[List[T]], + pad: T, + dtype: npt.DTypeLike, + *, + max_len_align: int = 1024, +) -> npt.NDArray: + """ + Make a padded array from 2D inputs. + The padding is applied to the end of each inner list until it reaches + `max_len`. + """ + # Unlike for most functions, map is faster than a genexpr over `len` + max_len = max(map(len, x), default=0) + max_len_aligned = math.ceil(max_len / max_len_align) * max_len_align + padded_x = np.full((len(x), max_len_aligned), pad, dtype=dtype) + + for ind, blocktb in enumerate(x): + assert len(blocktb) <= max_len_aligned + padded_x[ind, :len(blocktb)] = blocktb + + return padded_x + + def make_tensor_with_pad( x: List[List[T]], pad: T, @@ -776,6 +819,34 @@ def make_tensor_with_pad( return tensor +def make_tensor_with_pad_align( + x: List[List[T]], + pad: T, + dtype: torch.dtype, + *, + max_len_align: int = 1024, + device: Optional[Union[str, torch.device]] = None, + pin_memory: bool = False, +) -> torch.Tensor: + """ + Make a padded tensor from 2D inputs. + The padding is applied to the end of each inner list until it reaches + max_len_aligned, max_len_aligned is max_len rounding to the nearest + `max_len_align`. + """ + np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] + padded_x = make_ndarray_with_pad_align(x, + pad, + np_dtype, + max_len_align=max_len_align) + + tensor = torch.from_numpy(padded_x).to(device) + if pin_memory: + tensor = tensor.pin_memory() + + return tensor + + def async_tensor_h2d( data: list, dtype: torch.dtype, @@ -1509,6 +1580,32 @@ def value(self): return self._value +def migrate_to_cpu(): + import importlib + from unittest.mock import MagicMock + + torch.hpu = MagicMock(name="torch.hpu") + + # Adding dummy submodules to habana_frameworks.torch for cpu-test, + # functions from dummy modules will do nothing by default + spec = importlib.util.spec_from_loader('habana_frameworks', loader=None) + sys.modules['habana_frameworks'] = MagicMock() + sys.modules['habana_frameworks'].__spec__ = spec + + builtin_import = __builtins__['__import__'] # type: ignore + + def import_wrapper(name, *args, **kwargs): + if 'habana_frameworks' in name: + sys.modules[name] = MagicMock() + return builtin_import(name, *args, **kwargs) + + __builtins__['__import__'] = import_wrapper + + # In case you want to mock a function to actually do something + import habana_frameworks.torch as htorch + htorch.utils.internal.is_lazy.return_value = False + + # Adapted from: https://stackoverflow.com/a/47212782/5082708 class LazyDict(Mapping[str, T], Generic[T]): diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py new file mode 100644 index 0000000000000..ca65701191c27 --- /dev/null +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -0,0 +1,625 @@ +import dataclasses +import gc +import itertools +import math +from array import array +from functools import partial +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, cast + +import habana_frameworks.torch as htorch +import torch +from vllm_hpu_extension.ops import batch2block, block2batch + +from vllm.attention import AttentionMetadata +from vllm.forward_context import set_forward_context +from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sampling_params import SamplingParams +from vllm.sequence import (IntermediateTensors, SequenceData, + SequenceGroupMetadata) +from vllm.utils import is_fake_hpu +from vllm.worker.hpu_model_runner import (HpuModelAdapter, 
HPUModelRunnerBase, + ModelInputForHPUWithSamplingMetadata, + setup_profiler, subtuple) +from vllm.worker.model_runner_base import ( + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +logger = init_logger(__name__) + +# These values are assumed to be zero in several places. +# Use caution when updating them! +_PAD_SLOT_ID = 0 +_PAD_BLOCK_ID = 0 + + +class HpuModelAdapterEncoderDecoder(HpuModelAdapter): + + def __init__(self, model, vllm_config, layer_names): + super().__init__(model, vllm_config, layer_names) + + # We only wrap the language model in HPU graph because some Ops in + # vision model will fallback to CPU and cause the graph building fail. + if htorch.utils.internal.is_lazy() and hasattr(self.model, + "language_model"): + self.model.language_model = htorch.hpu.wrap_in_hpu_graph( + self.model.language_model, disable_tensor_cache=True) + + def _set_cross_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, + self.block_size, + device=device, + dtype=torch.int32).unsqueeze(0) + + cross_attn_mask = mask >= metadata.cross_block_usage.unsqueeze(-1) + cross_attn_bias = (torch.zeros_like(cross_attn_mask, + dtype=dtype).masked_fill_( + cross_attn_mask, -math.inf)) + + if not is_fake_hpu() and htorch.utils.internal.is_lazy(): + cross_block_mapping = torch.nn.functional.one_hot( + metadata.cross_block_groups, num_classes=batch_size) + else: + # Unfortunately one_hot on CPU/torch.compile mode/eager mode + # doesn't handle out of bounds classes so we need to convert + # all negative values to 0 (block_mapping) or bs (block_groups) + cross_block_groups = metadata.cross_block_groups.to(torch.long) + cross_block_mapping = torch.nn.functional.relu(cross_block_groups) + cross_block_mapping = torch.nn.functional.one_hot( + cross_block_mapping, num_classes=batch_size) + oob_values = cross_block_groups.lt(0) + cross_block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) + cross_block_groups.masked_fill_(oob_values, batch_size) + metadata = metadata._replace(cross_block_groups=cross_block_groups) + + cross_block_mapping = cross_block_mapping.to(dtype) + metadata = metadata._replace(cross_block_mapping=cross_block_mapping, + cross_attn_bias=cross_attn_bias) + return metadata + + def _set_cross_block_scales(self, metadata, device): + cross_block_mapping = metadata.cross_block_mapping + ones = torch.ones((cross_block_mapping.size(0), ), + device=device, + dtype=cross_block_mapping.dtype) + sums = batch2block(block2batch(ones, cross_block_mapping), + cross_block_mapping) + cross_block_scales = torch.reciprocal(torch.maximum(ones, sums)) + metadata = metadata._replace(cross_block_scales=cross_block_scales) + return metadata + + def _set_cross_indices_and_offsets(self, metadata, block_size): + cross_slot_mapping = metadata.cross_slot_mapping.flatten() + indices = torch.div(cross_slot_mapping, + block_size, + rounding_mode="floor") + offsets = torch.fmod(cross_slot_mapping, block_size) + metadata = metadata._replace(cross_block_offsets=offsets, + cross_block_indices=indices) + return metadata + + def _update_seq_lens(self, attn_metadata, batch_size, seq_len, device): + # Set the seq_lens to after-padding sequence lengths to prevent + # graph recapturing. 
+ seq_lens = batch_size * [seq_len] + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.long, + device=device) + attn_metadata = attn_metadata._replace(seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor) + return attn_metadata + + def _update_cross_metadata(self, attn_metadata, batch_size, seq_len, + device, dtype): + if max(attn_metadata.encoder_seq_lens) == 0: + return attn_metadata + if attn_metadata.is_prompt: + attn_metadata = self._set_cross_indices_and_offsets( + attn_metadata, self.block_size) + attn_metadata = self._update_seq_lens(attn_metadata, batch_size, + seq_len, device) + else: + attn_metadata = self._set_cross_block_mapping( + attn_metadata, batch_size, device, dtype) + attn_metadata = self._set_cross_block_scales(attn_metadata, device) + + return attn_metadata + + def forward(self, *args, **kwargs): + kwargs = kwargs.copy() + selected_token_indices = kwargs.pop('selected_token_indices') + if 'warmup_mode' in kwargs: + kwargs.pop('warmup_mode') + input_ids = kwargs['input_ids'] + kwargs['attn_metadata'] = self._update_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, self.dtype) + kwargs['attn_metadata'] = self._update_cross_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, self.dtype) + if htorch.utils.internal.is_lazy() and hasattr(self.model, + "language_model"): + bypass_hpu_graphs = kwargs.get('bypass_hpu_graphs', False) + self.model.language_model.forward = partial( + self.model.language_model.forward, + bypass_hpu_graphs=bypass_hpu_graphs) + # TODO: Change the input_ids to 1D to match the public vllm + # implementation and avoid shape mismatch issues with some + # models(i.e. Mllama). But currently this will cause graph + # building error. + # kwargs['input_ids'] = input_ids.flatten() + virtual_engine = 0 + if 'virtual_engine' in kwargs: + virtual_engine = kwargs.pop('virtual_engine') + with set_forward_context(kwargs['attn_metadata'], self.vllm_config, + virtual_engine): + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = hidden_states.index_select(0, + selected_token_indices) + return hidden_states + + +@dataclasses.dataclass(frozen=True) +class EncoderDecoderModelInputForHPU(ModelInputForHPUWithSamplingMetadata): + """ + Used by the EncoderDecoderModelRunner. 
+ """ + encoder_input_tokens: Optional[torch.Tensor] = None + encoder_input_positions: Optional[torch.Tensor] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, + "encoder_input_tokens": self.encoder_input_tokens, + "encoder_input_positions": self.encoder_input_positions, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "EncoderDecoderModelInputForHPU": + return cast( + EncoderDecoderModelInputForHPU, + super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) + + +class HPUEncoderDecoderModelRunner( + HPUModelRunnerBase[EncoderDecoderModelInputForHPU]): + _model_input_cls: Type[EncoderDecoderModelInputForHPU] = ( + EncoderDecoderModelInputForHPU) + _model_adapter_cls: Type[HpuModelAdapterEncoderDecoder] = ( + HpuModelAdapterEncoderDecoder) + + def _list_to_int32_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.int32, device=self.device) + + def _list_to_long_tensor( + self, + _list: List[int], + ) -> torch.Tensor: + return torch.tensor(_list, dtype=torch.long, device=self.device) + + def make_model_input_from_broadcasted_tensor_dict( + self, tensor_dict: Dict[str, + Any]) -> EncoderDecoderModelInputForHPU: + return EncoderDecoderModelInputForHPU.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def _flatten(self, in_list): + return list(itertools.chain(*in_list)) + + def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): + return HpuModelAdapterEncoderDecoder(*args, **kwargs) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> EncoderDecoderModelInputForHPU: + with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None + if self.profiler.enabled: + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) + model_input, sampling_metadata = self.prepare_input_tensors( + seq_group_metadata_list) + attn_metadata = self._prepare_encoder_model_input_tensors( + seq_group_metadata_list, model_input) + model_input = dataclasses.replace( + model_input, + attn_metadata=attn_metadata, + ) + assert model_input.attn_metadata is not None + is_prompt = model_input.attn_metadata.is_prompt + + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + def _prepare_encoder_model_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + model_input: ModelInputForHPUWithSamplingMetadata, + ): + if len(seq_group_metadata_list) == 0: + return None + + # Since we are not supporting chunked prefill either the entire + # batch is prefill or it is decode + is_prompt = seq_group_metadata_list[0].is_prompt + # Build encoder inputs + encoder_seq_lens: List[int] = [] + cross_block_tables: List[List[int]] = [] + cross_slot_mapping: List[int] = [] + attn_metadata = model_input.attn_metadata + assert attn_metadata is not None + if is_prompt: + for 
seq_group_metadata in seq_group_metadata_list: + # Build seq lens + encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len( + ) if seq_group_metadata.encoder_seq_data else 0 + encoder_seq_lens.append(encoder_seq_len) + # Build slot mapping + if seq_group_metadata.cross_block_table is None: + cross_slot_mapping.extend([_PAD_SLOT_ID] * encoder_seq_len) + else: + for i in range(0, encoder_seq_len): + block_number = seq_group_metadata.cross_block_table[ + i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + cross_slot_mapping.append(slot) + attn_metadata.cross_slot_mapping = torch.tensor(cross_slot_mapping, + dtype=torch.long, + device=self.device) + else: + for seq_group_metadata in seq_group_metadata_list: + for _ in range(len(seq_group_metadata.seq_data)): + encoder_seq_len = ( + seq_group_metadata.encoder_seq_data.get_len() + if seq_group_metadata.encoder_seq_data else 0) + encoder_seq_lens.append(encoder_seq_len) + cross_block_table = seq_group_metadata.cross_block_table + cross_block_tables.append([] if ( + cross_block_table is None) else cross_block_table) + + last_block_usage = [(encoder_seq_len - 1) % self.block_size + 1 + for encoder_seq_len in encoder_seq_lens] + block_groups = [[i] * len(bt) + for i, bt in enumerate(cross_block_tables)] + block_usage = [ + [self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(cross_block_tables, last_block_usage) if bt + ] + + block_list = self._flatten(cross_block_tables) + block_groups = self._flatten(block_groups) + block_usage = self._flatten(block_usage) + + assert len(block_list) == len(block_groups) + assert len(block_list) == len(block_usage) + + block_list = torch.tensor(block_list, + dtype=torch.int, + device='cpu') + block_groups = torch.tensor(block_groups, + dtype=torch.int, + device='cpu') + block_usage = torch.tensor(block_usage, + dtype=self.model_config.dtype, + device='cpu') + + block_list = block_list.to( # type: ignore + self.device, non_blocking=True) + block_groups = block_groups.to( # type: ignore + self.device, non_blocking=True) + block_usage = block_usage.to( # type: ignore + self.device, non_blocking=True) + + attn_metadata.cross_block_list = block_list + attn_metadata.cross_block_groups = block_groups + attn_metadata.cross_block_usage = block_usage + + # add padding to align with language model shapes + real_batch_size = len(seq_group_metadata_list) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, is_prompt) + batch_size_padding = batch_size_padded - real_batch_size + if batch_size_padding > 0: + encoder_seq_lens.extend(encoder_seq_lens[0] + for _ in range(batch_size_padding)) + + encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) + attn_metadata.encoder_seq_lens = encoder_seq_lens + attn_metadata.encoder_seq_lens_tensor = encoder_seq_lens_tensor + + return attn_metadata + + def profile_run(self) -> None: + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + max_batch_size = self.max_num_prefill_seqs + _, max_seq_len = self.bucketing_ctx.get_max_prompt_shape() + max_seq_len = min(self.max_num_batched_tokens // max_batch_size, + max_seq_len) + + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, + False) + return + + def warmup_scenario(self, + batch_size, + seq_len, + is_prompt, + kv_caches, + is_pt_profiler_run=False, + is_lora_profile_run=False, + temperature=0) -> None: + use_graphs = self._use_graphs(batch_size, 
seq_len, is_prompt) + scenario_name = ("warmup_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + self.profiler.start('internal', scenario_name) + times = 3 if use_graphs or is_pt_profiler_run else 1 + if is_prompt: + seqs = [ + self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + for i in range(batch_size) + ] + else: + # FIXME: seq_len is actually number of blocks + blocks = [seq_len // batch_size for _ in range(batch_size)] + blocks[0] += seq_len % batch_size + seqs = [ + self.create_dummy_seq_group_metadata(i, + b * self.block_size - 1, + is_prompt) + for i, b in enumerate(blocks) + ] + torch.hpu.synchronize() + profiler = None + if is_pt_profiler_run and self.is_driver_worker: + profiler = setup_profiler() + profiler.start() + for _ in range(times): + inputs = self.prepare_model_input(seqs) + self.execute_model(inputs, kv_caches, warmup_mode=True) + torch.hpu.synchronize() + if profiler: + profiler.step() + if profiler: + profiler.stop() + self.profiler.end() + gc.collect() + + def create_dummy_seq_group_metadata(self, + group_id, + seq_len, + is_prompt, + lora_request=None, + temperature=0): + sampling_params = SamplingParams(temperature=0) + num_blocks = math.ceil(seq_len / self.block_size) + cross_block_table: Optional[List[int]] = None + encoder_dummy_data \ + = self.input_registry.dummy_data_for_profiling( + self.model_config, + seq_len, + self.mm_registry, + is_encoder_data=True) + mm_counts = self.mm_registry.get_mm_limits_per_prompt( + self.model_config) + num_images = mm_counts["image"] + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) * num_images + seq_len = max(seq_len, 1) + if is_prompt: + input_len = seq_len + output_len = 0 + block_tables = None + cross_block_table = None + else: + input_len = seq_len - 1 + output_len = 1 + block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} + # limit cross blocks to the number of available blocks + num_cross_blocks = min(self.bucketing_ctx.num_hpu_blocks, + max_mm_tokens) // self.block_size + cross_block_table = [_PAD_BLOCK_ID] * num_cross_blocks + prompt_token_ids = [0] * input_len + output_token_ids = [1] * output_len + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 + seq_data = SequenceData(prompt_token_ids_array) + seq_data.output_token_ids = output_token_ids + return SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + encoder_seq_data=encoder_dummy_data.seq_data, + multi_modal_data=encoder_dummy_data.multi_modal_data, + cross_block_table=cross_block_table) + + def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. 
You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ + 'attn_bias', + 'seq_lens_tensor', + 'context_lens_tensor', + 'block_list', + 'block_mapping', + 'block_usage', + 'slot_mapping', + 'is_prompt', + 'block_indices', + 'block_offsets', + 'block_scales', + 'block_groups', + 'num_prefill_tokens', + 'num_decode_tokens', + 'num_prefills', + 'seq_lens', + 'encoder_seq_lens', + 'encoder_seq_lens_tensor', + 'cross_block_indices', + 'cross_block_offsets', + 'cross_block_list', + 'cross_slot_mapping', + 'cross_block_mapping', + 'cross_block_groups', + 'cross_block_scales', + 'cross_block_usage', + 'cross_attn_bias', + ]) + return attention_metadata + + def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): + cfg = (batch_size, seq_len, is_prompt) + seen = cfg in self.seen_configs + self.seen_configs.add(cfg) + if not seen and not warmup_mode: + phase = 'prompt' if is_prompt else 'decode' + logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", + phase, batch_size, seq_len) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForHPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + warmup_mode=False, + ) -> Optional[List[SamplerOutput]]: + if num_steps > 1: + raise ValueError( + "num_steps > 1 is not supported in HPUEncoderDecoderModelRunner" + ) + + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + assert input_tokens is not None + assert input_positions is not None + assert sampling_metadata is not None + assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt + assert is_prompt is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors, + **(model_input.multi_modal_kwargs or {}), + } + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata.selected_token_indices + ) + + # Compute the logits. 
+ with self.profiler.record_event( + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + if model_input.async_callback is not None: + model_input.async_callback() + + # Sample the next token. + with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + return [output] diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py old mode 100644 new mode 100755 index 260ffaf27f9a1..fd0c40e803f54 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -9,11 +9,9 @@ import gc import itertools import math -import operator import os import time from array import array -from dataclasses import dataclass, field from enum import IntEnum from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type, TypeVar, Union) @@ -21,27 +19,40 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import vllm_hpu_extension.environment as environment +from vllm_hpu_extension.bucketing import HPUBucketingContext +from vllm_hpu_extension.flags import enabled_flags from vllm_hpu_extension.ops import LoraMask as LoraMask +from vllm_hpu_extension.ops import batch2block, block2batch from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, HabanaMemoryProfiler, format_bytes) from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import DeviceConfig, VllmConfig +from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal +from vllm.model_executor.sampling_metadata import SequenceGroupToSample from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs) + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.sampling_params import SamplingParams -from vllm.sequence import 
(IntermediateTensors, SequenceData, - SequenceGroupMetadata) -from vllm.utils import (bind_kv_cache, is_pin_memory_available, +from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, + Logprob, SequenceData, SequenceGroupMetadata, + SequenceOutput) +from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, @@ -64,25 +75,6 @@ LORA_WARMUP_RANK = 8 -class Singleton(type): - _instances: Dict[type, object] = {} - - def __call__(cls, *args, **kwargs): - if cls not in cls._instances: - cls._instances[cls] = super().__call__(*args, **kwargs) - return cls._instances[cls] - - -@dataclass -class HPUBucketingGlobalState(metaclass=Singleton): - prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) - decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) - prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) - decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) - prompt_buckets: List[Tuple[int, int]] = field(init=False) - decode_buckets: List[Tuple[int, int]] = field(init=False) - - def subtuple(obj: object, typename: str, to_copy: List[str], @@ -92,141 +84,16 @@ def subtuple(obj: object, if to_override is None: to_override = {} fields = set(to_copy) | set(to_override.keys()) - values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if type(obj) is dict: + values = {key: obj[key] for key in fields if key in obj} + else: + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} if typename not in _TYPE_CACHE: _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) return _TYPE_CACHE[typename](**values) -def read_bucket_settings(phase: str, dim: str, **defaults): - """Read bucketing configuration from env variables. - - phase is either 'prompt' or 'decode' - dim is either 'bs', 'seq' or 'block' - param is either 'min', 'step' or 'max' - example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 - """ - params = ['min', 'step', 'max'] - env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params] - default_values = [defaults[p] for p in params] - values = [ - int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) - ] - for e, v, d in zip(env_vars, values, default_values): - logger.info('%s=%s (default:%s)', e, v, d) - return values - - -def warmup_range(config: Tuple[int, int, int]): - """Generate a warmup range. - - Start from bmin and multiply by 2 until you reach bstep. - Then, increase the values in the range by the value of bstep until you - reach bmax. - - Example: - bmin = 2, bstep = 32, bmax = 64 - => ramp_up = (2, 4, 8, 16) - => stable = (32, 64) - => return ramp_up + stable => (2, 4, 8, 16, 32, 64) - """ - bmin, bstep, bmax = config - assert bmin <= bmax, ("Min. batch size cannot be greater than max. " - "batch size. 
If you want to skip warmup, " - "set VLLM_SKIP_WARMUP=true") - base = itertools.repeat(2) - ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) - ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ - ramp_up_acc) - stable = range(bstep, bmax + 1, bstep) - buckets = list(ramp_up_tw) + list(stable) - return list(filter(lambda bucket: bucket >= bmin, buckets)) - - -def generate_prompt_buckets(bs_bucket_config, - seq_bucket_config, - max_num_batched_tokens=None): - buckets = list( - itertools.product(warmup_range(bs_bucket_config), - warmup_range(seq_bucket_config))) - if len(buckets) == 0: - msg = ("No buckets could be captured with following config " - f"(min, step, max_warmup): " - f"bs:{bs_bucket_config}, " - f"seq:{seq_bucket_config}") - raise ValueError(msg) - - filtered_buckets = buckets - if max_num_batched_tokens is not None: - # Remove buckets exceeding batch token budget - filtered_buckets = list( - filter( - lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, - buckets)) - - if len(filtered_buckets) == 0: - # we can handle this if we ignore max_num_batched_tokens - min_bucket_bs, min_bucket_seq = min(buckets, - key=lambda b: (b[0] * b[1])) - min_reqd_budget = min_bucket_bs * min_bucket_seq - msg = ( - "The current bucketing configuration " - f"(min, step, max_warmup): " - f"bs:{bs_bucket_config}, " - f"seq:{seq_bucket_config} cannot be used with specified " - f"max_num_batched_tokens ({max_num_batched_tokens}), as the " - f"smallest bucket ({min_reqd_budget}) would exceed token " - "budget. Please increase max_num_batched_tokens or decrease " - "bucket minimum Ignoring max_num_batched_tokens at risk of " - "out-of-memory errors.") - logger.error(msg) - return list( - sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))), [] - - captured_buckets = list( - sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) - omitted_buckets = list( - sorted([x for x in buckets if x not in filtered_buckets])) - return captured_buckets, omitted_buckets - - -def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, - max_blocks): - buckets = [] - bs_buckets = warmup_range(bs_bucket_config) - block_buckets = warmup_range(blocks_bucket_config) - bmin, bstep, bmax = blocks_bucket_config - last_bucket = round_up(max_blocks, bstep) - for bs in bs_buckets: - for blocks in block_buckets: - if blocks < bs: - continue - if blocks > last_bucket: - break - buckets.append((bs, blocks)) - return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) - - -def next_pow2(value: int, base: int): - res = base - while value > 1: - value = (value + 1) // 2 - res *= 2 - return res - - -def round_up(value: int, k: int): - return (value + k - 1) // k * k - - -def find_bucket(value: int, config: Tuple[int, int, int]): - bmin, bstep, _ = config - next_step = round_up(value, bstep) - next_pow = next_pow2(value, bmin) - return max(bmin, min(next_step, next_pow)) - - def align_workers(value, op): group = get_world_group().cpu_group world_size = torch.distributed.get_world_size() @@ -239,17 +106,13 @@ def align_workers(value, op): def setup_profiler(): schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1) - DEVICE = 'hpu' - activities = [torch.profiler.ProfilerActivity.CPU] - activities.extend([torch.profiler.ProfilerActivity.HPU] if DEVICE == - 'hpu' else []) - #from habana_frameworks.torch.activity_profiler import DebugActivity - #debug_activities=[DebugActivity.BRIDGE_FUNCTION_CALLS] - + activities = [ + 
torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.HPU + ] profiler = torch.profiler.profile( schedule=schedule, activities=activities, - #debug_activities=debug_activities, on_trace_ready=torch.profiler.tensorboard_trace_handler('.', use_gzip=True), record_shapes=False, @@ -257,67 +120,177 @@ def setup_profiler(): return profiler -def pad_list(list, k, v): - target_len = round_up(len(list), k) - padding = target_len - len(list) - return list + [v] * padding +def round_up(value: int, k: int) -> int: + return (value + k - 1) // k * k + +def pad_list(input, k, v): + input_len = len(input) + target_len = round_up(input_len, k) + padding = target_len - input_len + return input + [v] * padding -def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - if is_prompt: - indices = indices.unflatten(0, (-1, block_size))[:, 0] - offsets = None - else: - offsets = torch.fmod(slot_mapping, block_size) - return indices, offsets +def gather_list(input, indices, v): + return [input[i] if i is not None else v for i in indices] -def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"): - if module.__class__.__name__.endswith(suffix): - def forward_hook(module, args, output): - htorch.core.mark_step() - return output +def flatten(in_list): + return list(itertools.chain(*in_list)) + + +def get_target_layer_suffix_list(model_type) -> list[str]: + # This sets the suffix for the hidden layer name, which is controlled by + # VLLM_CONFIG_HIDDEN_LAYERS. The default suffix is "DecoderLayer," which is + # applicable for most language models such as LLaMA, Qwen, and BART. If the + # model's decoder layer name differs from the default, it will need to + # be specified here. + decoder_layer_table = { + "gpt_bigcode": "BigCodeBlock", + } + + return [ + decoder_layer_table.get(model_type, "DecoderLayer"), "EncoderLayer" + ] + + +def modify_model_layers(module: torch.nn.Module, + suffix_list: list[str], + n=1, + counter=None): + """Currently add mark_step at the end of specified layers. + """ + + def forward_hook(module, args, output): + htorch.core.mark_step() + return output - module.register_forward_hook(forward_hook) + if counter is None: + counter = [0] for child_name, child_module in module.named_children(): - modify_decoder_layer(child_module) + if any( + child_module.__class__.__name__.endswith(layer) + for layer in suffix_list): + counter[0] += 1 + if counter[0] % n == 0: + child_module.register_forward_hook(forward_hook) + else: + modify_model_layers(child_module, suffix_list, n, counter) + + +def get_path_to_rope(model: torch.nn.Module): + """Dynamically get the path to the RotaryEmbedding layer in the model. + This function will recursively search through the module hierarchy to find + a RotaryEmbedding layer and return the full path to that layer as a list + of names. + If no such layer is found, it returns None. 
+ """ + + def find_rope_layer(parent, path): + # Base case: check if this parent is None + if parent is None: + return None + + # Check if the current layer is a RotaryEmbedding + if hasattr(parent, 'named_children'): + for child_name, child_module in parent.named_children(): + # If the current child is of type RotaryEmbedding, + # return the full path + if child_module.__class__.__name__.endswith("RotaryEmbedding"): + return path + [child_name] + # Otherwise, recurse into this child to check its children + result = find_rope_layer(child_module, path + [child_name]) + if result is not None: + return result + return None + + # Start the search from the top level model + path_to_rope = find_rope_layer(model, []) + + # Return the result if found, otherwise None + return path_to_rope class HpuModelAdapter: - def __init__(self, model, vllm_config): + def __init__(self, model, vllm_config, layer_names): self.model = model - self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] + self.prefill_use_fusedsdpa = "fsdpa" in enabled_flags() + self.recompute_cos_sin = os.getenv('VLLM_COS_SIN_RECOMPUTE', + 'false').lower() in ['1', 'true'] self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.dtype = vllm_config.model_config.dtype + self.layer_names = layer_names enforce_eager = vllm_config.model_config.enforce_eager - if not htorch.utils.internal.is_lazy() and not enforce_eager: - self.model = torch.compile(self.model, - backend='hpu_backend', - dynamic=False) + if not is_fake_hpu() and not htorch.utils.internal.is_lazy( + ) and not enforce_eager: + if os.getenv('VLLM_REGIONAL_COMPILATION', + 'true').lower() == 'true': + self.regional_compilation_layers_list = [ + RMSNorm, VocabParallelEmbedding + ] + self._regional_compilation(self.model) + else: + self.model = torch.compile(self.model, + backend='hpu_backend', + dynamic=False) + + def _regional_compilation(self, + module, + parent_module=None, + module_name=None): + if isinstance(module, torch.nn.ModuleList): + for children_name, children_module in module.named_children(): + self._compile_region(module, children_name, children_module) + elif any( + isinstance(module, layer) + for layer in self.regional_compilation_layers_list): + self._compile_region(parent_module, module_name, module) + else: + for children_name, children_module in module.named_children(): + self._regional_compilation(children_module, module, + children_name) + + def _compile_region(self, model, name, module): + module = torch.compile(module, backend='hpu_backend', dynamic=False) + setattr(model, name, module) def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): - prefill_metadata = attn_metadata - if prefill_metadata is None or self.prefill_use_fusedsdpa: + if (attn_metadata is None or self.prefill_use_fusedsdpa + or not attn_metadata.is_prompt): return attn_metadata + prefill_metadata = attn_metadata + seq_lens_t = prefill_metadata.seq_lens_tensor + context_lens_t = prefill_metadata.context_lens_tensor + query_lens_t = seq_lens_t - context_lens_t + + block_list = attn_metadata.block_list + max_context_len = (block_list.size(-1) // + batch_size if block_list is not None else 0) + max_context_len = max_context_len * self.block_size + past_mask = torch.arange(0, + max_context_len, + dtype=torch.int32, + device=device) + past_mask = (past_mask.view(1, -1).expand(batch_size, -1).ge( + context_lens_t.view(-1, 1)).view(batch_size, 1, -1).expand( + batch_size, seq_len, 
-1).view(batch_size, 1, seq_len, -1)) + len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32).view(1, seq_len).ge( - seq_lens_t.unsqueeze(-1)).view( + query_lens_t.unsqueeze(-1)).view( batch_size, 1, 1, seq_len)) causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), diagonal=1) mask = causal_mask.logical_or(len_mask) + mask = torch.concat((past_mask, mask), dim=-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) @@ -331,25 +304,91 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype): mask = mask >= metadata.block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, - num_classes=batch_size) + + if not is_fake_hpu(): + block_mapping = torch.nn.functional.one_hot(metadata.block_groups, + num_classes=batch_size) + else: + # Unfortunately one_hot on CPU + # doesn't handle out of bounds classes so we need to convert + # all negative values to 0 (block_mapping) or bs (block_groups) + block_groups = metadata.block_groups.to(torch.long) + block_mapping = torch.nn.functional.relu(block_groups) + block_mapping = torch.nn.functional.one_hot(block_mapping, + num_classes=batch_size) + oob_values = block_groups.lt(0) + block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) + block_groups.masked_fill_(oob_values, batch_size) + metadata = metadata._replace(block_groups=block_groups) block_mapping = block_mapping.to(dtype) metadata = metadata._replace(block_mapping=block_mapping, attn_bias=attn_bias) return metadata + def _set_block_scales(self, metadata, device): + block_mapping = metadata.block_mapping + ones = torch.ones((block_mapping.size(0), ), + device=device, + dtype=block_mapping.dtype) + sums = batch2block(block2batch(ones, block_mapping), block_mapping) + block_scales = torch.reciprocal(torch.maximum(ones, sums)) + metadata = metadata._replace(block_scales=block_scales) + return metadata + + def _set_indices_and_offsets(self, metadata, block_size, is_prompt): + slot_mapping = metadata.slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + metadata = metadata._replace(block_offsets=offsets, + block_indices=indices) + return metadata + def _update_metadata(self, attn_metadata, batch_size, seq_len, device, dtype): if attn_metadata.is_prompt: - meta = attn_metadata - attn_metadata = self._set_attn_bias(meta, batch_size, seq_len, - device, dtype) + attn_metadata = self._set_attn_bias(attn_metadata, batch_size, + seq_len, device, dtype) else: - meta = attn_metadata - attn_metadata = self._set_block_mapping(meta, batch_size, device, - dtype) + attn_metadata = self._set_block_mapping(attn_metadata, batch_size, + device, dtype) + attn_metadata = self._set_block_scales(attn_metadata, device) + attn_metadata = self._set_indices_and_offsets(attn_metadata, + self.block_size, + attn_metadata.is_prompt) return attn_metadata + def _prepare_cos_sin(self, positions): + """Navigate through the model using the provided path and call + the prepare_cos_sin method on the 'RotaryEmbedding' layer.""" + + current_module = self.model # Start from the top level of the model + + for layer in self.layer_names: + if layer.isdigit(): # Check if the 
layer is an index + layer = int(layer) + + # Check if the current layer is a name in a module + if isinstance( + layer, + str) and not isinstance(layer, int): # Name-based access + current_module = getattr(current_module, layer) + elif isinstance(layer, + int): # Indexed-based access (like ModuleList) + current_module = list(current_module._modules.values())[layer] + + # At the end, we should be at the RotaryEmbedding layer. + if hasattr(current_module, 'prepare_cos_sin'): + current_module.prepare_cos_sin( + positions, recompute_cos_sin=self.recompute_cos_sin) + else: + raise AttributeError( + "The module at the end of the path does not have \ + a 'prepare_cos_sin' method.") + def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') @@ -363,6 +402,8 @@ def forward(self, *args, **kwargs): kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) + if self.layer_names is not None: + self._prepare_cos_sin(kwargs['positions']) with set_forward_context(kwargs['attn_metadata'], self.vllm_config, virtual_engine): hidden_states = self.model(*args, **kwargs) @@ -377,6 +418,15 @@ def compute_logits(self, *args, **kwargs): def sample(self, *args, **kwargs): return self.model.sample(*args, **kwargs) + def generate_proposals(self, *args, **kwargs): + return self.model.generate_proposals(*args, **kwargs) + + # sampler property will be used by spec_decode_worker + # don't rename + @property + def sampler(self): + return self.model.sampler + class PreparePromptMetadata(NamedTuple): input_tokens: torch.Tensor @@ -462,6 +512,8 @@ class ModelInputForHPU(ModelRunnerInputBase): virtual_engine: int = 0 lora_ids: Optional[List[int]] = None async_callback: Optional[Callable] = None + is_first_multi_step: bool = True + is_last_step: bool = True def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -474,6 +526,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, "lora_ids": self.lora_ids, + "is_first_multi_step": self.is_first_multi_step, + "is_last_step": self.is_last_step, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -537,10 +591,14 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): def __init__( self, vllm_config: VllmConfig, + kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, return_hidden_states: bool = False, + input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): ModelRunnerBase.__init__(self, vllm_config=vllm_config) + environment.set_model_config(self.model_config) self.is_driver_worker = is_driver_worker self.return_hidden_states = return_hidden_states @@ -548,12 +606,16 @@ def __init__( if self.model_config is not None else None) self.device_config = (self.device_config if self.device_config is not None else DeviceConfig()) + if is_fake_hpu(): + self.device_config.device = torch.device('cpu') + self.device_config.device_type = 'cpu' + self.load_config.device = None self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs - # NOTE(kzawora): Change that to scheduler_config.max_num_prefill_seqs - # once padding-aware scheduling gets merged - self.max_num_prefill_seqs = 64 + self.max_num_prefill_seqs = 
self.scheduler_config.max_num_prefill_seqs \ + if self.scheduler_config.max_num_prefill_seqs is not None \ + else self.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = \ self.scheduler_config.max_num_batched_tokens @@ -562,13 +624,25 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = self.cache_config.cache_dtype + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) + needs_attn_backend = (num_attn_heads != 0 + or self.model_config.is_attention_free) self.attn_backend = get_attn_backend( self.model_config.get_head_size(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, self.model_config.is_attention_free, - ) + ) if needs_attn_backend else None + + # Multi-modal data support + self.input_registry = input_registry + self.mm_registry = mm_registry + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ + .create_input_mapper(self.model_config) + self.mm_registry.init_mm_limits_per_prompt(self.model_config) # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None @@ -580,9 +654,22 @@ def __init__( self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() self._mem_margin: Optional[int] = None - self.bucketing_global_state = HPUBucketingGlobalState() - self._setup_buckets() + self.bucketing_ctx = HPUBucketingContext(self.max_num_seqs, + self.max_num_prefill_seqs, + self.block_size, + self.max_num_batched_tokens) + self.graphed_buckets: Set[Any] = set() + self._set_gc_threshold() + self.use_contiguous_pa = os.environ.get('VLLM_CONTIGUOUS_PA', + 'true').lower() == 'true' + if vllm_config.speculative_config is not None \ + and self.use_contiguous_pa: + raise ValueError( + "Speculative decoding is not supported with " + "contiguous PA, please set VLLM_CONTIGUOUS_PA=false") + # For multi-step scheduling + self.cached_step_outputs: List[torch.Tensor] = [] def _set_gc_threshold(self) -> None: # Read https://docs.python.org/3/library/gc.html#gc.set_threshold @@ -636,12 +723,29 @@ def load_model(self) -> None: "Bias support in LoRA is not enabled in HPU yet." assert not self.lora_config.fully_sharded_loras, \ "Fully sharded LoRAs is not enabled in HPU yet." + if supports_multimodal(self.model): + logger.warning( + "Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") + # It's necessary to distinguish between the + # max_position_embeddings of VLMs and LLMs. 
+ if hasattr(self.model.config, "max_position_embeddings"): + max_pos_embeddings = ( + self.model.config.max_position_embeddings) + else: + max_pos_embeddings = ( + self.model.config.text_config.max_position_embeddings) + self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, - self.vocab_size, self.lora_config, self.device, + self.vocab_size, + self.lora_config, + self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) + self.model.embedding_padding_modules, + max_position_embeddings=max_pos_embeddings, + ) self.model = self.lora_manager.create_lora_manager(self.model) if self.model_config.quantization == 'inc': @@ -660,15 +764,27 @@ def load_model(self) -> None: self.inc_initialized_successfully = True logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) - else: + elif not is_fake_hpu(): self.model = self.model.to("hpu") htcore.mark_step() - modify_decoder_layer(self.model) + + hidden_layer_markstep_interval = int( + os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1')) + model_config = getattr(self.model, "config", None) + modify_model_layers( + self.model, + get_target_layer_suffix_list( + model_config. + model_type if model_config is not None else None), + hidden_layer_markstep_interval) + path_to_rope = get_path_to_rope(self.model) torch.hpu.synchronize() with HabanaMemoryProfiler() as m_wrap: - self.model = _maybe_wrap_in_hpu_graph( - self.model, vllm_config=self.vllm_config) + self.model = self._maybe_wrap_in_hpu_graph( + self.model, + vllm_config=self.vllm_config, + layer_names=path_to_rope) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -676,6 +792,12 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( + *args, **kwargs) + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False @@ -686,46 +808,6 @@ def _use_graphs(self, batch_size, seq_len, is_prompt): def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens - def _setup_buckets(self) -> None: - align_bs = lambda x: min(self.max_num_seqs, x) - #FIXME: The default values should be max_model_len - max_prompt_seq = 1024 - max_decode_seq = 2048 - self.bucketing_global_state.prompt_bs_bucket_cfg = read_bucket_settings( - 'prompt', - 'bs', - min=1, - step=align_bs(32), - max=self.max_num_prefill_seqs) - self.bucketing_global_state.decode_bs_bucket_cfg = read_bucket_settings( - 'decode', 'bs', min=1, step=align_bs(32), max=self.max_num_seqs) - self.bucketing_global_state.prompt_seq_bucket_cfg = \ - read_bucket_settings( - 'prompt', - 'seq', - min=self.block_size, - step=self.block_size, - max=max_prompt_seq) - self.bucketing_global_state.decode_block_bucket_cfg = \ - read_bucket_settings( - 'decode', - 'block', - min=self.block_size, - step=self.block_size, - max=max(self.block_size, - self.max_num_seqs * max_decode_seq // self.block_size)) - self.graphed_buckets: Set[Any] = set() - - msg = ("Prompt bucket config (min, step, max_warmup) " - f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " - f"seq:{self.bucketing_global_state.prompt_seq_bucket_cfg}") - logger.info(msg) - - msg = ("Decode bucket config (min, step, max_warmup) 
" - f"bs:{self.bucketing_global_state.decode_bs_bucket_cfg}, " - f"block:{self.bucketing_global_state.decode_block_bucket_cfg}") - logger.info(msg) - def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -742,6 +824,9 @@ def _prepare_prompt( query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ + str, MultiModalPlaceholderMap] = collections.defaultdict( + MultiModalPlaceholderMap) if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -799,11 +884,26 @@ def _prepare_prompt( # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - mm_kwargs = self.multi_modal_input_mapper(mm_data) + if seq_group_metadata.multi_modal_data: + positions = input_positions[0] + mm_data, placeholder_maps = MultiModalPlaceholderMap \ + .from_seq_group(seq_group_metadata, + range(positions[0], positions[0] + len(positions))) + + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + multi_modal_kwargs_list.append(mm_kwargs) + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. @@ -836,13 +936,11 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_query_len = max(query_lens) - sum_query_len = sum(query_lens) real_num_seqs = len(query_lens) assert max_query_len > 0 max_prompt_len = max( - find_bucket(max(seq_lens), - self.bucketing_global_state.prompt_seq_bucket_cfg), + self.bucketing_ctx.get_padded_prompt_seq_len(max(seq_lens)), self.block_size) lora_ids: List[int] = [] @@ -854,56 +952,107 @@ def _prepare_prompt( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_index_mapping += [lora_id] * max_prompt_len lora_prompt_mapping.extend( [lora_id] * - (max_prompt_len - context_len + (max_prompt_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - input_tokens = make_tensor_with_pad(input_tokens, - max_len=max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) + if any(context_lens): + assert not self.scheduler_config.chunked_prefill_enabled + # prefix caching + + max_num_block = max(len(bt) for bt in prefix_block_tables) + prefix_block_list = list( + itertools.chain.from_iterable( + bt if len(bt) == max_num_block else bt + + ([_PAD_BLOCK_ID] * (max_num_block - len(bt))) + for bt in prefix_block_tables)) + + # TODO: pad to proper len + pad_len = len(prefix_block_list) + prefix_block_list = pad_list(prefix_block_list, pad_len, + _PAD_BLOCK_ID) + + prefix_block_list_tensor = torch.tensor(prefix_block_list, + dtype=torch.long, + device='cpu') + else: + prefix_block_list_tensor = None + + input_tokens_tensor = make_tensor_with_pad(input_tokens, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device='cpu') input_positions = make_tensor_with_pad(input_positions, max_len=max_prompt_len, pad=0, dtype=torch.long, - device=self.device) + device='cpu') slot_mapping = make_tensor_with_pad(slot_mapping, max_len=max_prompt_len, pad=_PAD_SLOT_ID, dtype=torch.long, - 
device=self.device) + device='cpu') seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.long, - device=self.device) + device='cpu') + + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.long, + device='cpu') + + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } + + # Note: num_prefill_tokens is calculated using the length of + # input_tokens after padding. + num_prefill_tokens = input_tokens_tensor.numel() + if prefix_block_list_tensor is not None: + prefix_block_list_tensor = prefix_block_list_tensor.to( + self.device, non_blocking=True) + input_tokens_tensor = input_tokens_tensor.to( # type: ignore + self.device, non_blocking=True) + input_positions = input_positions.to( # type: ignore + self.device, non_blocking=True) + slot_mapping = slot_mapping.to( # type: ignore + self.device, non_blocking=True) + seq_lens_tensor = seq_lens_tensor.to(self.device, non_blocking=True) + context_lens_tensor = context_lens_tensor.to(self.device, + non_blocking=True) - block_indices, block_offsets = precompute_indices_and_offsets( - self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - block_list=None, + block_list=prefix_block_list_tensor, block_mapping=None, block_usage=None, - block_indices=block_indices, - block_offsets=block_offsets, + block_indices=None, + block_offsets=None, block_scales=None, + block_groups=None, attn_bias=None, + seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + context_lens_tensor=context_lens_tensor, num_prefills=real_num_seqs, - num_prefill_tokens=sum_query_len, + num_prefill_tokens=num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps= - None # FIXME(kzawora): mutli-modality will not work here - ) + multi_modal_placeholder_index_maps=placeholder_index_maps) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + for t in multi_modal_kwargs: + if torch.is_tensor(multi_modal_kwargs[t]): + multi_modal_kwargs[t] = multi_modal_kwargs[t].to( + self.device, non_blocking=True) - return PreparePromptMetadata(input_tokens=input_tokens, + return PreparePromptMetadata(input_tokens=input_tokens_tensor, input_positions=input_positions, attn_metadata=attn_metadata, seq_lens=seq_lens, @@ -918,6 +1067,7 @@ def _prepare_prompt( def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], + output=None, ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] @@ -948,8 +1098,9 @@ def _prepare_decode( for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) + if output is None: + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) seq_len = seq_data.get_len() position = seq_len - 1 @@ -960,6 +1111,9 @@ def _prepare_decode( seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] + num_fully_occupied_blocks = position // self.block_size + block_table = block_table[:num_fully_occupied_blocks + 1] + if len(block_table) == 0: block_number = _PAD_BLOCK_ID else: @@ -979,76 +1133,92 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) + if output is None: + input_tokens = torch.tensor(input_tokens, + 
dtype=torch.long, + device='cpu') + else: + real_batch_size = len(seq_group_metadata_list) + input_tokens = output[:real_batch_size].clone() + input_positions = torch.tensor(input_positions, dtype=torch.long, - device=self.device) - - num_decode_tokens = sum(seq_lens) - - blocks_used = [len(bt) for bt in block_tables if bt] - block_list = [] - block_scales = [] - for i, bt in enumerate(block_tables): - block_list.extend(bt) - blocks_in_group = len(bt) - if blocks_in_group > 0: - scale = 1.0 / blocks_in_group - block_scales.extend([scale] * blocks_in_group) - - block_mapping_nested: List[List[int]] = [ - [i] * b_u for i, b_u in enumerate(blocks_used) - ] - block_mapping: List[int] = list( - itertools.chain.from_iterable(block_mapping_nested)) + device='cpu') + + num_decode_tokens = len(seq_lens) - last_block = [ - sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) + last_block_usage = [ + slot[0] % self.block_size + 1 for slot in slot_mapping ] - block_usage = [[self.block_size] * (b_u - 1) + [lb] - for b_u, lb in zip(blocks_used, last_block)] - block_usage = list(itertools.chain(*block_usage)) - - block_bucket_size = find_bucket( - len(block_list), - self.bucketing_global_state.decode_block_bucket_cfg) - block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) - block_mapping = pad_list(block_mapping, block_bucket_size, -1) - block_usage = pad_list(block_usage, block_bucket_size, 1) - block_scales = pad_list(block_scales, block_bucket_size, 0.0) - - block_list = torch.tensor(block_list, - dtype=torch.int, - device=self.device) - block_mapping = torch.tensor(block_mapping, - dtype=torch.long, - device=self.device) + block_groups = [[i] * len(bt) for i, bt in enumerate(block_tables)] + block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(block_tables, last_block_usage) + if bt] + + block_list = flatten(block_tables) + block_groups = flatten(block_groups) + block_usage = flatten(block_usage) + + assert len(block_list) == len(block_groups) + assert len(block_list) == len(block_usage) + + padding_fn = None + if self.use_contiguous_pa: + block_bucket_size = max(max(block_list) + 1, len(block_list)) + block_bucket_size = self.bucketing_ctx.get_padded_decode_num_blocks( + block_bucket_size) + indices: List[Any] + indices = [None] * block_bucket_size + for i, bid in enumerate(block_list): + indices[bid] = i + padding_fn = lambda tensor, pad_value: gather_list( + tensor, indices, pad_value) + else: + block_bucket_size = self.bucketing_ctx.get_padded_decode_num_blocks( + len(block_list)) + padding_fn = lambda tensor, pad_value: pad_list( + tensor, block_bucket_size, pad_value) + + block_list = padding_fn(block_list, _PAD_BLOCK_ID) + block_groups = padding_fn(block_groups, -1) + block_usage = padding_fn(block_usage, 1) + + block_list = torch.tensor(block_list, dtype=torch.int, device='cpu') + block_groups = torch.tensor(block_groups, + dtype=torch.int, + device='cpu') block_usage = torch.tensor(block_usage, dtype=self.model_config.dtype, - device=self.device) - + device='cpu') slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, - device=self.device) - - block_indices, block_offsets = precompute_indices_and_offsets( - self.block_size, slot_mapping, False) - block_scales = torch.tensor(block_scales, - dtype=self.model_config.dtype, - device=self.device) + device='cpu') + + input_tokens = input_tokens.to( # type: ignore + self.device, non_blocking=True) + input_positions = input_positions.to( # type: ignore + self.device, non_blocking=True) + 
block_list = block_list.to( # type: ignore + self.device, non_blocking=True) + block_groups = block_groups.to( # type: ignore + self.device, non_blocking=True) + block_usage = block_usage.to( # type: ignore + self.device, non_blocking=True) + slot_mapping = slot_mapping.to( # type: ignore + self.device, non_blocking=True) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, - block_mapping=block_mapping, + block_mapping=None, block_usage=block_usage, - block_indices=block_indices, - block_offsets=block_offsets, - block_scales=block_scales, + block_indices=None, + block_offsets=None, + block_scales=None, + block_groups=block_groups, attn_bias=None, seq_lens_tensor=None, + context_lens_tensor=None, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, @@ -1087,9 +1257,8 @@ def prepare_input_tensors( self.profiler.start('internal', base_event_name) real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.bucketing_global_state.prompt_bs_bucket_cfg \ - if is_prompt else self.bucketing_global_state.decode_bs_bucket_cfg - batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, is_prompt) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() if batch_size_padding > 0: @@ -1160,7 +1329,7 @@ def prepare_input_tensors( # FIXME: We need to adjust selected_token_indices to accommodate # for padding max_len = input_tokens.size(1) - paddings = [max_len - s for s in seq_lens] + paddings = [max_len - q for q in query_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) paddings_prompt_logprobs = [] @@ -1256,9 +1425,18 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ - 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', - 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', - 'block_offsets', 'block_scales' + 'attn_bias', + 'seq_lens_tensor', + 'context_lens_tensor', + 'block_list', + 'block_mapping', + 'block_usage', + 'slot_mapping', + 'is_prompt', + 'block_indices', + 'block_offsets', + 'block_scales', + 'block_groups', ]) return attention_metadata @@ -1266,8 +1444,9 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt, - lora_request=None): - sampling_params = SamplingParams(temperature=0) + lora_request=None, + temperature=0): + sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) seq_len = max(seq_len, 1) if is_prompt: @@ -1296,9 +1475,10 @@ def profile_run(self) -> None: bind_kv_cache( self.vllm_config.compilation_config.static_forward_context, [kv_caches]) - max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1] - max_batch_size = min(self.max_num_batched_tokens // max_seq_len, - self.scheduler_config.max_num_seqs) + _, max_seq_len = self.bucketing_ctx.get_max_prompt_shape() + max_batch_size = min(self.max_num_seqs, + self.max_num_batched_tokens // max_seq_len) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, False, True) return @@ -1309,7 +1489,8 @@ def warmup_scenario(self, is_prompt, kv_caches, is_pt_profiler_run=False, - is_lora_profile_run=False) -> None: + is_lora_profile_run=False, + temperature=0) -> None: use_graphs = self._use_graphs(batch_size, 
seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" @@ -1348,8 +1529,8 @@ def warmup_scenario(self, seq_len, is_prompt, lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i in range(batch_size) + if dummy_lora_requests_per_seq else None, + temperature=temperature) for i in range(batch_size) ] else: # FIXME: seq_len is actually number of blocks @@ -1361,8 +1542,8 @@ def warmup_scenario(self, b * self.block_size - 1, is_prompt, lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i, b in enumerate(blocks) + if dummy_lora_requests_per_seq else None, + temperature=temperature) for i, b in enumerate(blocks) ] torch.hpu.synchronize() profiler = None @@ -1371,7 +1552,27 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=True) + is_single_step = \ + self.vllm_config.scheduler_config.num_scheduler_steps == 1 + if is_prompt or is_single_step: + self.execute_model(inputs, kv_caches, warmup_mode=True) + else: # decode with multi-step + inputs = dataclasses.replace(inputs, + is_first_multi_step=True, + is_last_step=False) + self.execute_model(inputs, + kv_caches, + warmup_mode=True, + num_steps=2, + seqs=seqs) + inputs = dataclasses.replace(inputs, + is_first_multi_step=False, + is_last_step=True) + self.execute_model(inputs, + kv_caches, + warmup_mode=True, + num_steps=2, + seqs=seqs) torch.hpu.synchronize() if profiler: profiler.step() @@ -1415,7 +1616,7 @@ def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) dim = "num_blocks" - if phase == "Prompt": + if "Prompt" in phase: dim = "seq_len" msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " f"batch_size:{batch_size} " @@ -1452,6 +1653,7 @@ def warmup_graphs(self, f'Unsupported graph allocation strategy: {strategy}') buckets = list(sorted(buckets, key=ordering)) captured_all = True + warmed_random_sampler_bs: Set[int] = set() for idx, (batch_size, seq_len) in enumerate(buckets): # Graph memory usage is proportional to seq dimension in a batch batch_seq = batch_size * seq_len if is_prompt else batch_size @@ -1465,7 +1667,13 @@ def warmup_graphs(self, self.graphed_buckets.add(graphed_bucket) self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: - self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + self.warmup_scenario(batch_size, + seq_len, + is_prompt, + kv_caches, + temperature=1.0 if batch_size + not in warmed_random_sampler_bs else 0) + warmed_random_sampler_bs.add(batch_size) used_mem = align_workers(mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX) available_mem -= used_mem @@ -1498,42 +1706,15 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, True) raise AssertionError("Finished profiling") - if self.skip_warmup: - logger.info("Skipping warmup...") - return - self.profiler.start('internal', 'warmup') max_blocks = kv_caches[0][0].size(0) - - self.bucketing_global_state.prompt_buckets, prompt_omitted_buckets = \ - generate_prompt_buckets( - self.bucketing_global_state.prompt_bs_bucket_cfg, - self.bucketing_global_state.prompt_seq_bucket_cfg, - self.max_num_batched_tokens) - - msg = (f"Generated {len(self.bucketing_global_state.prompt_buckets)} " - f"prompt buckets [bs, seq]: \ - 
{list(sorted(self.bucketing_global_state.prompt_buckets))}") - logger.info(msg) - - msg = (f"Omitted {len(prompt_omitted_buckets)} " - "prompt buckets due to exceeded token budget " - f"(max_num_batched_tokens={self.max_num_batched_tokens})") - logger.info(msg) - - msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" - logger.debug(msg) - - self.bucketing_global_state.decode_buckets = generate_decode_buckets( - self.bucketing_global_state.decode_bs_bucket_cfg, - self.bucketing_global_state.decode_block_bucket_cfg, max_blocks) - logger.info("Generated %d decode buckets [bs, total_blocks]: %s", - len(self.bucketing_global_state.decode_buckets), - list(sorted(self.bucketing_global_state.decode_buckets))) - + self.bucketing_ctx.generate_prompt_buckets() + self.bucketing_ctx.generate_decode_buckets(max_blocks) if not htorch.utils.internal.is_lazy() and not self.enforce_eager: - cache_size_limit = len( - self.bucketing_global_state.prompt_buckets) + len( - self.bucketing_global_state.decode_buckets) + 1 + multiplier = 3 if os.getenv('VLLM_REGIONAL_COMPILATION', + 'true').lower() == 'true' else 1 + cache_size_limit = 1 + multiplier * ( + len(self.bucketing_ctx.prompt_buckets) + + len(self.bucketing_ctx.decode_buckets)) torch._dynamo.config.cache_size_limit = max( cache_size_limit, torch._dynamo.config.cache_size_limit) # Multiply by 8 to follow the original default ratio between @@ -1541,7 +1722,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: torch._dynamo.config.accumulated_cache_size_limit = max( cache_size_limit * 8, torch._dynamo.config.accumulated_cache_size_limit) - + if self.skip_warmup: + logger.info("Skipping warmup...") + return + self.profiler.start('internal', 'warmup') start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() @@ -1560,10 +1744,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'Please update Gaudi Software Suite.') with compile_only_mode_context( ) if can_use_compile_only_mode else contextlib.nullcontext(): - self.warmup_all_buckets(self.bucketing_global_state.prompt_buckets, - True, kv_caches) - self.warmup_all_buckets(self.bucketing_global_state.decode_buckets, - False, kv_caches) + self.warmup_all_buckets(self.bucketing_ctx.prompt_buckets, True, + kv_caches) + self.warmup_all_buckets(self.bucketing_ctx.decode_buckets, False, + kv_caches) if not self.enforce_eager and htorch.utils.internal.is_lazy(): assert self.mem_margin is not None, \ @@ -1593,11 +1777,11 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'max_bs') mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ self.warmup_graphs( - prompt_strategy, self.bucketing_global_state.prompt_buckets, + prompt_strategy, self.bucketing_ctx.prompt_buckets, True, kv_caches, prompt_available_memory) mem_post_decode, decode_batch_seq, decode_captured_all = \ self.warmup_graphs( - decode_strategy, self.bucketing_global_state.decode_buckets, + decode_strategy, self.bucketing_ctx.decode_buckets, False, kv_caches, decode_available_memory) # Not all prompt buckets were captured, but all decode buckets @@ -1607,9 +1791,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not prompt_captured_all and decode_captured_all): mem_post_prompt, _, prompt_captured_all = ( self.warmup_graphs( - prompt_strategy, - self.bucketing_global_state.prompt_buckets, True, - kv_caches, + prompt_strategy, self.bucketing_ctx.prompt_buckets, + True, kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, 
mem_post_prompt, prompt_batch_seq)) @@ -1620,18 +1803,15 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not decode_captured_all \ and prompt_captured_all: mem_post_decode, _, _ = self.warmup_graphs( - decode_strategy, - self.bucketing_global_state.decode_buckets, False, - kv_caches, + decode_strategy, self.bucketing_ctx.decode_buckets, + False, kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, mem_post_decode, decode_batch_seq) self.log_graph_warmup_summary( - self.bucketing_global_state.prompt_buckets, True, - mem_post_prompt) + self.bucketing_ctx.prompt_buckets, True, mem_post_prompt) self.log_graph_warmup_summary( - self.bucketing_global_state.decode_buckets, False, - mem_post_decode) + self.bucketing_ctx.decode_buckets, False, mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() @@ -1642,6 +1822,21 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: logger.info(msg) self.profiler.end() + def finish_measurements(self): + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.model.model) + + def shutdown_inc(self): + can_finalize_inc = (self.model_config.quantization == 'inc') and \ + (self.model.model is not None) and \ + self.inc_initialized_successfully and \ + not getattr(self, "_is_inc_finalized", False) + if can_finalize_inc: + from neural_compressor.torch.quantization import ( + finalize_calibration) + finalize_calibration(self.model.model) + self._is_inc_finalized = True + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -1655,12 +1850,6 @@ def mem_margin(self, value): self._mem_margin = value -def _maybe_wrap_in_hpu_graph(*args, **kwargs): - return htorch.hpu.wrap_in_hpu_graph( - HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True - ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) - - class HabanaProfilerCounterHelper: def __init__(self): @@ -1750,15 +1939,6 @@ def get_counter_dict(self, cache_config, duration, seq_len, return counters -def unwrap_model(model): - if isinstance(model, torch._dynamo.eval_frame.OptimizedModule): - return unwrap_model(model._orig_mod) - else: - model = list(vars(model)['_modules'].values())[0] - modules = list(vars(model)['_modules'].values()) - return modules - - class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. @@ -1807,10 +1987,6 @@ def prepare_model_input( is_prompt=is_prompt, virtual_engine=virtual_engine) - def finish_measurements(self): - from neural_compressor.torch.quantization import finalize_calibration - finalize_calibration(self.model.model) - def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): cfg = (batch_size, seq_len, is_prompt) seen = cfg in self.seen_configs @@ -1826,7 +2002,7 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], This is a helper function to create the mask for lora computations. Lora Mask is needed to ensure we match the correct lora weights for the for the request. 
- For Prompt phase we have + For Prompt phase we have lora_mask with shape (batch_size * seq_len, max_loras * max_rank) lora_logits_mask with shape (batch_size, max_loras * max_rank) For Decode phase we have both @@ -1890,6 +2066,19 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], return lora_mask, lora_logits_mask + def add_dummy_seq(self, seq_group_metadata_list, is_prompt): + real_batch_size = len(seq_group_metadata_list) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, is_prompt) + batch_size_padding = batch_size_padded - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + if batch_size_padding > 0: + dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( + 0, 0, is_prompt) + seq_group_metadata_list.extend(dummy_seq_group_metadata + for _ in range(batch_size_padding)) + return seq_group_metadata_list + @torch.inference_mode() def execute_model( self, @@ -1898,130 +2087,297 @@ def execute_model( intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, warmup_mode=False, + previous_hidden_states: Optional[torch.Tensor] = None, + seqs=None, ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "num_steps > 1 is not supported in HPUModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - sampling_metadata = model_input.sampling_metadata - real_batch_size = model_input.real_batch_size - batch_size_padded = model_input.batch_size_padded - assert input_tokens is not None - assert input_positions is not None - assert sampling_metadata is not None - assert attn_metadata is not None - is_prompt = attn_metadata.is_prompt - assert is_prompt is not None - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + if not model_input.is_first_multi_step: + if not model_input.is_last_step: + # not first or last multi-step + return [] + # last multi-step + output = self._decode_sampler_outputs( + model_input) if self.is_driver_worker else [] + torch.hpu.synchronize() + if model_input.is_first_multi_step: + # first multi-step + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + assert input_tokens is not None + assert input_positions is not None + assert sampling_metadata is not None + assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt + assert is_prompt is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + if 
self.lora_config: + assert model_input.lora_ids is not None + lora_mask, lora_logits_mask = self.create_lora_mask( + input_tokens, model_input.lora_ids, + attn_metadata.is_prompt) + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors, + "lora_mask": lora_mask, + "virtual_engine": model_input.virtual_engine, + **(model_input.multi_modal_kwargs or {}), + } + if previous_hidden_states is not None: + execute_model_kwargs.update( + {"previous_hidden_states": previous_hidden_states}) + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update( + {"bypass_hpu_graphs": not use_graphs}) - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - if self.lora_config: - assert model_input.lora_ids is not None - lora_mask, lora_logits_mask = self.create_lora_mask( - input_tokens, model_input.lora_ids, attn_metadata.is_prompt) - - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors, - "lora_mask": lora_mask, - "virtual_engine": model_input.virtual_engine, - **(model_input.multi_modal_kwargs or {}), - } - if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + if num_steps > 1: + # in case of multi-step scheduling + # we only want to pythonize in the last step + sampling_metadata.skip_sampler_cpu_output = True + self.model.model.sampler.include_gpu_probs_tensor = True + cache_orig_output_tokens_len: List[Dict] = [] + + def try_revert_dummy_output_tokens(): + if len(cache_orig_output_tokens_len) > 0: + # Reuse the original output token ids length + for i in range(len(cache_orig_output_tokens_len)): + seq_group_metadata = seq_group_metadata_list[i] + for j, data in seq_group_metadata.seq_data.items(): + orig_output_tokens_len = \ + cache_orig_output_tokens_len[i][j] + data.output_token_ids = \ + data.output_token_ids[:orig_output_tokens_len] + + for i in range(num_steps): + if i != 0 and not self.is_driver_worker: + broadcast_data = broadcast_tensor_dict(src=0) + if 'early_exit' in broadcast_data and broadcast_data[ + 'early_exit']: + return [output] if num_steps == 1 else [] + execute_model_kwargs.update({ + "input_ids": + broadcast_data["input_ids"], + "positions": + broadcast_data["positions"], + "attn_metadata": + self.trim_attn_metadata( + broadcast_data["attn_metadata"]) + }) + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices) + + if self.lora_config: + LoraMask.setLoraMask( + lora_logits_mask.index_select( + 0, sampling_metadata.selected_token_indices)) + + # Compute the logits. 
+ with self.profiler.record_event( + 'internal', + ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + if num_steps == 1: + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + continue - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. + with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + if num_steps > 1: + output = output.sampled_token_ids + self.cached_step_outputs.append( + output.detach().clone()) + htorch.core.mark_step() + if i < num_steps - 1: + if i == 0: + if model_input.async_callback is not None: + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + seq_group_metadata_list = \ + ctx.seq_group_metadata_list + elif seqs is not None: + seq_group_metadata_list = seqs + else: + raise RuntimeError( + "seq_group_metadata_list is uninitialized") + for seq_idx, seq_group_metadata in enumerate( + seq_group_metadata_list): + # Skip empty steps + seq_group_metadata.state.current_step += ( + num_steps - 2) + # Cache the original output token ids + cache_orig_output_tokens_len.append({}) + for j, data in seq_group_metadata.seq_data.items(): + cache_orig_output_tokens_len[seq_idx][j] = \ + len(data.output_token_ids) + seq_group_metadata_list = self.add_dummy_seq( + seq_group_metadata_list, is_prompt=False) + for seq_group_metadata in seq_group_metadata_list: + for data in seq_group_metadata.seq_data.values(): + max_output_len = sampling_metadata.seq_groups[ + 0].sampling_params.max_tokens + if len(data.output_token_ids) < max_output_len - 1: + # add a place holder for prepare_decode + # arbitrary value, this could be any token + dummy_token = (540, ) + data.output_token_ids += (dummy_token) + else: + broadcast_tensor_dict({'early_exit': True}, + src=0) + if num_steps == 1: + return [output] + else: + try_revert_dummy_output_tokens() + return [] + + result = self._prepare_decode(seq_group_metadata_list, + output=output) + if self.lora_config: + lora_mapping = LoRAMapping( + **dict(index_mapping=result.lora_index_mapping, + prompt_mapping=result.lora_prompt_mapping, + is_prefill=False)) + self.set_active_loras(result.lora_requests, + lora_mapping) + lora_mask, lora_logits_mask = self.create_lora_mask( + result.input_tokens, result.lora_ids, False) + + execute_model_kwargs.update({ + "input_ids": + result.input_tokens, + "positions": + result.input_positions, + "attn_metadata": + self.trim_attn_metadata(result.attn_metadata), + "lora_mask": + lora_mask, + }) + model_kwargs_broadcast_data = { + "input_ids": result.input_tokens, + "positions": result.input_positions, + "attn_metadata": vars(result.attn_metadata), + "lora_mask": lora_mask, + } + broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) + else: + try_revert_dummy_output_tokens() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + 
counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + if num_steps == 1: + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert model_input.sampling_metadata is not None + if model_input.is_prompt: + output.prefill_hidden_states = hidden_states + output.hidden_states = hidden_states + return [output] if self.is_driver_worker else [] + else: + return [] + + return output if type(output) is list else [output] + + def _decode_sampler_outputs(self, model_input): + use_async_out_proc = model_input.async_callback is not None + sampler_outputs = [] + num_outputs = len(self.cached_step_outputs) + for i in range(num_outputs): + next_token_ids = self.cached_step_outputs.pop(0) + next_token_ids = next_token_ids.cpu().tolist() + sampler_output = self._make_decode_output( + next_token_ids, model_input.sampling_metadata.seq_groups) + sampler_outputs.append(sampler_output) + + if i < num_outputs - 1 and use_async_out_proc: + assert model_input.async_callback is not None + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + ctx.append_output( + outputs=[sampler_output], + seq_group_metadata_list=ctx.seq_group_metadata_list, + scheduler_outputs=ctx.scheduler_outputs, + is_async=False, + is_last_step=False, + is_first_step_output=False) + model_input.async_callback() + + if use_async_out_proc: + return [sampler_outputs[-1]] else: - model_event_name = 'model_executable' - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata.selected_token_indices - ) + return sampler_outputs - if self.lora_config: - LoraMask.setLoraMask( - lora_logits_mask.index_select( - 0, sampling_metadata.selected_token_indices)) - - # Compute the logits. - with self.profiler.record_event( - 'internal', ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. 
- with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - output.outputs = output.outputs[:real_batch_size] - htorch.core.mark_step() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - self.event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - is_prompt=is_prompt) - self.profiler.record_counter(self.event_start, counters) - return [output] - - def shutdown_inc(self): - can_finalize_inc = False - from contextlib import suppress - with suppress(AttributeError): - can_finalize_inc = (self.model_config.quantization == 'inc') and \ - (self.model.model is not None) and \ - self.inc_initialized_successfully and \ - not getattr(self, "_is_inc_finalized", False) - if can_finalize_inc: - from neural_compressor.torch.quantization import ( - finalize_calibration) - finalize_calibration(self.model.model) - self._is_inc_finalized = True - - def __del__(self): - self.shutdown_inc() + def _make_decode_output( + self, + next_token_ids: List[List[int]], + seq_groups: List[SequenceGroupToSample], + ) -> SamplerOutput: + zero_logprob = Logprob(0.0) + sampler_outputs = [] + batch_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + seq_outputs = [] + for seq_id in seq_ids: + next_token_id = next_token_ids[batch_idx][0] + seq_outputs.append( + SequenceOutput(seq_id, next_token_id, + {next_token_id: zero_logprob})) + batch_idx += 1 + sampler_outputs.append( + CompletionSequenceGroupOutput(seq_outputs, None)) + return SamplerOutput(sampler_outputs) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 9401241073c7d..a83039054fc78 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -4,7 +4,11 @@ import contextlib import gc +import gzip +import json import os +import queue +import time from typing import List, Optional, Set, Tuple, Type import habana_frameworks.torch as htorch # noqa:F401 @@ -22,10 +26,11 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import bind_kv_cache +from vllm.utils import (bind_kv_cache, hpu_backend_string, hpu_device_string, + is_fake_hpu) from vllm.worker.cache_engine import CacheEngine -from vllm.worker.hpu_model_runner import HPUModelRunner -from vllm.worker.model_runner_base import ModelRunnerBase +from vllm.worker.hpu_enc_dec_model_runner import HPUEncoderDecoderModelRunner +from vllm.worker.hpu_model_runner import HPUModelRunner, HPUModelRunnerBase from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -47,7 +52,7 @@ def __init__( rank: int, distributed_init_method: str, is_driver_worker: bool = False, - model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + model_runner_cls: Optional[Type[HPUModelRunner]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) self.parallel_config.rank = rank @@ -63,8 +68,29 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner: HPUModelRunner = HPUModelRunner( - 
vllm_config=vllm_config, is_driver_worker=is_driver_worker) + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} + + is_encoder_decoder_model = self._is_encoder_decoder_model() + ModelRunnerClass: Type[HPUModelRunnerBase] = HPUModelRunner + if is_encoder_decoder_model: + ModelRunnerClass = HPUEncoderDecoderModelRunner + self.model_runner: HPUModelRunnerBase = ModelRunnerClass( + vllm_config=vllm_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + **speculative_args, + ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] @@ -76,21 +102,85 @@ def __init__( torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR logger.info("Profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir) + + if os.getenv('VLLM_PROFILER_ENABLED') == 'full': + fn = self.full_trace_handler + with_stack = False + else: + fn = torch.profiler.tensorboard_trace_handler + with_stack = True self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU, ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) + with_stack=with_stack, + on_trace_ready=fn(torch_profiler_trace_dir, use_gzip=True)) else: self.profiler = None + def full_trace_handler(self, dir_name, use_gzip=False): + + def handler_fn(prof) -> None: + if not os.path.isdir(dir_name): + try: + os.makedirs(dir_name, exist_ok=True) + except Exception as e: + raise RuntimeError("Can't create directory: " + + dir_name) from e + file_name = f"vllm.{time.time_ns()}.pt.trace.json" + file_path = os.path.join(dir_name, file_name) + prof.export_chrome_trace(file_path) + with open(file_path) as f: + pytorch_trace = json.load(f) + os.remove(file_path) + base = pytorch_trace['baseTimeNanoseconds'] / 1000 + events = self.model_runner.profiler.profiling_trace_events + while True: + try: + event_str = events.get_nowait() + event = json.loads(event_str[:-1]) + event['ts'] = event['ts'] - base + pytorch_trace['traceEvents'].append(event) + except queue.Empty: + break + + pytorch_trace['traceEvents'].append({ + "args": { + "name": "vLLM" + }, + "name": "process_name", + "ph": "M", + "pid": 1, + "tid": 0, + "ts": 0.0 + }) + if use_gzip: + file_path = file_path + ".gz" + with gzip.open(file_path, 'wt', encoding="ascii") as zipfile: + json.dump(pytorch_trace, zipfile) + else: + with open(file_path, "w") as outfile: + outfile.write(json.dumps(pytorch_trace)) + logger.info("Saved full profiling to %s", file_path) + + return handler_fn + + def _is_encoder_decoder_model(self): + return self.model_config.is_encoder_decoder + def start_profile(self): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") - self.profiler.start() + high_level_profiler = self.model_runner.profiler + with high_level_profiler.record_event('internal', 'start_profiler'): + # Clean up the queue + while True: + try: + 
high_level_profiler.profiling_trace_events.get_nowait() + except queue.Empty: + break + self.profiler.start() def stop_profile(self): if self.profiler is None: @@ -111,6 +201,8 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) + elif self.device_config.device_type == "cpu": + self.device = torch.device("cpu") else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -130,7 +222,6 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, ) -> Optional[List[SamplerOutput]]: - assert execute_model_req is not None # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501 # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 @@ -144,7 +235,8 @@ def execute_model( 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all - if log_graph_compilation or log_cpu_fallbacks: + if (log_graph_compilation or log_cpu_fallbacks) \ + and execute_model_req is not None: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([ @@ -208,6 +300,12 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. + if is_fake_hpu(): + cache_block_size = self.get_cache_block_size_bytes() + fake_hpu_cache_alloc = 4 * 2**30 # take 4 GiB flat on fake hpu + num_fake_hpu_blocks = fake_hpu_cache_alloc // cache_block_size + self.model_runner.bucketing_ctx.num_hpu_blocks = num_fake_hpu_blocks + return num_fake_hpu_blocks, 0 with HabanaMemoryProfiler() as m: self.model_runner.profile_run() torch.hpu.synchronize() @@ -244,6 +342,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) + self.model_runner.bucketing_ctx.num_hpu_blocks = num_hpu_blocks + if self.model_runner.lora_manager: self.model_runner.remove_all_loras() @@ -294,9 +394,6 @@ def _warm_up_model(self) -> None: # the model initialization and profiling. 
set_random_seed(self.model_config.seed) - def finish_measurements(self): - self.model_runner.finish_measurements() - @property def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @@ -404,11 +501,12 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, - backend='hccl') + backend=backend) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) @@ -425,15 +523,17 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: + backend = hpu_backend_string() torch.distributed.init_process_group( - backend="hccl", + backend=backend, world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, ) # A small all_reduce for warmup & checking conformance. - dummy_tensor_hpu = torch.ones(1).to('hpu') + device = hpu_device_string() + dummy_tensor_hpu = torch.ones(1).to(device) torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, @@ -467,12 +567,14 @@ def _allocate_kv_cache( kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + dtype = self.dtype + if device != 'hpu' and not is_fake_hpu() \ + and self.dtype == torch.float8_e4m3fn: + dtype = torch.uint8 for _ in range(self.num_attention_layers): - key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device) value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, + dtype=dtype, device=device) kv_layer = (key_cache, value_cache) kv_cache.append(kv_layer) diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py new file mode 100644 index 0000000000000..f2791a833c4b7 --- /dev/null +++ b/vllm/worker/multi_step_hpu_worker.py @@ -0,0 +1,116 @@ +import dataclasses +from typing import Dict, Optional, Tuple + +import torch + +from vllm.distributed import broadcast_tensor_dict +from vllm.sequence import ExecuteModelRequest +from vllm.worker.hpu_model_runner import ModelInputForHPU +from vllm.worker.hpu_worker import HPUWorker +from vllm.worker.worker_base import WorkerInput + + +class MultiStepHPUWorker(HPUWorker): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cached_model_input: Optional[ModelInputForHPU] = None + + def _get_driver_input_and_broadcast( + self, execute_model_req: ExecuteModelRequest + ) -> Tuple[ModelInputForHPU, WorkerInput, Dict[str, torch.Tensor]]: + """ + Get the driver input and broadcast it to other workers. 
+ """ + assert self.is_driver_worker + assert execute_model_req.virtual_engine == 0 + + is_first_multi_step = execute_model_req.is_first_multi_step + is_last_step = execute_model_req.is_last_step + + if is_first_multi_step: + # on first step we prepare the worker input and model input normally + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + worker_input = dataclasses.replace( + worker_input, + num_steps=execute_model_req.num_lookahead_slots + 1) + model_input: ModelInputForHPU = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list, + execute_model_req.virtual_engine, + execute_model_req.finished_requests_ids)) + + if execute_model_req.async_callback: + model_input = dataclasses.replace( + model_input, + async_callback=execute_model_req.async_callback) + else: + # on subsequent steps we reuse the worker input and model input + assert self.cached_model_input is not None + model_input = self.cached_model_input + worker_input = WorkerInput() + + model_input = dataclasses.replace( + model_input, + is_first_multi_step=is_first_multi_step, + is_last_step=is_last_step) + + if self.do_metadata_broadcast: + if is_first_multi_step: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update( + model_input.as_broadcastable_tensor_dict()) + broadcast_tensor_dict(broadcast_data, src=0) + else: + broadcast_data = { + "is_first_multi_step": is_first_multi_step, + "is_last_step": is_last_step, + } + broadcast_tensor_dict(broadcast_data, src=0) + + # Returning empty dict here to keep this compatible with + # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` + return model_input, worker_input, {} + + def prepare_input( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> Optional[Tuple[ModelInputForHPU, WorkerInput, Dict[str, + torch.Tensor]]]: + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. Send an empty input to + # notify all other workers to stop their execution loop. + broadcast_tensor_dict({}, src=0) + return None + model_input, worker_input, _ = self._get_driver_input_and_broadcast( + execute_model_req) + if model_input.is_first_multi_step: + self.cached_model_input = model_input + return model_input, worker_input, {} + else: + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + if len(broadcast_data) == 2: + assert self.cached_model_input is not None + self.cached_model_input = dataclasses.replace( + self.cached_model_input, + is_first_multi_step=broadcast_data["is_first_multi_step"], + is_last_step=broadcast_data["is_last_step"]) + empty_worker_input = WorkerInput() + return self.cached_model_input, empty_worker_input, {} + + worker_input = WorkerInput.from_broadcasted_tensor_dict( + broadcast_data) + model_input = ( + self.model_runner. 
+ make_model_input_from_broadcasted_tensor_dict(broadcast_data)) + self.cached_model_input = model_input + return model_input, worker_input, {} diff --git a/vllm/worker/selector.py b/vllm/worker/selector.py new file mode 100644 index 0000000000000..544840289e203 --- /dev/null +++ b/vllm/worker/selector.py @@ -0,0 +1,18 @@ +from vllm.platforms import current_platform + +if current_platform.is_neuron(): + from vllm.worker.neuron_worker import NeuronWorker as WorkerCls +elif current_platform.is_hpu(): + from vllm.worker.hpu_worker import HPUWorker as WorkerCls # type: ignore +elif current_platform.is_cpu(): + from vllm.worker.cpu_worker import CPUWorker as WorkerCls # type: ignore +elif current_platform.is_tpu(): + from vllm.worker.tpu_worker import TPUWorker as WorkerCls # type: ignore +elif current_platform.is_xpu(): + from vllm.worker.xpu_worker import XPUWorker as WorkerCls # type: ignore +else: + from vllm.worker.worker import Worker as WorkerCls # type: ignore + + +def init_worker(*args, **kwargs): + return WorkerCls(*args, **kwargs)
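The new vllm/worker/selector.py resolves the worker class once at import time from current_platform and exposes init_worker as a thin factory that forwards its arguments to the selected class (HPUWorker on HPU, CPUWorker on CPU, and so on). Below is a minimal usage sketch, not part of the patch: the vllm_config object and the constructor keywords (local_rank, rank, distributed_init_method, is_driver_worker) are assumptions modeled on the HPUWorker.__init__ signature shown in this diff.

# Minimal sketch only. Assumes an already-built vllm_config and the worker
# constructor keywords local_rank / rank / distributed_init_method /
# is_driver_worker, mirroring HPUWorker.__init__ above; adjust to the actual
# worker signature on your platform.
from vllm.worker.selector import init_worker

def create_local_worker(vllm_config, distributed_init_method: str):
    # init_worker simply does WorkerCls(*args, **kwargs), so the keyword
    # names must match whichever platform worker was selected at import time.
    return init_worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=True,
    )

Because the platform check happens at module import, swapping worker implementations (for example, forcing the CPU worker when is_fake_hpu() is used for testing) is a matter of environment/platform configuration rather than call-site changes.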