Commit 2e20797

[BUILD] Upgrade torch-npu to 2.5.1 (#661)
### What this PR does / why we need it?

torch-npu 2.5.1 is published: https://pypi.org/project/torch-npu/2.5.1/

It's time to remove all torch-npu dev versions from the vllm-ascend code base.

### Does this PR introduce _any_ user-facing change?

Yes, torch-npu 2.5.1 is now used.

### How was this patch tested?

- [ ] CI passed
- [ ] Manually test
- [ ] Grep all `dev2025`

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
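The last checklist item can be sketched as a repository-wide search for leftover dev pins. This is a sketch, not part of the commit: the sample tree below is hypothetical, and in practice the grep would run from a real vllm-ascend checkout.

```shell
# Sketch of the "Grep all dev2025" check: after the upgrade, no file in the
# tree should still reference a torch-npu dev build such as 2.5.1.dev20250320.
# The temporary sample tree here stands in for the real repository.
repo=$(mktemp -d)
printf 'torch-npu >= 2.5.1\n' > "$repo/requirements.txt"
if grep -rn "dev2025" "$repo"; then
  echo "dev pins remain"
else
  echo "clean"
fi
```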
1 parent fa4a5d9 commit 2e20797

14 files changed: +43 −88 lines changed

.github/workflows/mypy.yaml

Lines changed: 3 additions & 2 deletions

```diff
@@ -38,7 +38,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        # TODO(yikun): Add 3.12 back when torch-npu support 3.12
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Set up Python ${{ matrix.python-version }}
@@ -47,7 +48,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          pip install -r requirements-dev.txt
+          pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu

       - name: Checkout vllm-project/vllm repo
         uses: actions/checkout@v4
```
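The matrix change drops Python 3.12 until torch-npu publishes cp312 wheels. A minimal shell sketch of the resulting support check — the version list is copied from the workflow; the `check` helper itself is illustrative, not part of the commit:

```shell
# Illustrative check mirroring the CI matrix above: 3.12 is excluded until
# torch-npu publishes wheels for it; 3.9 through 3.11 remain supported.
supported="3.9 3.10 3.11"
check() {
  case " $supported " in
    *" $1 "*) echo "supported" ;;
    *) echo "unsupported" ;;
  esac
}
check "3.10"
check "3.12"
```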

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 16 deletions

```diff
@@ -97,25 +97,10 @@ jobs:
         run: |
           VLLM_TARGET_DEVICE=empty pip install -e .

-      - name: Install pta
-        run: |
-          if [ ! -d /root/.cache/pta ]; then
-            mkdir -p /root/.cache/pta
-          fi
-
-          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
-            cd /root/.cache/pta
-            rm -rf pytorch_v2.5.1_py310*
-            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-            tar -zxvf pytorch_v2.5.1_py310.tar.gz
-          fi
-
-          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -v --no-build-isolation -e .
+          pip install -v -e .

       - name: Run vllm-project/vllm-ascend test for V1 Engine
        env:
```

CMakeLists.txt

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,7 +10,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
+# TODO: Add 3.12 back when torch-npu support 3.12
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11")

 find_package(pybind11 REQUIRED)
```

Dockerfile

Lines changed: 9 additions & 9 deletions

```diff
@@ -39,21 +39,21 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 ARG VLLM_TAG=v0.8.4
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /workspace/vllm
-RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
-RUN python3 -m pip uninstall -y triton
-
-# Install torch-npu
-RUN bash /workspace/vllm-ascend/pta_install.sh
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton && \
+    python3 -m pip cache purge

 # Install vllm-ascend
+# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib:$LD_LIBRARY_PATH && \
-    export LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:$LIBRARY_PATH && \
-    python3 -m pip install -v /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)
-RUN python3 -m pip install modelscope ray
+RUN python3 -m pip install modelscope ray && \
+    python3 -m pip cache purge

 CMD ["/bin/bash"]
```
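Note the `LD_LIBRARY_PATH` change in this Dockerfile: the old line prepended the devlib directory, while the new one appends it, so entries already on the path keep search priority. A minimal sketch of the difference — the starting path value is hypothetical, and the `uname` fallback is added here only for portability of the sketch:

```shell
# Sketch: prepend vs append. Appending the Ascend devlib directory (as the
# revised Dockerfile does) keeps already-configured library paths searched
# first; prepending (the old behaviour) let the devlib stubs shadow them.
LD_LIBRARY_PATH="/usr/lib64"   # hypothetical starting value
arch=$(uname -i 2>/dev/null || uname -m)
devlib="/usr/local/Ascend/ascend-toolkit/latest/${arch}-linux/devlib"
old_path="$devlib:$LD_LIBRARY_PATH"   # old: devlib searched first
new_path="$LD_LIBRARY_PATH:$devlib"   # new: devlib searched last
echo "$old_path"
echo "$new_path"
```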

Dockerfile.openEuler

Lines changed: 0 additions & 3 deletions

```diff
@@ -42,9 +42,6 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-i
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN python3 -m pip uninstall -y triton

-# Install torch-npu
-RUN bash /workspace/vllm-ascend/pta_install.sh
-
 # Install vllm-ascend
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
```

README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -36,9 +36,9 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series
 - OS: Linux
 - Software:
-  * Python >= 3.9
+  * Python >= 3.9, < 3.12
   * CANN >= 8.0.0
-  * PyTorch >= 2.5.1, torch-npu >= 2.5.1.dev20250320
+  * PyTorch >= 2.5.1, torch-npu >= 2.5.1
   * vLLM (the same version as vllm-ascend)

 ## Getting Started
```

README.zh.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -39,7 +39,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
 - 软件:
   * Python >= 3.9
   * CANN >= 8.0.RC2
-  * PyTorch >= 2.5.1, torch-npu >= 2.5.1.dev20250320
+  * PyTorch >= 2.5.1, torch-npu >= 2.5.1
   * vLLM (与vllm-ascend版本一致)

 ## 开始使用
```

docs/source/developer_guide/versioning_policy.md

Lines changed: 12 additions & 6 deletions

```diff
@@ -61,16 +61,22 @@ As shown above:
 - `version` documentation: Corresponds to specific released versions (e.g., `v0.7.3`, `v0.7.3rc1`). No further updates after release.
 - `stable` documentation (**not yet released**): Official release documentation. Updates are allowed in real-time after release, typically based on vX.Y.Z-dev. Once stable documentation is available, non-stable versions should display a header warning: `You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.`.

+## Software Dependency Management
+- `torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable version to [PyPI](https://pypi.org/project/torch-npu)
+every 3 months, a development version (aka the POC version) every month, and a nightly version every day.
+The PyPI stable version **CAN** be used in a vLLM Ascend final version, the monthly dev version **CAN ONLY** be used in
+a vLLM Ascend RC version for rapid iteration, and the nightly version **CANNOT** be used in any vLLM Ascend version or branch.
+
 ## Release Compatibility Matrix

 Following is the Release Compatibility Matrix for vLLM Ascend Plugin:

-| vllm-ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
-|--------------|--------------| --- | --- | --- |
-| v0.8.4rc1 | v0.8.4 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250320 |
-| v0.7.3rc2 | v0.7.3 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250320 |
-| v0.7.3rc1 | v0.7.3 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250308 |
-| v0.7.1rc1 | v0.7.1 | 3.9 - 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250218 |
+| vllm-ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
+|--------------|--------------|----------------| --- | --- |
+| v0.8.4rc1 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250320 |
+| v0.7.3rc2 | v0.7.3 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250320 |
+| v0.7.3rc1 | v0.7.3 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250308 |
+| v0.7.1rc1 | v0.7.1 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1.dev20250218 |

 ## Release cadence
```
docs/source/installation.md

Lines changed: 8 additions & 29 deletions

````diff
@@ -5,15 +5,15 @@ This document describes how to install vllm-ascend manually.
 ## Requirements

 - OS: Linux
-- Python: 3.9 or higher
+- Python: >= 3.9, < 3.12
 - A hardware with Ascend NPU. It's usually the Atlas 800 A2 series.
 - Software:

-  | Software | Supported version | Note |
-  | ------------ | ----------------- | ---- |
-  | CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
-  | torch-npu | >= 2.5.1.dev20250320 | Required for vllm-ascend |
-  | torch | >= 2.5.1 | Required for torch-npu and vllm |
+  | Software  | Supported version | Note                                   |
+  |-----------|-------------------|----------------------------------------|
+  | CANN      | >= 8.0.0          | Required for vllm-ascend and torch-npu |
+  | torch-npu | >= 2.5.1          | Required for vllm-ascend               |
+  | torch     | >= 2.5.1          | Required for torch-npu and vllm        |

 You have 2 way to install:
 - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
@@ -127,27 +127,6 @@ apt update -y
 apt install -y gcc g++ cmake libnuma-dev wget
 ```

-Current version depends on a unreleased `torch-npu`, you need to install manually:
-
-```
-# Once the packages are installed, you need to install `torch-npu` manually,
-# because that vllm-ascend relies on an unreleased version of torch-npu.
-# This step will be removed in the next vllm-ascend release.
-#
-# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
-#
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py39.tar.gz
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py311.tar.gz
-#
-mkdir pta
-cd pta
-wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
-tar -xvf pytorch_v2.5.1_py310.tar.gz
-pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-cd ..
-```
-
 **[Optinal]** Config the extra-index of `pip` if you are working on a **x86** machine, so that the torch with cpu could be found:

 ```bash
@@ -181,13 +160,13 @@ or build from **source code**:
 # Install vLLM
 git clone --depth 1 --branch |vllm_version| https://github.com/vllm-project/vllm
 cd vllm
-VLLM_TARGET_DEVICE=empty pip install .
+VLLM_TARGET_DEVICE=empty pip install -e -v .
 cd ..

 # Install vLLM Ascend
 git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
-python setup.py develop
+pip install -e -v .
 cd ..
 ```
````

pta_install.sh

Lines changed: 0 additions & 16 deletions
This file was deleted.
