Merged
5 changes: 3 additions & 2 deletions .github/workflows/mypy.yaml
@@ -38,7 +38,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
# TODO: Add 3.12 back when torch-npu support 3.12
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
@@ -47,7 +48,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
pip install -r requirements-dev.txt

- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
15 changes: 0 additions & 15 deletions .github/workflows/vllm_ascend_test.yaml
@@ -89,21 +89,6 @@ jobs:
pip install -r requirements-dev.txt
pip install -e .

- name: Install pta
run: |
if [ ! -d /root/.cache/pta ]; then
mkdir -p /root/.cache/pta
fi

if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
cd /root/.cache/pta
rm -rf pytorch_v2.5.1_py310*
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -zxvf pytorch_v2.5.1_py310.tar.gz
fi

pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

- name: Run vllm-project/vllm-ascend e2e test
run: |
pytest -sv tests/test_offline_inference.py
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -10,7 +10,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")

set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# TODO: Add 3.12 back when torch-npu support 3.12
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11")

find_package(pybind11 REQUIRED)

16 changes: 7 additions & 9 deletions Dockerfile
@@ -37,18 +37,16 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.7.3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /workspace/vllm
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN python3 -m pip uninstall -y triton
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge

# Install vllm-ascend
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

# Install torch-npu
RUN bash /workspace/vllm-ascend/pta_install.sh
RUN python3 -m pip install -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
# TODO(yikun): Remove "<1.23.0" after v0.7.4 which resloved by https://github.com/vllm-project/vllm/pull/13807
RUN python3 -m pip install "modelscope<1.23.0" ray
# TODO(yikun): Remove "<1.23.0" after v0.8.4 which resloved by https://github.com/vllm-project/vllm/pull/13807
RUN python3 -m pip install "modelscope<1.23.0" ray && \
python3 -m pip cache purge

CMD ["/bin/bash"]
4 changes: 2 additions & 2 deletions README.md
@@ -34,9 +34,9 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l

- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series
- Software:
* Python >= 3.9
* Python >= 3.9, < 3.12
* CANN >= 8.0.0
* PyTorch >= 2.5.1, torch-npu >= 2.5.1.dev20250320
* PyTorch >= 2.5.1, torch-npu >= 2.5.1
* vLLM (the same version as vllm-ascend)

Find more about how to setup your environment step by step in [here](docs/source/installation.md).
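The tightened constraints above (`Python >= 3.9, < 3.12`, `torch-npu >= 2.5.1`) can be expressed as machine-checkable specifiers. A minimal sketch, assuming the third-party `packaging` library is available and the constraint strings follow PEP 440 (the dict below is illustrative, not part of the repo):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Constraints as stated in the updated README (illustrative, not authoritative)
CONSTRAINTS = {
    "python": SpecifierSet(">=3.9,<3.12"),
    "torch": SpecifierSet(">=2.5.1"),
    "torch-npu": SpecifierSet(">=2.5.1"),
}

def satisfies(name: str, installed: str) -> bool:
    """True if the installed version satisfies the declared constraint."""
    return Version(installed) in CONSTRAINTS[name]

print(satisfies("python", "3.11.8"))  # True: 3.11 is still supported
print(satisfies("python", "3.12.0"))  # False: excluded until torch-npu supports 3.12
```

Note that `<3.12` excludes all 3.12.x releases, which is exactly the behavior the TODO comments in the workflow and CMakeLists rely on.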
4 changes: 2 additions & 2 deletions README.zh.md
@@ -34,9 +34,9 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个让vLLM在Ascend NPU无缝运行的

- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列
- 软件:
* Python >= 3.9
* Python >= 3.9, < 3.12
* CANN >= 8.0.RC2
* PyTorch >= 2.5.1, torch-npu >= 2.5.1.dev20250320
* PyTorch >= 2.5.1, torch-npu >= 2.5.1
* vLLM (与vllm-ascend版本一致)

在[此处](docs/source/installation.md),您可以了解如何逐步准备环境。
29 changes: 5 additions & 24 deletions docs/source/installation.md
@@ -5,15 +5,15 @@ This document describes how to install vllm-ascend manually.
## Requirements

- OS: Linux
- Python: 3.9 or higher
- Python: >= 3.9, < 3.12
- A hardware with Ascend NPU. It's usually the Atlas 800 A2 series.
- Software:

| Software | Supported version | Note |
| ------------ | ----------------- | ---- |
| Software | Supported version | Note |
| ------------ | ----------------- | -------------------------------------- |
| CANN | >= 8.0.0 | Required for vllm-ascend and torch-npu |
| torch-npu | >= 2.5.1.dev20250320 | Required for vllm-ascend |
| torch | >= 2.5.1 | Required for torch-npu and vllm |
| torch-npu | >= 2.5.1 | Required for vllm-ascend |
| torch | >= 2.5.1 | Required for torch-npu and vllm |

You have 2 way to install:
- **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
Expand Down Expand Up @@ -156,25 +156,6 @@ pip install -e . --extra-index https://download.pytorch.org/whl/cpu/
```
:::

Current version depends on a unreleased `torch-npu`, you need to install manually:

```
# Once the packages are installed, you need to install `torch-npu` manually,
# because that vllm-ascend relies on an unreleased version of torch-npu.
# This step will be removed in the next vllm-ascend release.
#
# Here we take python 3.10 on aarch64 as an example. Feel free to install the correct version for your environment. See:
#
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py39.tar.gz
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
# https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py311.tar.gz
#
mkdir pta
cd pta
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
tar -xvf pytorch_v2.5.1_py310.tar.gz
pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
```
::::

::::{tab-item} Using docker
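Since the revised requirements table drops the `.dev20250320` pin, a released `torch-npu >= 2.5.1` is sufficient and the manual wheel download disappears. A small post-install sanity-check sketch, using only the standard library; the distribution names and minimums mirror the table and are otherwise illustrative:

```python
from importlib.metadata import PackageNotFoundError, version

# Minimum versions from the updated requirements table (illustrative)
MINIMUMS = {"torch": "2.5.1", "torch-npu": "2.5.1"}

def meets_minimum(installed: str, minimum: str) -> bool:
    """Compare release versions numerically, so 2.10.0 ranks above 2.5.1."""
    def key(v: str) -> tuple:
        # Keep numeric components only; dev suffixes like dev20250320 are dropped
        return tuple(int(part) for part in v.split(".") if part.isdigit())
    return key(installed) >= key(minimum)

for name, minimum in MINIMUMS.items():
    try:
        installed = version(name)
        print(f"{name} {installed}: ok={meets_minimum(installed, minimum)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```

This is a rough numeric comparison (a dev build counts the same as its release); for strict PEP 440 ordering, use `packaging.version.Version` instead.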
16 changes: 0 additions & 16 deletions pta_install.sh

This file was deleted.

15 changes: 10 additions & 5 deletions pyproject.toml
@@ -1,14 +1,19 @@
[build-system]
# Should be mirrored in requirements.txt
requires = [
"setuptools>=64",
"setuptools-scm>=8",
"cmake>=3.26",
"pybind11",
"decorator",
"numpy<2.0.0",
"packaging",
"pip",
"pybind11",
"pyyaml",
"scipy",
"torch_npu >= 2.5.1rc1",
"torch >= 2.5.1"
"setuptools>=64",
"setuptools-scm>=8",
"torch-npu==2.5.1",
"torch>=2.5.1",
"torchvision<0.21.0",
"wheel",
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion requirements-lint.txt
@@ -9,7 +9,7 @@ clang-format==18.1.5
sphinx-lint==1.0.0

# type checking
mypy==1.11.1
mypy==1.15.0
types-PyYAML
types-requests
types-setuptools
16 changes: 12 additions & 4 deletions requirements.txt
@@ -1,7 +1,15 @@
# Should be mirrored in pyporject.toml
cmake>=3.26
decorator
numpy<2.0.0
packaging
pip
pybind11
pyyaml
scipy
pybind11
setuptools
setuptools-scm
numpy==1.26.4
setuptools>=64
setuptools-scm>=8
torch-npu==2.5.1
torch>=2.5.1
torchvision<0.21.0
wheel
2 changes: 1 addition & 1 deletion vllm_ascend/models/qwen2_5_vl.py
@@ -95,7 +95,7 @@ def forward(

context_layer = torch.torch.empty_like(q)

# operator requires pta version >= 2.5.1.dev20250226
# operator requires pta version >= 2.5.1
torch_npu._npu_flash_attention_unpad(
query=q,
key=k,
2 changes: 1 addition & 1 deletion vllm_ascend/models/qwen2_vl.py
@@ -86,7 +86,7 @@ def forward(

context_layer = torch.torch.empty_like(q)

# operator requires pta version >= 2.5.1.dev20250226
# operator requires pta version >= 2.5.1
torch_npu._npu_flash_attention_unpad(
query=q,
key=k,