
Commit e95cc8b

add v0.8.3 test
Signed-off-by: wangli <wangli858794774@gmail.com>
1 parent: c55b224

2 files changed: 197 additions, 67 deletions


.github/workflows/vllm_ascend_test_main.yaml renamed to .github/workflows/vllm_ascend_test.yaml

Lines changed: 168 additions & 24 deletions
@@ -25,7 +25,7 @@ on:
     paths:
       - '*.txt'
       - '**/*.py'
-      - '.github/workflows/vllm_ascend_test_main.yaml'
+      - '.github/workflows/vllm_ascend_test.yaml'
       - '!docs/**'
       - 'pytest.ini'

@@ -72,7 +72,6 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
-          fetch-depth: 0
           path: ./vllm-empty

       - name: Install vllm-project/vllm from source
@@ -100,45 +99,195 @@ jobs:

           pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

-      - name: Run vllm-project/vllm-ascend test
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests

+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'not multinpu' tests
+
       - name: Run vllm-project/vllm test for V0 Engine
         env:
           VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           pytest -sv
-
-      - name: Checkout to vllm 0.8.3
+
+  test-multinpu:
+    name: vLLM Ascend test (multi-npu)
+    runs-on: linux-arm64-npu-4
+    container:
+      image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
+    env:
+      HF_ENDPOINT: https://hf-mirror.com
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          # sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update -y
+          apt-get -y install git wget
+
+      - name: Config git
+        run: |
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+      - name: Install pta
+        run: |
+          if [ ! -d /root/.cache/pta ]; then
+            mkdir -p /root/.cache/pta
+          fi
+
+          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
+            cd /root/.cache/pta
+            rm -rf pytorch_v2.5.1_py310*
+            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
+            tar -zxvf pytorch_v2.5.1_py310.tar.gz
+          fi
+
+          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+      - name: Run vllm-project/vllm-ascend test on V0 engine
+        env:
+          VLLM_USE_V1: 0
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests
+
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'multinpu' tests
+
+  test-singlenpu-v0.8.3:
+    name: vLLM Ascend test (single-npu)
+    runs-on: linux-arm64-npu-1 # actionlint-ignore: runner-label
+    container:
+      image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+          apt-get update -y
+          apt install git -y
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        run: |
+          apt-get -y install `cat packages.txt`
+          apt-get -y install gcc g++ cmake libnuma-dev
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          ref: v0.8.3
+          path: ./vllm-empty
+
+      - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty
         run: |
-          git checkout v0.8.3
           VLLM_TARGET_DEVICE=empty pip install -e .

-      - name: Run vllm-project/vllm-ascend test with vllm v0.8.3
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+      - name: Install pta
+        run: |
+          if [ ! -d /root/.cache/pta ]; then
+            mkdir -p /root/.cache/pta
+          fi
+
+          if [ ! -f /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ]; then
+            cd /root/.cache/pta
+            rm -rf pytorch_v2.5.1_py310*
+            wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
+            tar -zxvf pytorch_v2.5.1_py310.tar.gz
+          fi
+
+          pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           VLLM_USE_V1=0 pytest -sv -m 'not multinpu' tests
-
-      - name: Run vllm-project/vllm test for V0 Engine with vllm v0.8.3
+
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
+        env:
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          pytest -sv -m 'not multinpu' tests
+
+      - name: Run vllm-project/vllm test for V0 Engine
         env:
           VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           HF_ENDPOINT: https://hf-mirror.com
         run: |
           pytest -sv
-
-  test-multinpu:
+
+  test-multinpu-v0.8.3:
     name: vLLM Ascend test (multi-npu)
-    runs-on: linux-arm64-npu-4
+    runs-on: linux-arm64-npu-4
+    needs: test-multinpu
     container:
       image: ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
     env:
@@ -175,7 +324,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           repository: vllm-project/vllm
-          fetch-depth: 0
+          ref: v0.8.3
           path: ./vllm-empty

       - name: Install vllm-project/vllm from source
@@ -202,22 +351,17 @@ jobs:
           fi

           pip install /root/.cache/pta/torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
-      - name: Run vllm-project/vllm-ascend test
+      - name: Run vllm-project/vllm-ascend test on V0 engine
         env:
           VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          HF_ENDPOINT: https://hf-mirror.com
         run: |
-          pytest -sv -m multinpu tests/
+          VLLM_USE_V1=0 pytest -sv -m 'multinpu' tests

-      - name: Checkout to vllm 0.8.3
-        working-directory: ./vllm-empty
-        run: |
-          git checkout v0.8.3
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Run vllm-project/vllm-ascend test with vllm v0.8.3
+      - name: Run vllm-project/vllm-ascend test for V1 Engine
         env:
-          VLLM_USE_V1: 0
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_USE_V1: 1
+          VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_ENDPOINT: https://hf-mirror.com
         run: |
-          pytest -sv -m multinpu tests/
+          pytest -sv -m 'multinpu' tests

tests/test_offline_inference.py

Lines changed: 29 additions & 43 deletions
@@ -35,58 +35,44 @@


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("use_v1", ["1", "0"])
 @pytest.mark.parametrize("dtype", ["half", "float16"])
 @pytest.mark.parametrize("max_tokens", [5])
-def test_models(model: str, use_v1: str, dtype: str, max_tokens: int,
-                monkeypatch: pytest.MonkeyPatch) -> None:
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", use_v1)
-        if use_v1 == '1':
-            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-        # 5042 tokens for gemma2
-        # gemma2 has alternating sliding window size of 4096
-        # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = "The following numbers of the sequence " + ", ".join(
-            str(i) for i in range(1024)) + " are:"
-        example_prompts = [prompt]
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]

-        with VllmRunner(model,
-                        max_model_len=8192,
-                        dtype=dtype,
-                        enforce_eager=False,
-                        gpu_memory_utilization=0.7) as vllm_model:
-            vllm_model.generate_greedy(example_prompts, max_tokens)
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)


-# Now our pvc reading speed is too slow
-# For faster testing, temporarily uncheck the support for testing large weight models on v1
 @pytest.mark.multinpu
-@pytest.mark.parametrize("use_v1", ["0"])
 @pytest.mark.parametrize("model, distributed_executor_backend", [
     ("Qwen/QwQ-32B", "mp"),
 ])
-def test_models_distributed(vllm_runner, use_v1: str, model: str,
-                            distributed_executor_backend: str,
-                            monkeypatch: pytest.MonkeyPatch) -> None:
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", use_v1)
-        if use_v1 == '1':
-            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-        example_prompts = [
-            "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-            "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-            "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-        ]
-        dtype = "half"
-        max_tokens = 5
-        with vllm_runner(
-                model,
-                dtype=dtype,
-                tensor_parallel_size=4,
-                distributed_executor_backend=distributed_executor_backend,
-        ) as vllm_model:
-            vllm_model.generate_greedy(example_prompts, max_tokens)
+def test_models_distributed(vllm_runner, model: str,
+                            distributed_executor_backend: str) -> None:
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)


 if __name__ == "__main__":
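
Note on the `-m 'multinpu'` / `-m 'not multinpu'` selection used by the workflow steps above: pytest only recognizes the `multinpu` marker if it has been registered, which this repository presumably does in its pytest.ini (the workflow's `paths` filter watches that file). A minimal, purely illustrative conftest.py sketch of an equivalent registration, not taken from this commit:

    # conftest.py -- illustrative sketch only; the repository is assumed to
    # register the marker in pytest.ini rather than here.
    def pytest_configure(config):
        # Register the custom marker so that `pytest -m 'multinpu'` and
        # `pytest -m 'not multinpu'` select tests without unknown-marker warnings.
        config.addinivalue_line(
            "markers", "multinpu: tests that require more than one Ascend NPU")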
