
Commit eba4fb3

Merge branch 'vllm-project:main' into main

2 parents 197981d + a93bed4, commit eba4fb3
36 files changed: +815 -223 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     concurrency:
       group: >
         ${{

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        vllm_version: [main, v0.8.5.post1]
+        vllm_version: [main, v0.9.0]
     name: vLLM Ascend long term test
     runs-on: linux-arm64-npu-1
     container:

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 5 additions & 7 deletions

@@ -40,7 +40,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.8.5.post1]
+        vllm_verison: [main, v0.9.0]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8

@@ -55,12 +55,6 @@ jobs:
       options: >-
         --device /dev/davinci0
         --device /dev/davinci1
-        --device /dev/davinci2
-        --device /dev/davinci3
-        --device /dev/davinci4
-        --device /dev/davinci5
-        --device /dev/davinci6
-        --device /dev/davinci7
         --device /dev/davinci_manager
         --device /dev/devmm_svm
         --device /dev/hisi_hdc

@@ -105,3 +99,7 @@ jobs:
       run: |
         pip install -r requirements-dev.txt
         pip install -v -e .
+
+    - name: Run vllm-project/vllm-ascend PD Disaggregation test
+      run: |
+        pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.8.5.post1
+ARG VLLM_TAG=v0.9.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

examples/disaggregated_prefill/disaggregated_prefill_offline.py

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@
 from multiprocessing import Event, Process
 
 kv_connector_extra_config = {
-    "prompt_device_ips": ["1.2.3.1", "1.2.3.2"],
+    "prefill_device_ips": ["1.2.3.1", "1.2.3.2"],
     "decode_device_ips": ["1.2.3.9", "1.2.3.10"],
     "llmdatadist_comm_port": 26000,
 }
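For context on how the renamed key is consumed, the sketch below shows one plausible way this dict is wired into vLLM's KV-transfer configuration on the prefill side. It is not code from this commit: the connector name "LLMDataDistConnector", the kv_role value, and the exact LLM keyword argument are assumptions about the surrounding example, included only to make the rename concrete.

# Hedged sketch, not part of the diff: pass the renamed "prefill_device_ips"
# key through vLLM's KV-transfer config on a prefill instance.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

kv_connector_extra_config = {
    "prefill_device_ips": ["1.2.3.1", "1.2.3.2"],  # key renamed in this commit
    "decode_device_ips": ["1.2.3.9", "1.2.3.10"],
    "llmdatadist_comm_port": 26000,
}

kv_transfer_config = KVTransferConfig(
    kv_connector="LLMDataDistConnector",  # placeholder connector name (assumption)
    kv_role="kv_producer",                # prefill side produces KV caches
    kv_connector_extra_config=kv_connector_extra_config,
)

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
          kv_transfer_config=kv_transfer_config)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)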

examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py

Lines changed: 9 additions & 2 deletions

@@ -181,6 +181,13 @@ async def handle_request():
 
 
 if __name__ == "__main__":
-    t = start_service_discovery("0.0.0.0", 30001)
-    app.run(host="0.0.0.0", port=10001)
+    import argparse
+    parser = argparse.ArgumentParser(
+        description="args of disaggregated-prefill proxy")
+    parser.add_argument("--http-port", type=int, default=10001)
+    parser.add_argument("--register-port", type=int, default=10002)
+    args = parser.parse_args()
+
+    t = start_service_discovery("0.0.0.0", args.register_port)
+    app.run(host="0.0.0.0", port=args.http_port)
     t.join()
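Usage note (not part of the diff): with these flags the proxy's HTTP port and the service-discovery registration port become configurable instead of being hard-coded to 10001 and 30001. A minimal local launch from the repository root might look like the sketch below; invoking the script via subprocess here is only an illustration.

# Sketch of launching the updated proxy with the flags added in this commit.
# The script path and default port values come from the diff itself.
import subprocess

subprocess.run([
    "python",
    "examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py",
    "--http-port", "10001",      # port served by app.run()
    "--register-port", "10002",  # port passed to start_service_discovery()
], check=True)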

packages.txt

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
 git
 vim
 wget
+jq
+curl

requirements-dev.txt

Lines changed: 2 additions & 0 deletions

@@ -10,3 +10,5 @@ types-jsonschema
 xgrammar
 zmq
 numba
+quart
+types-psutil

tests/compile/test_aclgraph.py

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Compare the outputs of vLLM with and without aclgraph.
+
+Run `pytest tests/compile/test_aclgraph.py`.
+"""
+
+import os
+
+import pytest
+import torch
+from vllm import LLM, SamplingParams
+
+from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal
+from vllm_ascend.utils import vllm_version_is
+
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="aclgraph only support on v1")
+@pytest.mark.skipif(
+    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
+    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [32])
+def test_models(
+    model: str,
+    max_tokens: int,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    with monkeypatch.context() as m:
+        prompts = [
+            "Hello, my name is", "The president of the United States is",
+            "The capital of France is", "The future of AI is"
+        ]
+
+        # aclgraph only support on v1
+        m.setenv("VLLM_USE_V1", "1")
+
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         temperature=0.0)
+        # TODO: change to use vllmrunner when the registry of custom op is solved
+        # while running pytest
+        vllm_model = LLM(model)
+        vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
+        del vllm_model
+        torch.npu.empty_cache()
+
+        vllm_model = LLM(model, enforce_eager=True)
+        vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
+        del vllm_model
+        torch.npu.empty_cache()
+
+    vllm_aclgraph_outputs_list = []
+    for output in vllm_aclgraph_outputs:
+        vllm_aclgraph_outputs_list.append(
+            (output.outputs[0].index, output.outputs[0].text))
+
+    vllm_eager_outputs_list = []
+    for output in vllm_eager_outputs:
+        vllm_eager_outputs_list.append(
+            (output.outputs[0].index, output.outputs[0].text))
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_eager_outputs_list,
+        outputs_1_lst=vllm_aclgraph_outputs_list,
+        name_0="vllm_eager_outputs",
+        name_1="vllm_aclgraph_outputs",
+    )
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="aclgraph only support on v1")
+@pytest.mark.skipif(
+    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
+    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
+def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        m.setenv("VLLM_USE_V1", "1")
+        with pytest.raises(NotImplementedError) as excinfo:
+            VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
+                       max_model_len=1024,
+                       enforce_eager=False)
+        assert "ACL Graph does not support deepseek" in str(excinfo.value)
