
Commit 1469d79

[CI/UT][Refactor] move e2e spec decode and deepseek acc test to per pr
Signed-off-by: MengqingCao <cmq0113@163.com>
Parent: ebb2a70

6 files changed, +45 -40 lines changed

.github/workflows/vllm_ascend_test.yaml (4 additions, 0 deletions)

@@ -239,6 +239,10 @@ jobs:
           --ignore=tests/e2e/singlecard/test_ilama_lora.py \
           --ignore=tests/e2e/singlecard/test_guided_decoding.py \
           --ignore=tests/e2e/singlecard/test_camem.py
+          # ------------ spec decode e2e test on v1 ------------ #
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+          # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
+          # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py

       - name: Run e2e test on V0 engine
         if: ${{ github.event_name == 'schedule' }}

.github/workflows/vllm_ascend_test_long_term.yaml (8 additions, 12 deletions)

@@ -42,7 +42,10 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        os: [linux-arm64-npu-1,
+             # revert me if requires multi-card test
+             # linux-arm64-npu-4
+             ]
         vllm_version: [main, v0.9.1]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}

@@ -96,14 +99,7 @@ jobs:

       - name: Run vllm-project/vllm-ascend long term test
         run: |
-          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
-            # spec decode test
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
-            # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-            # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
-            pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
-            pytest -sv tests/e2e/long_term/test_accuracy.py
-          else
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
-          fi
+          # ------------ spec decode test ------------ #
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
+          pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+          pytest -sv tests/e2e/long_term/test_accuracy.py

tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py renamed to tests/e2e/multicard/test_deepseek_v2_lite_tp2_accuracy.py (1 addition, 1 deletion)

@@ -38,7 +38,7 @@


 def run_test(model_name, queue, more_args=None):
-    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4"
+    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
     if more_args is not None:
         model_args = f"{model_args},{more_args}"
     results = lm_eval.simple_evaluate(
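For context, a self-contained sketch of how an accuracy check like this one can be driven end to end. Only the model_args string comes from the diff above; the lm-eval backend name, the task choice, and the queue plumbing are illustrative assumptions (the spawn-per-run pattern matches the "it needs a clean process" comment in the workflow above).

import lm_eval
from multiprocessing import get_context

def run_test(model_name, queue, more_args=None):
    model_args = (f"pretrained={model_name},max_model_len=4096,"
                  "trust_remote_code=True,tensor_parallel_size=4,"
                  "enforce_eager=True")
    if more_args is not None:
        model_args = f"{model_args},{more_args}"
    results = lm_eval.simple_evaluate(
        model="vllm",      # assumption: lm-eval's vLLM backend
        model_args=model_args,
        tasks=["gsm8k"],   # hypothetical task; the real test may use another
    )
    queue.put(results["results"])

def run_in_clean_process(model_name):
    # Spawn a fresh interpreter so accelerator state left over from a
    # previous test cannot skew the accuracy measurement.
    ctx = get_context("spawn")
    queue = ctx.Queue()
    proc = ctx.Process(target=run_test, args=(model_name, queue))
    proc.start()
    results = queue.get()
    proc.join()
    return results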

tests/e2e/multicard/test_offline_inference_distributed.py (32 additions, 27 deletions)

@@ -25,8 +25,10 @@

 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
+import pytest

 from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal

 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

@@ -46,21 +48,6 @@ def test_models_distributed_QwQ():
         vllm_model.generate_greedy(example_prompts, max_tokens)


-def test_models_distributed_DeepSeek():
-    example_prompts = [
-        "Hello, my name is",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1"})
 def test_models_distributed_topk() -> None:
     example_prompts = [

@@ -83,18 +70,36 @@ def test_models_distributed_topk() -> None:
         vllm_model.generate(example_prompts, sampling_params)


-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
-def test_models_distributed_DeepSeek_dbo():
-    example_prompts = ["The president of the United States is"] * 41
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
+def test_models_distributed_DeepSeek_dbo(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ASCEND_ENABLE_DBO", "1")
+
+        example_prompts = ["The president of the United States is"] * 41
+        dtype = "half"
+        sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+        with VllmRunner(
+                "deepseek-ai/DeepSeek-V2-Lite",
+                dtype=dtype,
+                tensor_parallel_size=4,
+                distributed_executor_backend="mp",
+        ) as vllm_model:
+            dpo_output = vllm_model.generate(example_prompts, sampling_params)
+
+        m.setenv("VLLM_ASCEND_ENABLE_DBO", "0")
+        with VllmRunner(
+                "deepseek-ai/DeepSeek-V2-Lite",
+                dtype=dtype,
+                tensor_parallel_size=4,
+                distributed_executor_backend="mp",
+        ) as vllm_model:
+            output = vllm_model.generate(example_prompts, sampling_params)
+
+        check_outputs_equal(
+            outputs_0_lst=output,
+            outputs_1_lst=dpo_output,
+            name_0="vllm_outputs",
+            name_1="vllm_dbo_outputs",
+        )


 def test_models_distributed_DeepSeek_W8A8():
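The refactored DBO test now compares a DBO-enabled run against a plain run under greedy sampling instead of only checking that generation completes. A rough sketch of what the check_outputs_equal helper imported above is assumed to do, modeled on vLLM's tests/model_utils; the (token_ids, text) tuple layout is an assumption:

from typing import Sequence, Tuple

TokensText = Tuple[Sequence[int], str]

def check_outputs_equal(*, outputs_0_lst: Sequence[TokensText],
                        outputs_1_lst: Sequence[TokensText],
                        name_0: str, name_1: str) -> None:
    # With temperature=0.0 both runs are greedy, so DBO on/off should be
    # token-for-token identical; any divergence signals a correctness bug.
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for prompt_idx, (outputs_0, outputs_1) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        output_ids_0, output_str_0 = outputs_0
        output_ids_1, output_str_1 = outputs_1
        assert output_str_0 == output_str_1, (
            f"prompt {prompt_idx}: {name_0} != {name_1}")
        assert output_ids_0 == output_ids_1, (
            f"prompt {prompt_idx}: token ids differ ({name_0} vs {name_1})")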
