2 changes: 1 addition & 1 deletion .github/workflows/doc_codespell.yaml
@@ -28,6 +28,6 @@ jobs:
      - name: Run codespell check
        run: |
          CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn,rever')

          codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test.yaml
@@ -86,7 +86,7 @@ jobs:
      - name: Run codespell check
        run: |
          CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn,rever')

          codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
      - name: Analysing the code with ruff
8 changes: 4 additions & 4 deletions docs/source/user_guide/additional_config.md
@@ -40,14 +40,14 @@ The details of each config option are as follows:

| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable torchair graph mode |
| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream |
| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert |
| `enabled` | bool | `False` | Whether to enable torchair graph mode. Currently, only DeepSeek series models and PanguProMoE support torchair graph mode |
| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream. This option only takes effect on models using MLA (e.g., DeepSeek). |
| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert. This option only takes effect on DeepSeek MoE models. |
| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
| `use_cached_graph` | bool | `False` | Whether to use cached graph |
| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout |
| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout. This option only takes effect on models using MLA (e.g., DeepSeek). |
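
For example, here is a minimal sketch of enabling torchair graph mode when launching an online server. It assumes the `--additional-config` flag accepts the JSON shown here, and the model name is illustrative:

```bash
# A minimal sketch (model name illustrative): enable torchair graph mode
# with a cached graph and explicit capture batch sizes.
vllm serve deepseek-ai/DeepSeek-V2-Lite \
  --additional-config '{"torchair_graph_config": {"enabled": true, "use_cached_graph": true, "graph_batch_sizes": [1, 8, 16]}}'
```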

**ascend_scheduler_config**

2 changes: 1 addition & 1 deletion docs/source/user_guide/graph_mode.md
@@ -12,7 +12,7 @@ From v0.9.1rc1 with V1 Engine, vLLM Ascend will run models in graph mode by default

There are two kinds of graph mode supported by vLLM Ascend:
- **ACLGraph**: This is the default graph mode supported by vLLM Ascend. In v0.9.1rc1, only Qwen series models are well tested.
- **TorchAirGraph**: This is the GE graph mode. In v0.9.1rc1, only DeepSeek series models are supported.
- **TorchAirGraph**: This is the GE graph mode. In v0.9.1rc1, only DeepSeek series models are supported. In v0.9.1rc2, we also support PanguProMoE with torchair.

## Using ACLGraph
ACLGraph is enabled by default. Taking Qwen series models as an example, simply enabling the V1 Engine is enough.
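
For instance, a minimal sketch assuming the `VLLM_USE_V1` environment switch (the model name is illustrative):

```bash
# A minimal sketch: select the V1 Engine, then serve a Qwen model;
# ACLGraph is used by default for supported models.
export VLLM_USE_V1=1
vllm serve Qwen/Qwen2.5-7B-Instruct
```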
2 changes: 1 addition & 1 deletion format.sh
@@ -145,7 +145,7 @@ CODESPELL_EXCLUDES=(
)

CODESPELL_IGNORE_WORDS=(
    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn'
    '-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn,rever'
)

# check spelling of specified files
17 changes: 17 additions & 0 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -165,3 +165,20 @@ def test_models_distributed_DeepSeek_W8A8():
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


def test_models_distributed_pangu():
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5

    with VllmRunner(
            snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
            max_model_len=8192,
            enforce_eager=True,
            dtype="auto",
            tensor_parallel_size=4,
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
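
A hedged sketch of running only this new case locally; the pytest invocation is an assumption about the repo's e2e flow, and `tensor_parallel_size=4` implies 4 NPUs:

```bash
# Hypothetical invocation: run only the new Pangu multicard test.
pytest -s tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_pangu
```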
60 changes: 60 additions & 0 deletions tests/e2e/multicard/test_torchair_graph_mode.py
@@ -99,3 +99,63 @@ def test_e2e_deepseekv3_with_torchair_ms_mla():
        },
    }
    _deepseek_torchair_test_fixture(additional_config)


def _pangu_torchair_test_fixture(
    additional_config: Dict,
    *,
    tensor_parallel_size=4,
):
    example_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # torchair only works without chunked-prefill for now
    kwargs = {
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    additional_config.update(**kwargs)

    with VllmRunner(
            "vllm-ascend/pangu-pro-moe-pruing",
            dtype="half",
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend="mp",
            enforce_eager=False,
            additional_config=additional_config,
    ) as vllm_model:
        # use the greedy sampler to make sure the generated results
        # are deterministic
        vllm_output = vllm_model.generate_greedy(example_prompts, 5)

    # NOTE: vllm-ascend/pangu-pro-moe-pruing is only a part of PanguProMoE
    # with 2 hidden layers, thus the golden results seem inaccurate.
    # They will only change if accuracy changes with the official weights
    # of PanguProMoE.
    golden_results = [
        'Hello, my name is Remempondeprecatedmiot忱',
        'The president of the United States is Remem下的一个 rever ceremoni Segnali',
        'The capital of France is Rememvoud administrativ Remem投',
        'The future of AI isotope Segnali Zoeken精细化 supus',
    ]

    assert len(golden_results) == len(vllm_output)
    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="torchair graph is not supported on v0")
def test_e2e_pangu_with_torchair():
    additional_config = {
        "torchair_graph_config": {
            "enabled": True,
        },
    }
    _pangu_torchair_test_fixture(additional_config)
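
Likewise, a hedged sketch for the torchair case; the `VLLM_USE_V1=1` setting mirrors the skip condition above:

```bash
# Hypothetical invocation: torchair graph mode requires the V1 engine.
VLLM_USE_V1=1 pytest -s tests/e2e/multicard/test_torchair_graph_mode.py::test_e2e_pangu_with_torchair
```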