
Commit c62fd91

Merge branch 'main' into main

2 parents e82e946 + 6bc82cf

12 files changed (+81 / −30 lines)


.github/workflows/image_310p_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_310p_openeuler.yml'
       - 'Dockerfile.310p.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```
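The five image workflow diffs that follow apply the identical change: the same six build-system paths (setup.py, pyproject.toml, requirements.txt, cmake/**, CMakeLists.txt, csrc/**) are appended to each workflow's trigger path filter, so the images are also rebuilt when packaging metadata, CMake files, or C++ sources change, not only when vllm_ascend/** or the corresponding Dockerfile does.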

.github/workflows/image_310p_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_310p_ubuntu.yml'
      - 'Dockerfile.310p'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_a3_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_a3_openeuler.yml'
       - 'Dockerfile.a3.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_a3_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_a3_ubuntu.yml'
       - 'Dockerfile.a3'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_openeuler.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -35,6 +35,12 @@ on:
       - '.github/workflows/image_openeuler.yml'
       - 'Dockerfile.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

.github/workflows/image_ubuntu.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -36,6 +36,12 @@ on:
       - '.github/workflows/image_ubuntu.yml'
       - 'Dockerfile'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
 
 jobs:
   build:
```

tests/ut/models/test_deepseek_v2.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -122,7 +122,8 @@ def mock_distributed():
 @pytest.fixture
 def mock_forward_context():
     forward_context = Mock(in_profile_run=False, with_prefill=False)
-    with patch("vllm_ascend.models.deepseek_v2.get_forward_context", return_value=forward_context):
+    with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
+               return_value=forward_context):
         yield
 
 
@@ -213,7 +214,8 @@ def test_custom_deepseek_v2_mlp(mock_distributed, base_config):
                               quant_config=None)
 
 
-def test_custom_deepseek_v2_moe(mock_distributed, base_config, mock_forward_context):
+def test_custom_deepseek_v2_moe(mock_distributed, base_config,
+                                mock_forward_context):
     base_config.n_shared_experts = 1
     moe = CustomDeepseekV2MoE(config=base_config,
                               quant_config=None,
```
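Both hunks only re-wrap over-long lines (a `patch(...)` call and a `def` signature each split across two lines), presumably to satisfy the project's formatter; test behavior is unchanged.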

tests/ut/ops/test_fused_ops.py

Lines changed: 11 additions & 6 deletions

```diff
@@ -20,10 +20,10 @@
 import torch.nn as nn
 from pytest_mock import MockerFixture
 
+from vllm_ascend.ascend_forward_context import get_fused_moe_state
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
                                        AscendUnquantizedFusedMoEMethod)
-from vllm_ascend.utils import adapt_patch, AscendSocVersion  # noqa E402
-from vllm_ascend.ascend_forward_context import get_fused_moe_state
+from vllm_ascend.utils import AscendSocVersion, adapt_patch  # noqa E402
 
 adapt_patch(True)
 
@@ -245,8 +245,11 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
         moe.moe_parallel_config.ep_size = 1
 
         moe.quant_method = MockQuantMethod(shared_experts, num_tokens)
-        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens, dtype=torch.bool), padded_num_tokens=num_tokens)
-        with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context):
+        forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
+                                                         dtype=torch.bool),
+                                    padded_num_tokens=num_tokens)
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+                   return_value=forward_context):
             output = moe.forward(inputs,
                                  router_logits,
                                  is_prefill=is_prefill,
@@ -306,7 +309,8 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
         global_num_experts, ep_size, select_softmax = others_param
         is_prefill = False
         is_deepseek_v3_r1 = global_num_experts == 256
-        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(ep_size, is_prefill, is_deepseek_v3_r1))
+        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
+            ep_size, is_prefill, is_deepseek_v3_r1))
         with patch(
                 "vllm_ascend.ops.fused_moe.SELECT_GATING_TOPK_SOTFMAX_EXPERTS",
                 select_softmax), \
@@ -342,7 +346,8 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
         """
         ep_size, alltoall_buffer = others_param
         is_prefill = False
-        forward_context = MagicMock(fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
+        forward_context = MagicMock(
+            fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
         with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
                    alltoall_buffer), \
             patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
```

vllm_ascend/attention/attention_v1.py

Lines changed: 27 additions & 19 deletions

```diff
@@ -121,28 +121,36 @@ class AscendAttentionState(Enum):
 
 @dataclass
 class AscendMetadata:
-    num_actual_tokens: int  # Number of tokens excluding padding.
-    # (batch_size, max_blocks_per_seq).
-    # Block addresses per sequence. (Seq id -> list of physical block)
-    block_tables: torch.Tensor
-    # (batch_size,). The sequence length per sequence. Sequence length means
-    # the computed tokens + new tokens None if it is a decoding.
-    query_start_loc: torch.Tensor
-    query_lens: torch.Tensor
-    seq_lens: torch.Tensor
-    # Maximum query length in the batch. None for decoding.
-    max_query_len: Optional[int] = None
-    # (num_tokens,). The indices of the token slots that input tokens will be
-    # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size
-    # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot
-    # in block 0, and 1st slot in block 1, respectively.
-    slot_mapping: torch.Tensor = None
+
+    # **************************** Basic Properties ****************************
+    attn_mask: Optional[torch.Tensor] = None
     # Current state of this attention run.
     attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill
-    attn_mask: Optional[torch.Tensor] = None
 
-    # For logging.
-    num_input_tokens: int = 0  # Number of tokens including padding.
+    # Number of tokens excluding padding.
+    num_actual_tokens: int = 0
+
+    # The sequence length per sequence. Sequence length means the computed
+    # tokens + new tokens (is None if it is a decoding).
+    # (batch_size,)
+    seq_lens: torch.Tensor = None
+
+    query_start_loc: torch.Tensor = None
+    query_lens: torch.Tensor = None
+    # Maximum query length in the batch (None for decoding).
+    max_query_len: Optional[int] = None
+
+    # ********************** KV Cache Related Properties ***********************
+    # Block addresses per sequence (Seq id -> list of physical block).
+    # (batch_size, max_blocks_per_seq)
+    block_tables: torch.Tensor = None
+
+    # The indices of the token slots that input tokens will be stored into.
+    # E.g., if `slot_mapping` is [35, 2, 17] and the block size is 16, the
+    # three tokens are stored in the 3rd slot in block 2, 2nd slot in block 0,
+    # and 1st slot in block 1, respectively.
+    # (num_tokens,)
+    slot_mapping: torch.Tensor = None
 
 
 class AscendAttentionMetadataBuilder:
```
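The restored `slot_mapping` comment can be verified with a few lines of arithmetic: a flat slot index decomposes into a physical block index and an in-block offset via integer division and modulo. A small illustrative sketch (the `decode_slots` helper is ours, not part of the module):

```python
import torch


def decode_slots(slot_mapping: torch.Tensor, block_size: int):
    # A flat slot index s lives at offset s % block_size within
    # physical block s // block_size.
    return slot_mapping // block_size, slot_mapping % block_size


blocks, offsets = decode_slots(torch.tensor([35, 2, 17]), block_size=16)
print(blocks.tolist())   # [2, 0, 1]
print(offsets.tolist())  # [3, 2, 1]
```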

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -107,7 +107,7 @@ def forward(
             for x in (q, k, v)
         ]
 
-        context_layer = torch.torch.empty_like(q)
+        context_layer = torch.empty_like(q)
 
         # operator requires pta version >= 2.5.1
         torch_npu._npu_flash_attention_unpad(
```
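This was a latent typo rather than a crash: `torch.torch` happens to resolve back to the `torch` module itself, so `torch.torch.empty_like(q)` behaved identically to `torch.empty_like(q)`; the fix simply drops the accidental double qualification.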
