from vllm.v1.worker.gpu_input_batch import InputBatch

from vllm_ascend.ops.attention import vanilla_chunked_prefill
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
+                               nd_to_nz_2d, nd_to_nz_spec)


class AscendAttentionBackend(AttentionBackend):
@@ -62,6 +64,9 @@ def get_kv_cache_shape(
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
+        if is_310p():
+            return (2, num_blocks, num_kv_heads * head_size // 16, block_size,
+                    16)
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
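On 310P the cache tensor is kept in an NZ-friendly five-dimensional layout whose trailing dimension is the 16-wide fragment that ACL_FORMAT_FRACTAL_NZ works in. A minimal sketch of the shape relationship, assuming num_kv_heads * head_size is a multiple of 16 (the sample sizes below are illustrative, not taken from the diff):

import math

num_blocks, block_size, num_kv_heads, head_size = 4, 128, 8, 128

nd = (2, num_blocks, block_size, num_kv_heads, head_size)             # default layout
nz = (2, num_blocks, num_kv_heads * head_size // 16, block_size, 16)  # 310P layout

# Same storage, regrouped into 16-wide fragments along the head axis.
assert math.prod(nd) == math.prod(nz)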
@@ -160,6 +165,16 @@ def build(self, num_reqs, num_actual_tokens, max_query_len,
        query_start_loc = query_start_loc_cpu.to(self.runner.device,
                                                 non_blocking=True)

+        if is_310p():
+            if attn_state == AscendAttentionState.PrefillNoCache:
+                mask_nz = nd_to_nz_2d(attn_mask)
+                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
+                                                      ACL_FORMAT_FRACTAL_NZ)
+            elif attn_state == AscendAttentionState.ChunkedPrefill:
+                mask_nz = nd_to_nz_spec(attn_mask)
+                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
+                                                      ACL_FORMAT_FRACTAL_NZ)
+
        attn_metadata = AscendMetadata(num_actual_tokens=num_actual_tokens,
                                       block_tables=block_table,
                                       query_start_loc=query_start_loc,
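For the prefill paths on 310P the dense attention mask is rearranged into the NZ tiling before npu_format_cast tags it as ACL_FORMAT_FRACTAL_NZ; the real nd_to_nz_2d / nd_to_nz_spec helpers in vllm_ascend.utils handle the details. A rough, standalone sketch of the kind of 16-wide fragment grouping involved (hypothetical helper name, and an assumption about how the tiling is laid out):

import torch

def nz_blocks_2d(mask: torch.Tensor, frag: int = 16) -> torch.Tensor:
    # Pad both dims of a 2-D mask to a multiple of the fragment width, then
    # group columns into 16-wide fragments: (rows, cols) -> (cols // 16, rows, 16),
    # the same grouping the 310P kv-cache shape uses for its last three dims.
    rows, cols = mask.shape
    padded = torch.nn.functional.pad(mask, (0, (-cols) % frag, 0, (-rows) % frag))
    return padded.view(padded.size(0), -1, frag).permute(1, 0, 2).contiguous()

mask = torch.zeros(100, 100, dtype=torch.float16)
print(nz_blocks_2d(mask).shape)  # torch.Size([7, 112, 16])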
@@ -240,6 +255,7 @@ def forward(
                             self.head_size,
                             dtype=query.dtype,
                             device=query.device)
+        ori_output = output
        if trace_flag:
            torch.ops.vllm.unified_ascend_attention_with_output(
                query=query,
@@ -284,6 +300,18 @@ def forward(
            assert attn_metadata is not None
            assert attn_metadata.attn_mask is not None
            mask = attn_metadata.attn_mask
+            if is_310p():
+                # align q k v output tensors
+                query = aligned_16(query)
+                key = aligned_16(key)
+                value = aligned_16(value)
+                output = aligned_16(output)
+
+                # do reformat in case of broadcasted tensors
+                mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1)
+                mask = torch_npu.npu_format_cast(mask.contiguous(),
+                                                 ACL_FORMAT_FRACTAL_NZ)
+
            torch_npu._npu_flash_attention(query=query,
                                           key=key,
                                           value=value,
@@ -293,6 +321,7 @@ def forward(
                                           num_heads=self.num_heads,
                                           num_kv_heads=self.num_kv_heads,
                                           out=output)
+            output = output[:num_tokens, :, :]
        elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
            assert attn_metadata is not None
            assert attn_metadata.attn_mask is not None
@@ -310,6 +339,10 @@ def forward(
                scale_value=self.scale,
                out=output)
        elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+            if is_310p():
+                # seq_lens needs to be transferred to the device for 310P
+                attn_metadata.seq_lens = \
+                    attn_metadata.seq_lens.to(device=self.key_cache.device)
            torch_npu._npu_paged_attention(
                query=query,
                key_cache=self.key_cache,
@@ -343,6 +376,12 @@ def forward(
                                        self.scale, None, True)
            else:
                # use paged attention
+                if is_310p():
+                    # do reformat in case of broadcasted tensors
+                    attn_metadata.attn_mask = \
+                        torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), ACL_FORMAT_FRACTAL_NZ)
+                    attn_metadata.seq_lens = \
+                        attn_metadata.seq_lens.to(device=self.key_cache.device)
                torch_npu._npu_paged_attention_splitfuse(
                    query=query,
                    key_cache=self.key_cache,
@@ -355,6 +394,10 @@ def forward(
                    num_heads=self.num_heads,
                    scale_value=self.scale,
                    out=output)
+
+        # to make in-place change to the output tensor
+        if not id(ori_output) == id(output):
+            ori_output[:, :, :] = output[:num_tokens, :, :]
        return output.view(num_tokens, self.hidden_size)

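The 310P-specific padding means query/key/value/output may be swapped for longer, 16-aligned copies, which is why the diff remembers ori_output and copies the first num_tokens rows back before returning: callers rely on the original output tensor being updated in place. A minimal sketch of what a padding helper along the lines of aligned_16 could look like (hypothetical name and behavior, assuming it pads the token dimension to the next multiple of 16):

import torch

def pad_tokens_to_16(t: torch.Tensor) -> torch.Tensor:
    # Pad dim 0 (tokens) up to a multiple of 16; other dims are untouched.
    pad = (-t.size(0)) % 16
    if pad == 0:
        return t
    return torch.nn.functional.pad(t, [0, 0] * (t.dim() - 1) + [0, pad])

q = torch.randn(100, 8, 128)
q_pad = pad_tokens_to_16(q)
assert q_pad.shape[0] == 112 and torch.equal(q_pad[:100], q)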