 from vllm.v1.worker.gpu_input_batch import InputBatch

 from vllm_ascend.ops.attention import vanilla_chunked_prefill
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
+                               nd_to_nz_2d, nd_to_nz_spec)


 class AscendAttentionBackend(AttentionBackend):
@@ -62,6 +64,9 @@ def get_kv_cache_shape(
         num_kv_heads: int,
         head_size: int,
     ) -> Tuple[int, ...]:
+        if is_310p():
+            return (2, num_blocks, num_kv_heads * head_size // 16, block_size,
+                    16)
         return (2, num_blocks, block_size, num_kv_heads, head_size)

     @staticmethod
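On 310P the KV cache is laid out in the ACL FRACTAL_NZ style: the per-block hidden dimension `num_kv_heads * head_size` is tiled into 16-element fragments instead of being stored as `(block_size, num_kv_heads, head_size)`. A minimal sketch (not from the patch, hypothetical sizes, assuming the hidden dimension is divisible by 16) to show that the two shapes hold exactly the same number of elements:

```python
# The NZ-style shape used on 310P only changes the tiling, not the capacity.
num_blocks, block_size, num_kv_heads, head_size = 64, 128, 8, 128

nd_shape = (2, num_blocks, block_size, num_kv_heads, head_size)
nz_shape = (2, num_blocks, num_kv_heads * head_size // 16, block_size, 16)

def numel(shape):
    n = 1
    for dim in shape:
        n *= dim
    return n

assert numel(nd_shape) == numel(nz_shape)  # same storage, different layout
```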
@@ -167,6 +172,16 @@ def build(self,
         query_start_loc = query_start_loc_cpu.to(self.runner.device,
                                                  non_blocking=True)

+        if is_310p():
+            if attn_state == AscendAttentionState.PrefillNoCache:
+                mask_nz = nd_to_nz_2d(attn_mask)
+                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
+                                                      ACL_FORMAT_FRACTAL_NZ)
+            elif attn_state == AscendAttentionState.ChunkedPrefill:
+                mask_nz = nd_to_nz_spec(attn_mask)
+                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
+                                                      ACL_FORMAT_FRACTAL_NZ)
+
         attn_metadata = AscendMetadata(
             num_actual_tokens=num_actual_tokens,
             block_tables=block_table,
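The builder converts the attention mask to FRACTAL_NZ once, at metadata-build time, using `nd_to_nz_2d` for uncached prefill and `nd_to_nz_spec` for chunked prefill, so the per-step kernels receive a mask that is already in the layout they expect. As a rough, heavily hedged illustration of what an ND-to-NZ repack of a 2-D mask involves (pad both dims up to multiples of 16, then tile the last dimension); the real helpers in `vllm_ascend.utils` may differ in detail:

```python
import torch

def nd_to_nz_2d_sketch(mask_nd: torch.Tensor) -> torch.Tensor:
    """Sketch only: pad a 2-D (n, m) tensor to multiples of 16 and tile it
    as (m_pad // 16, n_pad, 16), the shape family FRACTAL_NZ expects.
    The actual nd_to_nz_2d / nd_to_nz_spec helpers may differ."""
    n, m = mask_nd.shape
    n_pad = (n + 15) // 16 * 16
    m_pad = (m + 15) // 16 * 16
    padded = mask_nd.new_zeros(n_pad, m_pad)
    padded[:n, :m] = mask_nd
    return padded.view(n_pad, m_pad // 16, 16).permute(1, 0, 2).contiguous()
```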
@@ -250,6 +265,7 @@ def forward(
                              self.head_size,
                              dtype=query.dtype,
                              device=query.device)
+        ori_output = output
         if trace_flag:
             torch.ops.vllm.unified_ascend_attention_with_output(
                 query=query,
@@ -294,6 +310,18 @@ def forward(
                 assert attn_metadata is not None
                 assert attn_metadata.attn_mask is not None
                 mask = attn_metadata.attn_mask
+                if is_310p():
+                    # pad q, k, v and output so the token dim is 16-aligned
+                    query = aligned_16(query)
+                    key = aligned_16(key)
+                    value = aligned_16(value)
+                    output = aligned_16(output)
+
+                    # materialize the broadcast mask and cast it to NZ format
+                    mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1)
+                    mask = torch_npu.npu_format_cast(mask.contiguous(),
+                                                     ACL_FORMAT_FRACTAL_NZ)
+
                 torch_npu._npu_flash_attention(query=query,
                                                key=key,
                                                value=value,
@@ -303,6 +331,7 @@ def forward(
                                                num_heads=self.num_heads,
                                                num_kv_heads=self.num_kv_heads,
                                                out=output)
+                output = output[:num_tokens, :, :]
             elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
                 assert attn_metadata is not None
                 assert attn_metadata.attn_mask is not None
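The 310P flash-attention kernel operates on 16-token-aligned inputs, so q/k/v and the output are padded up front and the padded rows are dropped again right after the call (the `output = output[:num_tokens, :, :]` line above). A sketch of what a pad-to-16 helper like `aligned_16` plausibly does, assuming it zero-pads the token (first) dimension; the real helper lives in `vllm_ascend.utils` and may differ:

```python
import torch
import torch.nn.functional as F

def aligned_16_sketch(x: torch.Tensor) -> torch.Tensor:
    """Sketch: zero-pad dim 0 up to the next multiple of 16."""
    pad = (16 - x.size(0) % 16) % 16
    if pad == 0:
        return x
    # F.pad takes pairs from the last dim backwards; only dim 0 is padded here
    return F.pad(x, [0, 0] * (x.dim() - 1) + [0, pad])

# After the kernel runs on the padded tensors, the extra rows are sliced off:
#     output = output[:num_tokens, :, :]
```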
@@ -320,6 +349,10 @@ def forward(
                     scale_value=self.scale,
                     out=output)
             elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+                if is_310p():
+                    # seq_lens needs to be transferred to the device for 310P
+                    attn_metadata.seq_lens = \
+                        attn_metadata.seq_lens.to(device=query.device)
                 torch_npu._npu_paged_attention(
                     query=query,
                     key_cache=self.key_cache,
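The paged-attention kernel reads `seq_lens` on the NPU, while the metadata builder keeps it on the host, so the decode path moves it over before the call on 310P. A tiny sketch of that pattern, with a hypothetical helper name (not part of the patch):

```python
import torch

def ensure_on(t: torch.Tensor, device: torch.device) -> torch.Tensor:
    """Hypothetical helper: move a host-built metadata tensor to the kernel's
    device; Tensor.to returns the same tensor if it is already there."""
    return t if t.device == device else t.to(device=device)
```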
@@ -353,6 +386,14 @@ def forward(
                                             self.scale, None, True)
                 else:
                     # use paged attention
+                    assert attn_metadata is not None
+                    assert attn_metadata.attn_mask is not None
+                    if is_310p():
+                        # cast the (possibly broadcast) mask to NZ format
+                        attn_metadata.attn_mask = \
+                            torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), ACL_FORMAT_FRACTAL_NZ)
+                        attn_metadata.seq_lens = \
+                            attn_metadata.seq_lens.to(device=query.device)
                     torch_npu._npu_paged_attention_splitfuse(
                         query=query,
                         key_cache=self.key_cache,
@@ -365,6 +406,10 @@ def forward(
                         num_heads=self.num_heads,
                         scale_value=self.scale,
                         out=output)
+
+        # copy back so the caller-visible output tensor is updated in place
+        if not id(ori_output) == id(output):
+            ori_output[:, :, :] = output[:num_tokens, :, :]
         return output.view(num_tokens, self.hidden_size)

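The 310P prefill path rebinds `output` to a new, padded tensor via `aligned_16`, while the caller still holds the tensor allocated at the top of `forward`; the `id()` check above skips the copy when no rebinding happened and otherwise writes the results back into the original buffer. A small standalone illustration of why the copy-back is needed (hypothetical shapes, `F.pad` standing in for the padding helper):

```python
import torch
import torch.nn.functional as F

num_tokens = 5
buf = torch.zeros(num_tokens, 2, 4)   # tensor the caller keeps a reference to
out = buf                             # 'out' starts as an alias of 'buf'

# Rebinding 'out' to a padded copy allocates new storage; 'buf' is untouched.
out = F.pad(out, [0, 0, 0, 0, 0, 3])  # pad dim 0: 5 -> 8 rows
out += 1                              # kernel writes land in the new tensor

# So the result has to be copied back into the caller-visible buffer.
if id(out) != id(buf):
    buf[:, :, :] = out[:num_tokens, :, :]
assert torch.all(buf == 1)
```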