
Commit f95fc5a

Authored and committed by 闫鹏全
Commit message: support aclgraph for vllm v1 engine
1 parent: c7f6584

File tree: 4 files changed (+165, -58 lines)


vllm_ascend/attention/attention_v1.py

Lines changed: 95 additions & 51 deletions
@@ -23,9 +23,11 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
-
+from vllm.utils import direct_register_custom_op
+from vllm.forward_context import ForwardContext, get_forward_context
 
 class AscendAttentionBackend(AttentionBackend):
+    accept_output_buffer: bool = True
 
     @staticmethod
     def get_name() -> str:
@@ -150,6 +152,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: Optional[torch.Tensor] = None,
+        trace_flag: bool = True,
     ) -> torch.Tensor:
         """Forward pass with Ascend attention.
         Args:
@@ -167,59 +170,100 @@ def forward(
             shape = [batch_size * seq_len, num_heads, head_size]
         """
         num_tokens = query.shape[0]
-        output = torch.empty(num_tokens,
+        if output is None:
+            output = torch.empty(num_tokens,
                              self.num_heads,
                              self.head_size,
                              dtype=query.dtype,
                              device=query.device)
-
-        if attn_metadata is None:
-            # Profiling run.
-            return output.view(num_tokens, self.hidden_size)
-        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
-        attn_type = self.attn_type
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "PallasAttentionBackendImpl")
-        # View q k v to BSH.
-        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-        # TODO: Remove this contiguous in the future.
-        value = value.contiguous()
-
-        if hasattr(layer, 'quant_method'):
-            # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata
-            pass
-        else:
-            if kv_cache.numel() > 0:
-                key_cache, value_cache = kv_cache[0], kv_cache[1]
-                num_blocks, block_size, _ = key_cache.shape
-                key_cache = key_cache.view(num_blocks, block_size,
-                                           self.num_kv_heads, self.head_size)
-                value_cache = value_cache.view(num_blocks, block_size,
-                                               self.num_kv_heads,
-                                               self.head_size)
-                slots = attn_metadata.slot_mapping
-                torch_npu._npu_reshape_and_cache(key=key,
-                                                 value=value,
-                                                 key_cache=key_cache,
-                                                 value_cache=value_cache,
-                                                 slot_indices=slots)
-
-        # use paged attention
-        torch_npu._npu_paged_attention_splitfuse(
+        if trace_flag:
+            torch.ops.vllm.unified_ascend_attention_with_output(
                 query=query,
-                key_cache=key_cache,
-                value_cache=value_cache,
-                mask=attn_metadata.attn_mask,
-                block_table=attn_metadata.block_tables,
-                seq_len=attn_metadata.seq_lens,
-                context_lens=attn_metadata.context_lens,
-                num_kv_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                scale_value=self.scale,
-                out=output)
+                key=key,
+                value=value,
+                output=output,
+                layer_name=layer.layer_name
+            )
+        else:
+            num_tokens = query.shape[0]
+            if attn_metadata is None:
+                return output.view(num_tokens, self.hidden_size)
+            assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
+            attn_type = self.attn_type
+            if attn_type != AttentionType.DECODER:
+                raise NotImplementedError("Encoder self-attention and "
+                                          "encoder/decoder cross-attention "
+                                          "are not implemented for "
+                                          "PallasAttentionBackendImpl")
+            # View q k v to BSH.
+            query = query.view(-1, self.num_heads, self.head_size)
+            key = key.view(-1, self.num_kv_heads, self.head_size)
+            value = value.view(-1, self.num_kv_heads, self.head_size)
+            # TODO: Remove this contiguous in the future.
+            value = value.contiguous()
+
+            if hasattr(layer, 'quant_method'):
+                # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata
+                pass
+            else:
+                if kv_cache.numel() > 0:
+                    key_cache, value_cache = kv_cache[0], kv_cache[1]
+                    num_blocks, block_size, _ = key_cache.shape
+                    key_cache = key_cache.view(num_blocks, block_size,
+                                               self.num_kv_heads, self.head_size)
+                    value_cache = value_cache.view(num_blocks, block_size,
+                                                   self.num_kv_heads,
+                                                   self.head_size)
+                    slots = attn_metadata.slot_mapping
+                    torch_npu._npu_reshape_and_cache(key=key,
+                                                     value=value,
+                                                     key_cache=key_cache,
+                                                     value_cache=value_cache,
+                                                     slot_indices=slots)
+                # use paged attention
+                torch_npu._npu_paged_attention_splitfuse(
+                    query=query,
+                    key_cache=key_cache,
+                    value_cache=value_cache,
+                    mask=attn_metadata.attn_mask,
+                    block_table=attn_metadata.block_tables,
+                    seq_len=attn_metadata.seq_lens,
+                    context_lens=attn_metadata.context_lens,
+                    num_kv_heads=self.num_kv_heads,
+                    num_heads=self.num_heads,
+                    scale_value=self.scale,
+                    out=output)
         return output.view(num_tokens, self.hidden_size)
+
+
+def unified_ascend_attention_with_output(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    self = forward_context.no_compile_layers[layer_name]
+    kv_cache = self.kv_cache[forward_context.virtual_engine]
+    self.impl.forward(self, query, key, value, kv_cache, attn_metadata, output, trace_flag=False)
+    return
+
+def unified_attention_with_output_fake(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="unified_ascend_attention_with_output",
+    op_func=unified_ascend_attention_with_output,
+    mutates_args=["output"],
+    fake_impl=unified_attention_with_output_fake,
+    dispatch_key="PrivateUse1",
+)
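
Note on the change above: with trace_flag=True (the default), AscendAttentionImpl.forward no longer runs the NPU kernels directly but dispatches through the registered custom op torch.ops.vllm.unified_ascend_attention_with_output, which looks up the layer in the forward context and re-enters forward with trace_flag=False to run the real torch_npu kernels. Registering the op with a no-op fake implementation is what lets torch.compile trace past the attention call, and the op name can then be listed in splitting_ops so piecewise compilation cuts the graph at attention. The following stand-alone sketch is not part of the commit: it illustrates the same "mutating op + fake impl" pattern with the public torch.library API (assuming PyTorch >= 2.4) and toy attention math instead of the Ascend kernels.

import torch

@torch.library.custom_op("demo::attention_with_output", mutates_args=("output",))
def attention_with_output(query: torch.Tensor, key: torch.Tensor,
                          value: torch.Tensor, output: torch.Tensor) -> None:
    # Real implementation: write the result into the caller-provided buffer.
    scale = query.shape[-1]**-0.5
    attn = torch.softmax(query @ key.transpose(-2, -1) * scale, dim=-1) @ value
    output.copy_(attn)

@attention_with_output.register_fake
def _(query, key, value, output) -> None:
    # Fake (meta) implementation: no computation, only the contract that the
    # op mutates `output` and returns nothing, which is enough for tracing.
    return

q = k = v = torch.randn(2, 4, 8)
out = torch.empty_like(q)
torch.ops.demo.attention_with_output(q, k, v, out)  # call through the op registry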

vllm_ascend/platform.py

Lines changed: 13 additions & 2 deletions
@@ -36,6 +36,8 @@
         "Warning: Failed to register custom ops, all custom ops will be disabled"
     )
 
+from vllm.config import CompilationLevel, VllmConfig, ModelConfig
+from vllm.logger import init_logger
 from vllm.platforms import Platform, PlatformEnum
 
 if TYPE_CHECKING:
@@ -104,9 +106,18 @@ def mem_get_info(cls) -> Tuple[int, int]:
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.config import CompilationLevel  # noqa: E402
         compilation_config = vllm_config.compilation_config
-        if compilation_config and compilation_config.level != CompilationLevel.NO_COMPILATION:
+        import os
+        aclgraph_enabled = os.getenv('ENABLE_ACLGRAPH')
+
+        if aclgraph_enabled == '1' and compilation_config and compilation_config.level == CompilationLevel.PIECEWISE:
+            logger.warning(
+                "Compilation level %s is supported on NPU now, But use_inductor is no support",
+                compilation_config.level)
+            compilation_config.use_inductor = False
+            compilation_config.splitting_ops = ["vllm.unified_ascend_attention_with_output"]
+        elif compilation_config and compilation_config.level != CompilationLevel.NO_COMPILATION:
             logger.warning(
-                "Compilation level %s is not supported on NPU now, forcing compilation level to NO_COMPILATION",
+                "ENABLE_ACLGRAPH is not set, Compilation level %s is not supported on NPU now, forcing compilation level to NO_COMPILATION",
                 compilation_config.level)
             compilation_config.level = CompilationLevel.NO_COMPILATION
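
With this hunk, check_and_update_config keeps a PIECEWISE compilation level only when the ENABLE_ACLGRAPH environment variable is set to '1'; it then disables Inductor and registers "vllm.unified_ascend_attention_with_output" as a graph splitting op. In every other case the compilation level is still forced back to NO_COMPILATION. The usage sketch below is not from the commit: the model name and the compilation_config=3 engine argument (3 corresponding to CompilationLevel.PIECEWISE) are illustrative assumptions about how the path would be exercised offline.

import os

# Opt in to the ACL graph path before the engine builds its config.
os.environ["ENABLE_ACLGRAPH"] = "1"

from vllm import LLM, SamplingParams

# Requesting piecewise compilation; on NPU the platform hook above then sets
# use_inductor=False and splits the graph at the unified attention op.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", compilation_config=3)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)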

vllm_ascend/worker/model_runner_v1.py

Lines changed: 47 additions & 4 deletions
@@ -19,6 +19,7 @@
 
 import gc
 import os
+import time
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 import numpy as np
@@ -27,8 +28,8 @@
 import torch.nn as nn
 from vllm.attention import AttentionType
 from vllm.attention.layer import Attention
-from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_pp_group
+from vllm.config import VllmConfig, CompilationLevel
+from vllm.distributed.parallel_state import get_pp_group, graph_capture
 from vllm.forward_context import set_forward_context
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import logger
@@ -42,6 +43,13 @@
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput
+from vllm.triton_utils import HAS_TRITON
+if HAS_TRITON:
+    from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
+else:
+    INVALID_TOKEN_ID = None
+    RejectionSampler = None
+from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
@@ -171,6 +179,12 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.input_positions_cpu = torch.arange(0,
                                                 self.max_num_tokens,
                                                 device="cpu")
+        self.use_cuda_graph = (self.vllm_config.compilation_config.level
+                               == CompilationLevel.PIECEWISE
+                               and not self.model_config.enforce_eager)
+        self.cudagraph_batch_sizes = list(
+            reversed(
+                self.vllm_config.compilation_config.cudagraph_capture_sizes))
 
         # NOTE: Pre-construct a mask matrix to improve the efficiency of
         # attention mask construction during inference.
@@ -627,7 +641,7 @@ def _dummy_run(self) -> torch.Tensor:
         if self.uses_mrope:
             positions = self.mrope_positions[:, :self.max_num_tokens]
         else:
-            positions = self.input_positions_cpu[:self.max_num_tokens]
+            positions = self.positions[:self.max_num_tokens]
 
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
@@ -645,7 +659,7 @@ def _dummy_run(self) -> torch.Tensor:
 
         with set_forward_context(None, self.vllm_config):
             hidden_states = model(input_ids=input_ids,
-                                  positions=positions.to(self.device),
+                                  positions=positions,
                                   intermediate_tensors=intermediate_tensors,
                                   inputs_embeds=inputs_embeds)
         return hidden_states
@@ -779,3 +793,32 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                     f"Unknown attention type: {attn_module.attn_type}")
 
         return kv_cache_spec
+
+
+    def capture_model(self) -> None:
+        if not self.use_cuda_graph:
+            logger.warning(
+                "Skipping NPU graph capture. Please add "
+                "-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE)
+            return
+
+        start_time = time.perf_counter()
+        start_free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        with graph_capture(device=self.device):
+            for num_tokens in reversed(self.cudagraph_batch_sizes):
+                for _ in range(self.vllm_config.compilation_config.
+                               cudagraph_num_of_warmups):
+                    self._dummy_run(num_tokens)
+                self._dummy_run(num_tokens)
+
+        end_time = time.perf_counter()
+        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
+        elapsed_time = end_time - start_time
+        cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
+        # This usually takes 5~20 seconds.
+        logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
+                    elapsed_time, cuda_graph_size / (1 << 30))
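
The new capture_model mirrors vLLM's GPU model runner: inside a graph_capture context it walks the capture sizes from largest to smallest, runs cudagraph_num_of_warmups warmup iterations per size before the captured run (so smaller graphs can reuse the memory pool of larger ones), and logs how long capture took and how much device memory it consumed. A toy sketch of that loop structure with stub functions, touching no real device and none of the commit's APIs:

from contextlib import contextmanager

@contextmanager
def graph_capture_stub(device: str):
    # Stand-in for vllm.distributed.parallel_state.graph_capture.
    print(f"enter graph capture context on {device}")
    yield
    print("exit graph capture context")

def dummy_run_stub(num_tokens: int) -> None:
    # Stand-in for the model runner's _dummy_run.
    print(f"  dummy run with {num_tokens} tokens")

capture_sizes = [1, 2, 4, 8]  # illustrative batch sizes
num_warmups = 1               # plays the role of cudagraph_num_of_warmups

with graph_capture_stub(device="npu:0"):
    for num_tokens in sorted(capture_sizes, reverse=True):  # largest first
        for _ in range(num_warmups):
            dummy_run_stub(num_tokens)  # warmup run, not captured
        dummy_run_stub(num_tokens)      # the run that would be captured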

vllm_ascend/worker/worker_v1.py

Lines changed: 10 additions & 1 deletion
@@ -158,8 +158,17 @@ def load_model(self) -> None:
         self.model_runner.load_model()
 
     def compile_or_warm_up_model(self) -> None:
+        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
         if not self.model_config.enforce_eager:
-            logger.warning("Graph capture is not supported on NPU.")
+            warmup_sizes = [
+                x for x in warmup_sizes if x not in
+                self.vllm_config.compilation_config.cudagraph_capture_sizes
+            ]
+        for size in sorted(warmup_sizes, reverse=True):
+            logger.info("Compile and warming up model for size %d", size)
+            self.model_runner._dummy_run(size)
+        if not self.model_config.enforce_eager:
+            self.model_runner.capture_model()
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
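
compile_or_warm_up_model now warms the model up for every compile_sizes entry that graph capture will not cover, largest first, and then (unless enforce_eager is set) hands over to model_runner.capture_model(). A small illustration of how the warmup list is derived; the size values are made up and only the list logic matches the code above.

# Example values only; the real lists come from vllm_config.compilation_config.
compile_sizes = [512, 256, 8, 4]
cudagraph_capture_sizes = [8, 4, 2, 1]

warmup_sizes = [x for x in compile_sizes if x not in cudagraph_capture_sizes]
for size in sorted(warmup_sizes, reverse=True):
    print(f"Compile and warming up model for size {size}")  # 512, then 256
# Sizes in cudagraph_capture_sizes are exercised later by capture_model().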
