
Commit 2d61054: cleanup
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
LucasWilkinson and alexm-redhat committed Jan 31, 2025
1 parent f2b2500 commit 2d61054
Showing 5 changed files with 14 additions and 26 deletions.
vllm/attention/backends/abstract.py: 3 changes (1 addition & 2 deletions)
@@ -168,8 +168,7 @@ def __init__(self, runner: "ModelRunnerBase"):

     @abstractmethod
     @contextmanager
-    def graph_capture(self, max_batch_size: int,
-                      positions: Optional[torch.Tensor]):
+    def graph_capture(self, max_batch_size: int):
         """Context manager used when capturing CUDA graphs."""
         yield

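For reference, `graph_capture` is an abstract context manager that each attention-state class implements; after this change it takes only `max_batch_size`. A minimal sketch of the pattern (the class name and try/finally cleanup here are illustrative, not vLLM's exact code):

```python
from contextlib import contextmanager


class ExampleAttentionState:
    """Illustrative stand-in for an AttentionState subclass."""

    def __init__(self):
        self._is_graph_capturing = False

    @contextmanager
    def graph_capture(self, max_batch_size: int):
        # Allocate persistent buffers sized for the largest batch here,
        # then mark the state as capturing for the duration of the block.
        self._is_graph_capturing = True
        try:
            yield
        finally:
            self._is_graph_capturing = False
```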
vllm/attention/backends/flashinfer.py: 3 changes (1 addition & 2 deletions)
@@ -213,8 +213,7 @@ def _get_decode_wrapper(self):
         return self._decode_wrapper

     @contextmanager
-    def graph_capture(self, max_batch_size: int,
-                      positions: Optional[torch.Tensor]):
+    def graph_capture(self, max_batch_size: int):
         self._is_graph_capturing = True
         self._graph_decode_wrapper = None
         self._graph_slot_mapping = torch.full((max_batch_size, ),
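The FlashInfer state pre-sizes its capture buffers for the largest batch, so smaller batches reuse leading slices of the same memory. A rough sketch of that preallocation pattern (the `PAD_SLOT_ID` value and buffer names are assumptions for illustration):

```python
import torch

PAD_SLOT_ID = -1  # illustrative padding value, assumed here
max_batch_size = 8

# Buffers allocated once at the maximum graph size; each captured graph
# reads only the leading slice it needs.
graph_slot_mapping = torch.full((max_batch_size, ),
                                PAD_SLOT_ID,
                                dtype=torch.long)
graph_seq_lens = torch.ones(max_batch_size, dtype=torch.int32)
```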
vllm/attention/backends/triton_mla.py: 16 changes (7 additions & 9 deletions)
@@ -123,10 +123,7 @@ def graph_clone(self, batch_size: int):
         return self.__class__(self.runner)

     def graph_capture_get_metadata_for_batch(
-            self,
-            batch_size: int,
-            is_encoder_decoder_model: bool = False,
-            positions: Optional[torch.Tensor] = None):
+            self, batch_size: int, is_encoder_decoder_model: bool = False):
         assert self._is_graph_capturing

         attn_metadata = self.runner.attn_backend.make_metadata(
@@ -175,15 +172,16 @@ def prepare_graph_input_buffers(self,
                                     input_buffers,
                                     attn_metadata,
                                     is_encoder_decoder_model: bool = False):
+        input_positions = attn_metadata.input_positions
+        num_positions = input_positions.shape[0]
         input_buffers["seq_lens_tensor"].copy_(
             attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
         input_buffers["block_tables"].copy_(
             attn_metadata.decode_metadata.block_tables, non_blocking=True)
-        input_buffers["input_positions"][:attn_metadata.decode_metadata.
-                                         input_positions.shape[0]].copy_(
-                                             attn_metadata.decode_metadata.
-                                             input_positions,
-                                             non_blocking=True)
+        # CUDA graph buffer is padded so only perform a partial copy based on
+        # num_positions
+        input_buffers["input_positions"][:num_positions].copy_(
+            input_positions, non_blocking=True)
         if is_encoder_decoder_model:
             raise NotImplementedError(
                 "TritonMLAState does not support encoder/decoder yet")
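The new comment in this hunk captures the point of the change: CUDA graphs replay fixed memory addresses, so the `input_positions` buffer is padded to the maximum batch size and only its leading slice is overwritten on each run. A standalone sketch of the idea (names and shapes are illustrative):

```python
import torch

max_batch_size = 8
# Persistent buffer referenced by the captured CUDA graph, padded to the
# maximum batch size.
input_positions_buffer = torch.zeros(max_batch_size, dtype=torch.long)

# At replay time the live positions tensor may be shorter than the buffer.
positions = torch.tensor([5, 6, 7], dtype=torch.long)
num_positions = positions.shape[0]

# Partial copy: only the leading slice is updated; the padding past
# num_positions is left untouched.
input_positions_buffer[:num_positions].copy_(positions, non_blocking=True)
```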
vllm/attention/backends/utils.py: 12 changes (3 additions & 9 deletions)
@@ -2,8 +2,7 @@
 from collections import defaultdict
 from contextlib import contextmanager
 from itertools import accumulate
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
-                    TypeVar, Union)
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union

 import numpy as np
 import torch
@@ -289,9 +288,7 @@ def __init__(self, runner: "ModelRunnerBase"):
         self._is_graph_capturing = False

     @contextmanager
-    def graph_capture(self, max_batch_size: int,
-                      positions: Optional[torch.Tensor]):
-        assert positions is None
+    def graph_capture(self, max_batch_size: int):

         self._is_graph_capturing = True

@@ -317,10 +314,7 @@ def graph_clone(self, batch_size: int) -> "CommonAttentionState":
         return self.__class__(self.runner)

     def graph_capture_get_metadata_for_batch(
-            self,
-            batch_size: int,
-            is_encoder_decoder_model: bool = False,
-            positions: Optional[torch.Tensor] = None):
+            self, batch_size: int, is_encoder_decoder_model: bool = False):
         assert self._is_graph_capturing
         attn_metadata = self.runner.attn_backend.make_metadata(
             num_prefills=0,
vllm/worker/model_runner.py: 6 changes (2 additions & 4 deletions)
@@ -1483,13 +1483,11 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                 self.vllm_config.compilation_config.
                 cudagraph_capture_sizes)
             for batch_size in cudagraph_capture_sizes:
-                cur_input_positions = input_positions[..., :batch_size]
                 attn_metadata = (
                     self.attn_state.graph_capture_get_metadata_for_batch(
                         batch_size,
                         is_encoder_decoder_model=self.model_config.
-                        is_encoder_decoder,
-                        positions=cur_input_positions))
+                        is_encoder_decoder))
                 # Disable KV Scale Calculation for graph capture
                 attn_metadata.enable_kv_scales_calculation = False
                 if self.lora_config:
@@ -1515,7 +1513,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                     "input_ids":
                     input_tokens[:batch_size],
                     "positions":
-                    cur_input_positions,
+                    input_positions[..., :batch_size],
                     "intermediate_inputs":
                     intermediate_inputs[:batch_size]
                     if intermediate_inputs is not None else None,
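With `positions` no longer threaded through `graph_capture_get_metadata_for_batch`, the runner slices the padded positions tensor inline at each capture size. A toy illustration of that slicing (the tensor shape is an assumption, not vLLM's real layout):

```python
import torch

max_batch_size = 8
# Positions tensor allocated once at the maximum capture size.
input_positions = torch.zeros(1, max_batch_size, dtype=torch.long)

for batch_size in (1, 2, 4, 8):
    # Each graph is captured over a view of the leading batch_size
    # columns, so no separate per-size tensor is materialized.
    positions_view = input_positions[..., :batch_size]
    assert positions_view.shape[-1] == batch_size
```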
