Commit 017597e

Pack mamba2 metadata

Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
1 parent 01d5a33 · commit 017597e

File tree

vllm/model_executor/layers/mamba/mamba2_metadata.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
vllm/model_executor/models/bamba.py

4 files changed: +68 -60 lines changed

vllm/model_executor/layers/mamba/mamba2_metadata.py

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+import math
+from dataclasses import dataclass
+
+import torch
+
+
+@dataclass
+class Mamba2Metadata:
+    chunk_size: int
+    chunk_indices: torch.Tensor
+    chunk_offsets: torch.Tensor
+
+
+def prepare_mamba2_metadata(seq_idx: torch.Tensor,
+                            chunk_size: int) -> Mamba2Metadata:
+    # convert seq_idx to chunk indices and offsets
+    # - derive the cu_seqlens
+    _, cu_seqlens = torch.where(seq_idx.diff())
+    cu_seqlens += 1
+
+    # outputs will have length expansion of chunks that do not divide
+    # chunk_size
+    N = math.ceil(seq_idx.shape[-1] / chunk_size) + (cu_seqlens % chunk_size
+                                                     > 0).sum()
+    chunk_indices = torch.arange(N, dtype=torch.int, device=seq_idx.device)
+    chunk_offsets = torch.zeros((N, ), dtype=torch.int, device=seq_idx.device)
+
+    cu_seqlens = cu_seqlens.tolist() + [seq_idx.shape[-1]]
+    p = 0  # num of insertions
+    for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]):
+
+        # if does not divide chunk_size, then there is one chunk insertion
+        p += (s % chunk_size > 0)
+
+        # get the dimensions
+        # - the + 1 for _e is to shift the boundary by one chunk
+        # - this shifting is not needed if chunk_size divides e
+        _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size
+                                                             > 0)
+
+        # adjust indices and offsets
+        chunk_indices[_s:_e] -= p
+        chunk_offsets[_s] = s % chunk_size
+
+    return Mamba2Metadata(chunk_size=chunk_size,
+                          chunk_indices=chunk_indices,
+                          chunk_offsets=chunk_offsets)
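For a concrete sense of what the new helper computes, here is a small worked example. The (1, seqlen) layout of seq_idx mirrors how bamba.py builds it below; the sequence lengths and chunk_size are made up for illustration, and the expected outputs were worked out by hand from the code above.

import torch

from vllm.model_executor.layers.mamba.mamba2_metadata import (
    prepare_mamba2_metadata)

# two prefill requests packed back to back: 5 tokens of sequence 0
# followed by 11 tokens of sequence 1, scanned with chunk_size=8
seq_idx = torch.tensor([[0] * 5 + [1] * 11], dtype=torch.int32)
meta = prepare_mamba2_metadata(seq_idx, chunk_size=8)

# sequence 0 ends in the middle of physical chunk 0, so one extra logical
# chunk is inserted; that chunk re-reads chunk 0 starting at offset 5:
#   meta.chunk_indices -> tensor([0, 0, 1], dtype=torch.int32)
#   meta.chunk_offsets -> tensor([0, 5, 0], dtype=torch.int32)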

vllm/model_executor/layers/mamba/mamba_mixer2.py

Lines changed: 11 additions & 7 deletions
@@ -18,6 +18,7 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
+from vllm.model_executor.layers.mamba.mamba2_metadata import Mamba2Metadata
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn, causal_conv1d_update)
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
@@ -389,18 +390,21 @@ def forward_cuda(
         hidden_states: torch.Tensor,
         mamba_cache_params: MambaCacheParams,
         sequence_idx: Optional[torch.Tensor] = None,
-        chunk_indices: Optional[torch.Tensor] = None,
-        chunk_offsets: Optional[torch.Tensor] = None,
+        mamba2_metadata: Optional[Mamba2Metadata] = None,
     ):
         # For the mamba2 triton kernels to operate in continuous batching,
         # the sequence_idx is needed to be passed in. Also, for the kernels
-        # to operate in chunked prefill, the chunk_indices and chunk_offsets
-        # can be optionally passed in; it is more efficient to pre-compute
-        # once since they are common to all layers. If they are not provided
-        # then they will be derived from sequence_idx inside the kernels
-
+        # to operate in chunked prefill, the mamba2_metadata containing
+        # chunk_indices and chunk_offsets must be passed in; it is
+        # more efficient to pre-compute once since they are common to all
+        # layers.
         attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
 
+        chunk_indices, chunk_offsets = None, None
+        if mamba2_metadata is not None:
+            chunk_indices = mamba2_metadata.chunk_indices
+            chunk_offsets = mamba2_metadata.chunk_offsets
+
         seq_len, _ = hidden_states.shape
         groups_time_state_size = self.n_groups * self.ssm_state_size
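Since the mixer now receives the chunk bookkeeping as one object rather than two loose tensors, a hand-built instance shows what gets packed together. The values below are purely illustrative, for a request whose length happens to be an exact multiple of the chunk size, so every chunk is visited once from offset 0.

import torch

from vllm.model_executor.layers.mamba.mamba2_metadata import Mamba2Metadata

# four chunks, each scanned exactly once from the start of the chunk
meta = Mamba2Metadata(chunk_size=256,
                      chunk_indices=torch.arange(4, dtype=torch.int),
                      chunk_offsets=torch.zeros(4, dtype=torch.int))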

vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py

Lines changed: 1 addition & 43 deletions
@@ -5,8 +5,6 @@
 
 # ruff: noqa: E501,SIM102
 
-import math
-
 import torch
 import triton
 import triton.language as tl
@@ -442,40 +440,6 @@ def _chunk_scan_fwd_kernel(
              (offs_out_n[None, :] < hdim))
 
 
-def seq_idx_to_chunk_indices_offsets(seq_idx, chunk_size: int):
-
-    # convert seq_idx to chunk indices and offsets
-    # - derive the cu_seqlens
-    _, cu_seqlens = torch.where(seq_idx.diff())
-    cu_seqlens += 1
-
-    # outputs will have length expansion of chunks that do not divide
-    # chunk_size
-    N = math.ceil(seq_idx.shape[-1] / chunk_size) + (cu_seqlens % chunk_size
-                                                     > 0).sum()
-    chunk_indices = torch.arange(N, dtype=torch.int, device=seq_idx.device)
-    chunk_offsets = torch.zeros((N, ), dtype=torch.int, device=seq_idx.device)
-
-    cu_seqlens = cu_seqlens.tolist() + [seq_idx.shape[-1]]
-    p = 0  # num of insertions
-    for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]):
-
-        # if does not divide chunk_size, then there is one chunk insertion
-        p += (s % chunk_size > 0)
-
-        # get the dimensions
-        # - the + 1 for _e is to shift the boundary by one chunk
-        # - this shifting is not needed if chunk_size divides e
-        _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size
-                                                             > 0)
-
-        # adjust inidces and offsets
-        chunk_indices[_s:_e] -= p
-        chunk_offsets[_s] = s % chunk_size
-
-    return chunk_indices, chunk_offsets
-
-
 def _chunk_scan_fwd(
     cb,
     x,
@@ -515,16 +479,10 @@ def _chunk_scan_fwd(
             if initial_states.shape[0] == 1:
                 # no in this case no point to use initial states
                 initial_states = None
-            elif chunk_indices is None and chunk_offsets is None:
-                # if chunk_indices and chunk_offsets both unset, then derive
-                # from seq_idx
-                chunk_indices, chunk_offsets = seq_idx_to_chunk_indices_offsets(
-                    seq_idx, chunk_size)
             else:
                 assert chunk_indices is not None and chunk_offsets is not None, \
                 (
-                    "chunk_indices and chunk_offsets should either "
-                    "be left unset, or else both should be set."
+                    "chunk_indices and chunk_offsets should have been set"
                 )
         else:
            chunk_indices, chunk_offsets = None, None
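With the helper deleted from this module, anything that still imported seq_idx_to_chunk_indices_offsets from ssd_chunk_scan has to switch to the module added by this commit, as bamba.py does below. A minimal sketch of the import change (note the replacement returns a packed Mamba2Metadata rather than a tuple):

# old import, removed by this commit:
# from vllm.model_executor.layers.mamba.ops.ssd_chunk_scan import (
#     seq_idx_to_chunk_indices_offsets)

# new location of the chunk-index/offset computation:
from vllm.model_executor.layers.mamba.mamba2_metadata import (
    Mamba2Metadata, prepare_mamba2_metadata)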

vllm/model_executor/models/bamba.py

Lines changed: 8 additions & 10 deletions
@@ -18,10 +18,10 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba2_metadata import (
+    Mamba2Metadata, prepare_mamba2_metadata)
 from vllm.model_executor.layers.mamba.mamba_mixer2 import (
     MambaMixer2, extra_groups_for_head_shards)
-from vllm.model_executor.layers.mamba.ops.ssd_chunk_scan import (
-    seq_idx_to_chunk_indices_offsets)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -111,8 +111,7 @@ def forward(
         residual: Optional[torch.Tensor],
         mamba_cache_params: MambaCacheParams,
         sequence_idx: Optional[torch.Tensor] = None,
-        chunk_indices: Optional[torch.Tensor] = None,
-        chunk_offsets: Optional[torch.Tensor] = None,
+        mamba2_metadata: Optional[Mamba2Metadata] = None,
         **kwargs,
     ):
         if residual is None:
@@ -123,7 +122,7 @@
                 hidden_states, residual)
 
         hidden_states = self.mamba(hidden_states, mamba_cache_params,
-                                   sequence_idx, chunk_indices, chunk_offsets)
+                                   sequence_idx, mamba2_metadata)
         # Fully Connected
         hidden_states, residual = self.pre_ff_layernorm(
             hidden_states, residual)
@@ -317,7 +316,7 @@ def forward(
         # proper continuous batching computation including
         # chunked prefill
         seq_idx = None
-        chunk_indices, chunk_offsets = None, None
+        mamba2_metadata = None
         attn_metadata = get_forward_context().attn_metadata
         if attn_metadata.num_prefills > 0:
             seq_idx = torch.zeros_like(input_ids, dtype=torch.int32)
@@ -338,7 +337,7 @@
             # metadata, we simply just compute redundantly and
             # will be silently ignored inside the mamba kernels.
             # if not needed.
-            chunk_indices, chunk_offsets = seq_idx_to_chunk_indices_offsets(
+            mamba2_metadata = prepare_mamba2_metadata(
                 seq_idx, self.config.mamba_chunk_size)
 
         if get_pp_group().is_first_rank:
@@ -370,8 +369,7 @@
                 residual=residual,
                 mamba_cache_params=layer_mamba_cache_params,
                 sequence_idx=seq_idx,
-                chunk_indices=chunk_indices,
-                chunk_offsets=chunk_offsets,
+                mamba2_metadata=mamba2_metadata,
             )
 
         if not get_pp_group().is_last_rank:
@@ -574,4 +572,4 @@ def sample(
     def load_weights(self, weights: Iterable[Tuple[str,
                                                    torch.Tensor]]) -> Set[str]:
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(weights)
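Taken together, the model's forward pass now builds the metadata once per step and hands the same object to every Mamba layer. A condensed sketch of that flow follows; the seq_idx construction and the layer loop are abbreviated, and only the argument names visible in the hunks above are taken from the diff:

mamba2_metadata = None
attn_metadata = get_forward_context().attn_metadata
if attn_metadata.num_prefills > 0:
    # seq_idx tags every prefill token with its request index (construction
    # elided); the chunk bookkeeping is derived from it exactly once here
    mamba2_metadata = prepare_mamba2_metadata(
        seq_idx, self.config.mamba_chunk_size)

for layer in self.layers:  # abbreviated
    hidden_states, residual = layer(
        hidden_states=hidden_states,
        residual=residual,
        mamba_cache_params=layer_mamba_cache_params,
        sequence_idx=seq_idx,
        mamba2_metadata=mamba2_metadata,  # shared by all layers
    )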
