 from vllm.model_executor.layers.mamba.mamba_mixer2 import (
     MambaMixer2, extra_groups_for_head_shards)
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.mamba.ops.ssd_chunk_scan import (
+    seq_idx_to_chunk_indices_offsets)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -256,41 +258,6 @@ def forward(
256258 "mamba" : BambaMixerDecoderLayer
257259}
258260
259-
260- def _seq_idx_to_chunk_indices_offsets (seq_idx , chunk_size : int ):
261-
262- # convert seq_idx to chunk indices and offsets
263- # - derive the cu_seqlens
264- _ , cu_seqlens = torch .where (seq_idx .diff ())
265- cu_seqlens += 1
266-
267- # outputs will have length expansion of chunks that do not divide
268- # chunk_size
269- N = math .ceil (seq_idx .shape [- 1 ] / chunk_size ) + (cu_seqlens % chunk_size
270- > 0 ).sum ()
271- chunk_indices = torch .arange (N , dtype = torch .int , device = seq_idx .device )
272- chunk_offsets = torch .zeros ((N , ), dtype = torch .int , device = seq_idx .device )
273-
274- cu_seqlens = cu_seqlens .tolist () + [seq_idx .shape [- 1 ]]
275- p = 0 # num of insertions
276- for s , e in zip (cu_seqlens [:- 1 ], cu_seqlens [1 :]):
277-
278- # if does not divide chunk_size, then there is one chunk insertion
279- p += (s % chunk_size > 0 )
280-
281- # get the dimensions
282- # - the + 1 for _e is to shift the boundary by one chunk
283- # - this shifting is not needed if chunk_size divides e
284- _s , _e = s // chunk_size + p , e // chunk_size + p + (e % chunk_size
285- > 0 )
286-
-        # adjust indices and offsets
-        chunk_indices[_s:_e] -= p
-        chunk_offsets[_s] = s % chunk_size
-
-    return chunk_indices, chunk_offsets
-
-
 class BambaModel(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -361,8 +328,17 @@ def forward(
                 )):
             seq_idx[srt:end] = i
         seq_idx.unsqueeze_(0)
-        # Compute mamba2 metadata tensors that are reused across layers
-        chunk_indices, chunk_offsets = _seq_idx_to_chunk_indices_offsets(
+
+        # Compute metadata for chunked prefill. Strictly speaking this is
+        # only needed when there are initial states, but that can only be
+        # determined from attention metadata that is not available in this
+        # top-level forward. Rather than complicating things to extract
+        # that metadata, we compute it unconditionally; the mamba kernels
+        # silently ignore it when it is not needed.
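+        # chunk_indices and chunk_offsets let the chunked scan re-align
+        # chunk boundaries with sequence boundaries: a sequence whose start
+        # does not fall on a multiple of chunk_size gets one extra logical
+        # chunk plus a non-zero offset into it.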
+        chunk_indices, chunk_offsets = seq_idx_to_chunk_indices_offsets(
             seq_idx, self.config.mamba_chunk_size)
 
         if get_pp_group().is_first_rank:
@@ -378,7 +354,6 @@ def forward(
 
         residual = None
         num_attn = 0
-        extra_args = {}
         for i in range(len(self.layers)):
             layer = self.layers[i]
             if isinstance(layer, BambaAttentionDecoderLayer):
@@ -388,19 +363,15 @@ def forward(
             if isinstance(layer, BambaMixerDecoderLayer):
                 layer_mamba_cache_params = mamba_cache_params.at_layer_idx(
                     i - num_attn)
-                extra_args = {
-                    'chunk_indices': chunk_indices,
-                    'chunk_offsets': chunk_offsets,
-                }
 
-            # print(f"{len(extra_args)=}")
             hidden_states, residual = layer(
                 positions=positions,
                 hidden_states=hidden_states,
                 residual=residual,
                 mamba_cache_params=layer_mamba_cache_params,
                 sequence_idx=seq_idx,
-                **extra_args,
+                chunk_indices=chunk_indices,
+                chunk_offsets=chunk_offsets,
             )
 
         if not get_pp_group().is_last_rank:
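
For reference, a minimal sketch of what the imported helper produces on a toy input, assuming the relocated seq_idx_to_chunk_indices_offsets keeps the same signature and semantics as the local helper removed above:

import torch
from vllm.model_executor.layers.mamba.ops.ssd_chunk_scan import (
    seq_idx_to_chunk_indices_offsets)

# Two packed sequences of lengths 6 and 4 with chunk_size = 4. The second
# sequence starts mid-chunk (position 6), so one extra logical chunk is
# inserted for it.
seq_idx = torch.tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=torch.int32)
chunk_indices, chunk_offsets = seq_idx_to_chunk_indices_offsets(seq_idx, 4)
# Expected: chunk_indices == [0, 1, 1, 2] and chunk_offsets == [0, 0, 2, 0],
# i.e. the third logical chunk re-reads physical chunk 1 from offset 2.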