
Commit 6d336f6

mzusman, Mor Zusman, and tomeras91 committed
Cuda graph (vllm-project#5)
* Drop indecies when finish
* min 1 attention layer
* CG is working on forward pass passing
* Remove comments
* cosmetics - rename indecies -> indices, organize some whitespaces
* Add some TODOs
* Adding mamba cache for cg
* Remove useless vars from input_metadata
* Remove unused import
* Set the seqlen offset to boolean
* Return only hidden state
* Return only hidden states
* Add padding to match forward pass bs
* Is prompt instead of seqlen offset
* Remove mamba cache class (not used)
* Another remove
* Remove
* Use mamba4gc
* Fix mamba forward, run update only on non prompt
* Use 1 index after the maximal index
* Remove import
* Remove import
* typo
* typo
* place holder
* Padding and empty token takes it from the first empty place
* reformat
* Apply suggestions from code review: Whitespaces

---------

Co-authored-by: Mor Zusman <morz@ai21.com>
Co-authored-by: Tomer Asida <tomera@ai21.com>
Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
1 parent 07cc899 commit 6d336f6
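
For context on why the changes above revolve around preallocated Mamba state: CUDA graph capture records a fixed sequence of kernels operating on fixed memory addresses, so any state touched during decode (here the conv/ssm cache) must live in static buffers that are reused across steps rather than allocated per request. The snippet below is a generic, minimal sketch of that capture-and-replay pattern using the public torch.cuda graph API; it is illustrative only and not code from this PR (buffer names and sizes are made up).

import torch

# Generic torch.cuda graph capture/replay sketch (illustrative only, not from this PR).
# A captured graph replays the same kernels on the same memory, which is why this
# commit moves the Jamba conv/ssm cache into preallocated tensors.
if torch.cuda.is_available():
    static_x = torch.zeros(8, 4096, device="cuda")        # static input buffer
    weight = torch.randn(4096, 4096, device="cuda")

    # Warm up on a side stream before capture, as recommended in the PyTorch docs.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        static_y = static_x @ weight
    torch.cuda.current_stream().wait_stream(s)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_y = static_x @ weight                       # recorded, not executed eagerly

    static_x.copy_(torch.randn_like(static_x))             # refill the static buffer in place
    graph.replay()                                         # rerun the captured kernels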

7 files changed: 132 additions, 87 deletions

vllm/model_executor/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,11 +1,13 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 from vllm.model_executor.mamba_metadata import MambaCacheParams, RequestInfo, MambaCache
+from vllm.model_executor.utils import set_random_seed
 
 __all__ = [
     "SamplingMetadata",
     "set_random_seed",
     "MambaCacheParams",
     "RequestInfo",
     "MambaCache",
+    "RequestInfo"
 ]

vllm/model_executor/input_metadata.py

Lines changed: 1 addition & 2 deletions
@@ -2,7 +2,7 @@
 
 import torch
 
-from vllm.model_executor.mamba_metadata import MambaCache, RequestInfo
+from vllm.model_executor.mamba_metadata import RequestInfo
 
 
 class InputMetadata:
@@ -45,7 +45,6 @@ def __init__(
         # Set during the execution of the first attention op.
         # FIXME(woosuk): This is a hack.
         self.attn_bias = None
-        self.mamba_cache_batch: List[MambaCache] = []
         self.requests_info = requests_info
 
     def __repr__(self) -> str:

vllm/model_executor/mamba_metadata.py

Lines changed: 1 addition & 13 deletions
@@ -5,7 +5,7 @@
 
 @dataclass
 class MambaCacheParams:
-    seqlen_offset: int = 0
+    is_prompt: bool = False
     conv_state: torch.Tensor = torch.Tensor()
     ssm_state: torch.Tensor = torch.Tensor()
 
@@ -16,15 +16,3 @@ class RequestInfo:
     n: int = 1
 
 
-class MambaCache:
-    def __init__(
-        self,
-        request_info: RequestInfo,
-        layer_idx2mamba_cache: Optional[Dict[int, MambaCacheParams]] = None
-    ) -> None:
-        self.request_info = request_info
-        if layer_idx2mamba_cache is None:
-            self.layer_idx2mamba_cache = defaultdict(MambaCacheParams)
-        else:
-            self.layer_idx2mamba_cache = layer_idx2mamba_cache
-
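
As a quick orientation (not part of the diff): with the per-request MambaCache class gone, the remaining MambaCacheParams is just a thin view onto whole-batch state tensors plus an is_prompt flag that replaces the old seqlen_offset > 0 check. A minimal usage sketch with made-up sizes, assuming this fork of vLLM is installed:

import torch
from vllm.model_executor.mamba_metadata import MambaCacheParams

# Hypothetical sizes for illustration only.
batch_size, inner_dim, d_conv, d_state = 4, 8192, 4, 16

# Whole-batch state for one layer; the batch dimension comes first.
conv_state = torch.zeros(batch_size, inner_dim, d_conv)
ssm_state = torch.zeros(batch_size, inner_dim, d_state)

prefill = MambaCacheParams(is_prompt=True, conv_state=conv_state, ssm_state=ssm_state)
decode = MambaCacheParams(is_prompt=False, conv_state=conv_state, ssm_state=ssm_state)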

vllm/model_executor/models/jamba.py

Lines changed: 38 additions & 52 deletions
@@ -33,7 +33,7 @@
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
+from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
 from mamba_ssm.ops.triton.selective_state_update import selective_state_update
 from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
 
@@ -114,7 +114,7 @@ def mamba_forward(self, hidden_states: torch.Tensor, cache_params: MambaCachePar
 
         # 2. Convolution sequence transformation
         conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
-        if cache_params is not None and cache_params.seqlen_offset > 0:
+        if cache_params is not None and not cache_params.is_prompt:
             hidden_states = causal_conv1d_update(
                 hidden_states.squeeze(-1),
                 cache_params.conv_state,
@@ -154,7 +154,7 @@ def mamba_forward(self, hidden_states: torch.Tensor, cache_params: MambaCachePar
         A = -torch.exp(self.A_log.float())
         # 3.c perform the recurrence y ← SSM(A, B, C)(x)
         time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
-        if cache_params is not None and cache_params.seqlen_offset > 0:
+        if cache_params is not None and not cache_params.is_prompt:
             scan_outputs = selective_state_update(
                 cache_params.ssm_state,
                 hidden_states[..., 0],
@@ -187,50 +187,14 @@ def mamba_forward(self, hidden_states: torch.Tensor, cache_params: MambaCachePar
         contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
         return contextualized_states
 
-    def forward(self, hidden_states: torch.Tensor, input_metadata: InputMetadata):
-        if input_metadata.is_prompt:
-            batch_size = hidden_states.shape[0]
-            conv_cache = torch.zeros(
-                batch_size,
-                self.config.mamba_expand * self.config.hidden_size,
-                self.config.mamba_d_conv,
-                device=hidden_states.device,
-                dtype=hidden_states.dtype
-            )
-            ssm_cache = torch.zeros(
-                batch_size,
-                self.config.mamba_expand * self.config.hidden_size,
-                self.config.mamba_d_state,
-                device=hidden_states.device,
-                dtype=hidden_states.dtype
-            )
-            cache = MambaCacheParams(0, conv_cache, ssm_cache)
-        else:
-            for mamba_cache_request in input_metadata.mamba_cache_batch:
-                # check if batch size of cache fits "n"
-                n = mamba_cache_request.request_info.n
-                if mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].conv_state.shape[0] < n:
-                    expanded_dims_conv = (n, *mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].conv_state.shape[1:])
-                    conv_state = mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].conv_state.expand(*expanded_dims_conv)
-                    expanded_dims_ssm = (n, *mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].ssm_state.shape[1:])
-                    ssm_state = mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].ssm_state.expand(*expanded_dims_ssm)
-                    mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].conv_state = conv_state
-                    mamba_cache_request.layer_idx2mamba_cache[self.layer_idx].ssm_state = ssm_state
-
-            # mamba requires concatenated cache
-            conv_state = torch.concat([req.layer_idx2mamba_cache[self.layer_idx].conv_state for req in input_metadata.mamba_cache_batch], dim=0)
-            ssm_state = torch.concat([req.layer_idx2mamba_cache[self.layer_idx].ssm_state for req in input_metadata.mamba_cache_batch], dim=0)
-            cache = MambaCacheParams(1, conv_state, ssm_state)
+    def forward(self, hidden_states: torch.Tensor, input_metadata: InputMetadata, conv_state: torch.Tensor, ssm_state: torch.Tensor):
+        cache = MambaCacheParams(
+            input_metadata.is_prompt,
+            conv_state=conv_state[self.layer_idx],
+            ssm_state=ssm_state[self.layer_idx]
+        )
         hidden_states = self.mamba_forward(hidden_states, cache_params=cache)
 
-        # split cache back to individual requests
-        sample_id = 0
-        for req_mamba_metadata in input_metadata.mamba_cache_batch:
-            n = 1 if input_metadata.is_prompt else req_mamba_metadata.request_info.n
-            req_mamba_metadata.layer_idx2mamba_cache[self.layer_idx].conv_state=cache.conv_state[sample_id:sample_id + n]
-            req_mamba_metadata.layer_idx2mamba_cache[self.layer_idx].ssm_state=cache.ssm_state[sample_id:sample_id + n]
-            sample_id += n
-
         return hidden_states
 
 
@@ -352,6 +316,8 @@ def forward(self,
                 hidden_states: torch.Tensor,
                 input_metadata: InputMetadata,
                 residual: Optional[torch.Tensor],
+                conv_state: torch.Tensor,
+                ssm_state: torch.Tensor,
                 **kwargs):
 
         if residual is None:
@@ -360,7 +326,12 @@ def forward(self,
         else:
             hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
-        hidden_states = self.mamba(hidden_states, input_metadata)
+        hidden_states = self.mamba(
+            hidden_states,
+            input_metadata,
+            conv_state,
+            ssm_state
+        )
         # Fully Connected
         hidden_states, residual = self.pre_moe_layernorm(
             hidden_states, residual)
@@ -433,7 +404,8 @@ def self_attention(self,
                        positions: torch.Tensor,
                        hidden_states: torch.Tensor,
                        kv_cache: KVCache,
-                       input_metadata: InputMetadata) -> torch.Tensor:
+                       input_metadata: InputMetadata,
+                       **kwargs) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         # TODO - add embedding flag
@@ -450,7 +422,8 @@ def forward(
         hidden_states: torch.Tensor,
         kv_cache: KVCache,
         input_metadata: InputMetadata,
-        residual: Optional[torch.Tensor]):
+        residual: Optional[torch.Tensor],
+        **kwargs):
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
@@ -524,6 +497,8 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
         residual = None
@@ -534,7 +509,10 @@ def forward(
                 hidden_states=hidden_states,
                 kv_cache=kv_caches[i],
                 input_metadata=input_metadata,
-                residual=residual)
+                residual=residual,
+                conv_state=conv_state,
+                ssm_state=ssm_state
+            )
         hidden_states, _ = self.final_layernorm(hidden_states, residual)
         return hidden_states
 
@@ -593,9 +571,17 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   input_metadata)
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor
+    ):
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            kv_caches,
+            input_metadata,
+            conv_state,
+            ssm_state
+        )
         return hidden_states
 
     def sample(
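
A rough sketch of the new calling convention (illustrative, not code from this PR): the caller preallocates one conv buffer and one ssm buffer spanning all Mamba layers and the maximum batch size, then threads the same two tensors through every forward call so that CUDA graph capture sees stable addresses. Per-sample shapes follow the deleted allocation code above (mamba_expand * hidden_size by mamba_d_conv / mamba_d_state); the leading layer dimension is inferred from the conv_state[self.layer_idx] indexing, and all concrete numbers here are assumptions.

import torch

# All values below are illustrative assumptions, not taken from the Jamba config.
num_mamba_layers = 28               # layers that carry Mamba state
max_batch_size = 8                  # largest batch a captured CUDA graph will serve
intermediate_size = 2 * 4096        # mamba_expand * hidden_size
d_conv, d_state = 4, 16             # mamba_d_conv, mamba_d_state

device = "cuda" if torch.cuda.is_available() else "cpu"
conv_state = torch.zeros(num_mamba_layers, max_batch_size, intermediate_size, d_conv,
                         dtype=torch.float16, device=device)
ssm_state = torch.zeros(num_mamba_layers, max_batch_size, intermediate_size, d_state,
                        dtype=torch.float16, device=device)

# Matching the new top-level signature in this diff (sketch only):
# hidden_states = model(input_ids, positions, kv_caches, input_metadata,
#                       conv_state, ssm_state)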

vllm/worker/cache_engine.py

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ def get_cache_block_size(
 
         if is_mamba:
             attention_period = model_config.hf_config.attn_layer_period
-            num_layers = num_layers // attention_period
+            num_layers = max(num_layers // attention_period, 1)
 
         key_cache_block = cache_config.block_size * num_heads * head_size
         value_cache_block = key_cache_block
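
The max(..., 1) guard corresponds to the "min 1 attention layer" item in the commit message: with plain integer division, a configuration whose layer count is smaller than attn_layer_period would report zero attention layers and therefore a zero-sized KV cache block. A tiny worked example with made-up numbers:

# Illustrative numbers only.
def kv_cache_layers(num_layers: int, attention_period: int) -> int:
    # Old behaviour: num_layers // attention_period, which can be 0.
    # New behaviour: never fewer than one attention layer's worth of KV cache.
    return max(num_layers // attention_period, 1)

print(kv_cache_layers(32, 8))  # 4
print(kv_cache_layers(4, 8))   # old code gave 0; the guard keeps 1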
