
Commit c89a532

tdoublep authored and xuebwang-amd committed
[V1] Remove V0 code paths for Hybrid models (vllm-project#25400)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent 828ed45 commit c89a532

31 files changed, +359 -2303 lines changed

tests/models/language/generation/test_hybrid.py

Lines changed: 19 additions & 36 deletions
@@ -20,7 +20,9 @@
 SSM_MODELS = [
     "state-spaces/mamba-130m-hf",
     "tiiuae/falcon-mamba-tiny-dev",
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
+    # mamba2-codestral in transformers is broken pending:
+    # https://github.com/huggingface/transformers/pull/40861
+    #"yujiepan/mamba2-codestral-v0.1-tiny-random",
 ]
 
 HYBRID_MODELS = [
@@ -31,18 +33,7 @@
     "ibm-granite/granite-4.0-tiny-preview",
     "tiiuae/Falcon-H1-0.5B-Base",
     "LiquidAI/LFM2-1.2B",
-]
-
-V1_SUPPORTED_MODELS = [
-    "state-spaces/mamba-130m-hf",
-    "ai21labs/Jamba-tiny-dev",
-    "pfnet/plamo-2-1b",
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    "Zyphra/Zamba2-1.2B-instruct",
-    "hmellor/tiny-random-BambaForCausalLM",
-    "ibm-granite/granite-4.0-tiny-preview",
-    "tiiuae/Falcon-H1-0.5B-Base",
-    "LiquidAI/LFM2-1.2B",
+    "tiny-random/qwen3-next-moe",
 ]
 
 FULL_CUDA_GRAPH_MODELS = [
@@ -51,10 +42,6 @@
     "Zyphra/Zamba2-1.2B-instruct",
 ]
 
-V0_UNSUPPORTED_MODELS = [
-    "LiquidAI/LFM2-1.2B",
-]
-
 FP32_STATE_MODELS = [
     "state-spaces/mamba-130m-hf",
     "Zyphra/Zamba2-1.2B-instruct",
@@ -88,20 +75,16 @@ def test_models(
     hf_outputs = hf_model.generate_greedy_logprobs_limit(
         example_prompts, max_tokens, num_logprobs)
 
-    if model in V1_SUPPORTED_MODELS:
-        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-    else:
-        vllm_v1_outputs = None
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
-    if model in V1_SUPPORTED_MODELS:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v1_outputs,
-            name_0="hf",
-            name_1="vllm-v1",
-        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@@ -299,14 +282,14 @@ def test_full_cuda_graph(
         example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v1_outputs,
+        outputs_1_lst=vllm_outputs,
         name_0="hf",
-        name_1="vllm-v1",
+        name_1="vllm",
     )
 
 
@@ -340,12 +323,12 @@ def test_fp32_cache_state(
     with vllm_runner(model,
                      max_num_seqs=MAX_NUM_SEQS,
                      **{cache_dtype_param: "float32"}) as vllm_model:
-        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v1_outputs,
+        outputs_1_lst=vllm_outputs,
         name_0="hf",
-        name_1="vllm-v1",
+        name_1="vllm",
     )
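
With the V1_SUPPORTED_MODELS and V0_UNSUPPORTED_MODELS gates removed above, every entry in HYBRID_MODELS takes the same path: generate reference logprobs with the HF runner, generate with vLLM, and compare. A minimal standalone sketch of that flow follows; the hf_runner/vllm_runner/example_prompts pytest fixtures and check_logprobs_close come from vLLM's test suite, while the import path, model choice, and parameter values are illustrative assumptions rather than the contents of test_hybrid.py.

# Sketch only: the single code path hybrid-model tests now follow.
# Assumes vLLM's hf_runner/vllm_runner/example_prompts pytest fixtures;
# the import path and parametrize values below are illustrative.
import pytest

from tests.models.utils import check_logprobs_close  # path assumed

MAX_NUM_SEQS = 4  # illustrative value; test_hybrid.py defines its own constant


@pytest.mark.parametrize("model", ["ibm-granite/granite-4.0-tiny-preview"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_hybrid_matches_hf(hf_runner, vllm_runner, example_prompts, model,
                           max_tokens, num_logprobs):
    # Reference logprobs from the Hugging Face implementation.
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    # vLLM logprobs: no V1_SUPPORTED_MODELS gate, every model is exercised.
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )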

tests/models/registry.py

Lines changed: 6 additions & 7 deletions
@@ -312,14 +312,12 @@ def check_available_online(
     "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
     "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
     "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
-    "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning",  # noqa: E501
-                                            trust_remote_code=True,
-                                            v0_only=True,
-                                            max_model_len=10240),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         trust_remote_code=True),
+                                         max_transformers_version="4.55.4",
+                                         transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
+                                         trust_remote_code=True),
     "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                        max_transformers_version="4.53",
                                        transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers",  # noqa: E501
@@ -330,7 +328,8 @@ def check_available_online(
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
     "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                            min_transformers_version="4.56.2"),
+                                            extras={"tiny-random": "tiny-random/qwen3-next-moe"},  # noqa: E501
+                                            min_transformers_version="4.56.3"),
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct",  # noqa: E501
                                           trust_remote_code=True,
@@ -644,7 +643,7 @@ def check_available_online(
                                      trust_remote_code=True,
                                      speculative_model="XiaomiMiMo/MiMo-7B-RL"),
     "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
-                                    min_transformers_version="4.56.2"),
+                                    min_transformers_version="4.56.3"),
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
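
The entries above all follow one pattern: a default checkpoint plus optional constraints (version bounds, a reason string, trust_remote_code) and an extras mapping for smaller CI variants such as the new tiny-random Qwen3-Next checkpoint. A hedged sketch of that pattern follows; _HfExamplesInfoSketch is a stand-in for the real _HfExamplesInfo and the model IDs are invented, only the field names mirror those used in registry.py.

# Illustration of the registry entry pattern only. _HfExamplesInfoSketch is a
# stand-in for _HfExamplesInfo in tests/models/registry.py; model IDs invented.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class _HfExamplesInfoSketch:
    default: str  # default HF checkpoint
    extras: dict[str, str] = field(default_factory=dict)
    min_transformers_version: Optional[str] = None
    max_transformers_version: Optional[str] = None
    transformers_version_reason: Optional[str] = None
    trust_remote_code: bool = False


_EXAMPLE_MODELS_SKETCH = {
    "MyHybridForCausalLM": _HfExamplesInfoSketch(
        "my-org/my-hybrid-model",
        # Tiny checkpoint CI can pull instead of the full-size default,
        # mirroring the "tiny-random" extra added for Qwen3NextForCausalLM.
        extras={"tiny-random": "my-org/tiny-random-hybrid"},
        min_transformers_version="4.56.3",
        trust_remote_code=True),
}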

vllm/model_executor/layers/mamba/abstract.py

Lines changed: 1 addition & 4 deletions
@@ -20,10 +20,7 @@ class MambaBase(AttentionLayerBase):
 
     # Contains the KV cache (mamba state) for the layer
     # in the shape specified by `self.get_state_shape`.
-    # The outer list is for v0 PP virtual engine. Though this code path
-    # only runs for v1, we have to do this to unify with the interface
-    # of Attention + v0 PP.
-    kv_cache: list[Iterable[torch.Tensor]]
+    kv_cache: tuple[torch.Tensor, ...]
 
     @abstractmethod
     def get_state_shape(self) -> Iterable[tuple[int, ...]]:
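
For layer implementers the narrowed annotation means a Mamba-style layer holds a flat tuple of state tensors, typically one per shape returned by get_state_shape(), with no outer per-virtual-engine list left over from V0 pipeline parallelism. A toy sketch under that assumption (the class name, shapes, and bind_state helper are invented; the real interface is MambaBase above):

# Toy illustration of the tuple-typed kv_cache; not vLLM's allocator logic.
from collections.abc import Iterable

import torch


class ToyMambaLayer:
    # One state tensor per shape reported by get_state_shape().
    kv_cache: tuple[torch.Tensor, ...]

    def get_state_shape(self) -> Iterable[tuple[int, ...]]:
        # e.g. a conv state and an SSM state (sizes invented).
        return ((16, 4), (16, 8))

    def bind_state(self, num_blocks: int) -> None:
        # In vLLM the cache allocator sizes and places these tensors;
        # here we simply allocate zeros of the declared shapes.
        self.kv_cache = tuple(
            torch.zeros((num_blocks, *shape))
            for shape in self.get_state_shape())


layer = ToyMambaLayer()
layer.bind_state(num_blocks=2)
assert len(layer.kv_cache) == 2
assert layer.kv_cache[0].shape == (2, 16, 4)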

vllm/model_executor/layers/mamba/linear_attn.py

Lines changed: 40 additions & 67 deletions
@@ -15,7 +15,6 @@
 from einops import rearrange
 from torch import nn
 
-from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
@@ -42,8 +41,6 @@
 import torch
 import torch.distributed
 
-from vllm.model_executor.models.minimax_cache import MinimaxCacheParams
-
 
 class MiniMaxText01RMSNormTP(CustomOp):
     name = "MiniMaxText01RMSNormTP"
@@ -225,11 +222,10 @@ def __init__(
                                       self.tp_heads:(self.tp_rank + 1) *
                                       self.tp_heads].contiguous()
 
-        if envs.VLLM_USE_V1:
-            compilation_config = get_current_vllm_config().compilation_config
-            if prefix in compilation_config.static_forward_context:
-                raise ValueError(f"Duplicate layer name: {prefix}")
-            compilation_config.static_forward_context[prefix] = self
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
 
     @staticmethod
     def weight_direct_load(param: torch.Tensor,
@@ -268,8 +264,7 @@ def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor,
                 break
             if _prefill_idx >= len(state_indices_tensor):
                 break
-            # prefills are packed at end of batch in V1
-            offset = attn_metadata.num_decode_tokens if envs.VLLM_USE_V1 else 0
+            offset = attn_metadata.num_decode_tokens
             _start = attn_metadata.query_start_loc[offset + _prefill_idx]
             _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1]
             slot_id = state_indices_tensor[offset + _prefill_idx]
@@ -291,10 +286,7 @@ def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor,
             hidden_decode = self._decode_infer(q, k, v, kv_cache,
                                                state_indices_tensor,
                                                attn_metadata)
-            if envs.VLLM_USE_V1:
-                hidden.insert(0, hidden_decode)
-            else:
-                hidden.append(hidden_decode)
+            hidden.insert(0, hidden_decode)
 
         if not hidden:
             return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype)
@@ -304,40 +296,28 @@ def _prefill_and_mix_infer(self, q, k, v, kv_cache, state_indices_tensor,
 
     def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor,
                       attn_metadata):
-        if not envs.VLLM_USE_V1:
-            q = q[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous()
-            k = k[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous()
-            v = v[attn_metadata.num_prefill_tokens:].unsqueeze(2).contiguous()
-            num_prefills = getattr(attn_metadata, "num_prefills", 0)
-            slot_id = state_indices_tensor[num_prefills:]
-        else:
-            q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-            k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-            v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
-            slot_id = state_indices_tensor[:attn_metadata.num_decodes]
+        q = q[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        k = k[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        v = v[:attn_metadata.num_decode_tokens].unsqueeze(2).contiguous()
+        slot_id = state_indices_tensor[:attn_metadata.num_decodes]
         hidden = linear_decode_forward_triton(q, k, v, kv_cache, self.tp_slope,
                                               slot_id, 32)
         return hidden
 
     def forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
-                positions: torch.Tensor,
-                kv_caches: MinimaxCacheParams) -> None:
-        if not envs.VLLM_USE_V1:
-            self._forward(hidden_states, output, positions, kv_caches)
-        else:
-            torch.ops.vllm.linear_attention(
-                hidden_states,
-                output,
-                positions,
-                self.prefix,
-            )
+                positions: torch.Tensor) -> None:
+        torch.ops.vllm.linear_attention(
+            hidden_states,
+            output,
+            positions,
+            self.prefix,
+        )
 
     def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
-                 positions: torch.Tensor,
-                 kv_caches: Optional[MinimaxCacheParams]) -> None:
+                 positions: torch.Tensor) -> None:
         forward_context = get_forward_context()
         attn_metadata: AttentionMetadata = forward_context.attn_metadata
-        if envs.VLLM_USE_V1 and attn_metadata is not None:
+        if attn_metadata is not None:
             assert isinstance(attn_metadata, dict)
             attn_metadata = attn_metadata[self.prefix]
             assert isinstance(attn_metadata, LinearAttentionMetadata)
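
The rewritten forward above always dispatches through torch.ops.vllm.linear_attention, which resolves the layer by its prefix from the forward context and calls its _forward; the self-registration into static_forward_context in __init__ is what makes that lookup possible. Below is a stripped-down sketch of this name-based dispatch, using a plain dict in place of vLLM's static_forward_context/no_compile_layers and skipping the actual custom-op registration.

# Simplified sketch of the name-based dispatch pattern used above. A global
# registry maps layer prefixes to layer objects; the op-like free function
# resolves the prefix back to the layer and invokes its _forward. This mirrors
# the structure only, not vLLM's actual registration API.
import torch

_FORWARD_CONTEXT: dict[str, "SketchLinearAttention"] = {}


class SketchLinearAttention:

    def __init__(self, prefix: str) -> None:
        if prefix in _FORWARD_CONTEXT:
            raise ValueError(f"Duplicate layer name: {prefix}")
        _FORWARD_CONTEXT[prefix] = self
        self.prefix = prefix

    def forward(self, hidden_states: torch.Tensor,
                output: torch.Tensor) -> None:
        # In vLLM this call goes through torch.ops.vllm.linear_attention so
        # torch.compile sees an opaque op; here we call the helper directly.
        sketch_linear_attention(hidden_states, output, self.prefix)

    def _forward(self, hidden_states: torch.Tensor,
                 output: torch.Tensor) -> None:
        # Stand-in computation; the real layer runs linear attention kernels.
        output.copy_(hidden_states)


def sketch_linear_attention(hidden_states: torch.Tensor,
                            output: torch.Tensor, layer_name: str) -> None:
    # Look the layer back up by name, exactly as linear_attention() does with
    # forward_context.no_compile_layers[layer_name].
    layer = _FORWARD_CONTEXT[layer_name]
    layer._forward(hidden_states, output)


layer = SketchLinearAttention("model.layers.0.linear_attn")
x = torch.randn(4, 8)
out = torch.empty_like(x)
layer.forward(x, out)
assert torch.equal(out, x)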
@@ -351,32 +331,26 @@ def _forward(self, hidden_states: torch.Tensor, output: torch.Tensor,
         qkvact = torch.nn.functional.silu(qkv32)
         qkvact = qkvact.view((qkv.shape[0], self.tp_heads, -1))
         q, k, v = torch.split(qkvact, [self.head_dim] * 3, dim=-1)
-        if envs.VLLM_USE_V1:
-            if attn_metadata is not None:
-                kv_cache = self.kv_cache[forward_context.virtual_engine][0]
-                state_indices_tensor = attn_metadata.state_indices_tensor
-
-                num_prefills = getattr(attn_metadata, "num_prefills", 0)
-                if num_prefills > 0:
-                    num_decode_tokens = getattr(attn_metadata,
-                                                "num_decode_tokens", 0)
-                    for prefill_idx in range(num_prefills):
-                        q_start = attn_metadata.query_start_loc[
-                            num_decode_tokens + prefill_idx]
-                        q_end = attn_metadata.query_start_loc[num_decode_tokens
-                                                              + prefill_idx +
-                                                              1]
-                        query_len = q_end - q_start
-                        context_len = attn_metadata.seq_lens[
-                            num_decode_tokens + prefill_idx] - query_len
-                        if context_len == 0:
-                            block_to_clear = state_indices_tensor[
-                                num_decode_tokens + prefill_idx]
-                            kv_cache[block_to_clear, ...] = 0
-        else:
-            assert kv_caches is not None
-            kv_cache = kv_caches.minimax_cache
-            state_indices_tensor = kv_caches.state_indices_tensor
+        if attn_metadata is not None:
+            kv_cache = self.kv_cache[forward_context.virtual_engine][0]
+            state_indices_tensor = attn_metadata.state_indices_tensor
+
+            num_prefills = getattr(attn_metadata, "num_prefills", 0)
+            if num_prefills > 0:
+                num_decode_tokens = getattr(attn_metadata, "num_decode_tokens",
+                                            0)
+                for prefill_idx in range(num_prefills):
+                    q_start = attn_metadata.query_start_loc[num_decode_tokens +
+                                                            prefill_idx]
+                    q_end = attn_metadata.query_start_loc[num_decode_tokens +
+                                                          prefill_idx + 1]
+                    query_len = q_end - q_start
+                    context_len = attn_metadata.seq_lens[
+                        num_decode_tokens + prefill_idx] - query_len
+                    if context_len == 0:
+                        block_to_clear = state_indices_tensor[num_decode_tokens
+                                                              + prefill_idx]
+                        kv_cache[block_to_clear, ...] = 0
 
         decode_only = getattr(attn_metadata, "num_prefills", 0) == 0
         if attn_metadata is None:
@@ -410,8 +384,7 @@ def linear_attention(
     self = forward_context.no_compile_layers[layer_name]
     self._forward(hidden_states=hidden_states,
                   output=output,
-                  positions=positions,
-                  kv_caches=None)
+                  positions=positions)
 
 
 def linear_attention_fake(
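
The offset arithmetic in _prefill_and_mix_infer and _forward relies on the batch layout where decode tokens are packed at the front of the flattened batch and prefill sequences follow, so prefill i is addressed via query_start_loc[num_decode_tokens + i]. A small numeric sketch of that indexing (all values invented):

# Numeric sketch of the decode-first batch layout assumed above.
# Three decode requests (one token each) are followed by two prefills of
# lengths 4 and 2; every value here is invented for illustration.
import torch

num_decode_tokens = 3          # decodes contribute one token each
prefill_lens = [4, 2]

# Cumulative per-request token offsets: decode entries first, then prefills.
query_start_loc = torch.tensor([0, 1, 2, 3, 7, 9])
tokens = torch.arange(int(query_start_loc[-1]))  # stand-in token batch

# Decode path: the first num_decode_tokens tokens.
decode_tokens = tokens[:num_decode_tokens]
assert decode_tokens.tolist() == [0, 1, 2]

# Prefill path: skip the decode entries when indexing query_start_loc.
offset = num_decode_tokens
for prefill_idx, expected_len in enumerate(prefill_lens):
    start = query_start_loc[offset + prefill_idx]
    end = query_start_loc[offset + prefill_idx + 1]
    assert int(end - start) == expected_len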
