
Commit daf472f

Merge branch 'main' into pali-mm-update
2 parents 175eb2a + 73e0225 commit daf472f


31 files changed, +357 -76 lines


tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 3 additions & 2 deletions
@@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,

     # Regression tests for https://github.com/vllm-project/vllm/issues/10648

-    # Number of image tags is greater than the number of images provided
-    prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
+    # Number of groups of image tokens is greater than the number of images
+    # provided (the whitespace between the tags is necessary)
+    prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images"  # noqa: E501
     image = stop_sign
     with pytest.raises(ValueError):
         vllm_model.generate_greedy_logprobs([prompt],
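For context, a hedged sketch of the behaviour this test asserts, written outside the real test harness: the `llm` wrapper and the keyword arguments below are assumptions, since the original call is truncated in this diff.

import pytest

def check_image_token_mismatch(llm, stop_sign_image):
    # Two groups of image tokens but only one image: vLLM is expected to
    # reject the request with a ValueError (hypothetical wrapper/arguments).
    prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images"
    with pytest.raises(ValueError):
        llm.generate_greedy_logprobs([prompt],
                                     max_tokens=8,
                                     num_logprobs=5,
                                     images=[stop_sign_image])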

vllm/config.py

Lines changed: 10 additions & 6 deletions
@@ -1039,6 +1039,11 @@ def supported_runner_types(self) -> Set[RunnerType]:
     def runner_type(self) -> RunnerType:
         return _TASK_RUNNER[self.task]

+    @property
+    def is_v1_compatible(self) -> bool:
+        architectures = getattr(self.hf_config, "architectures", [])
+        return ModelRegistry.is_v1_compatible(architectures)
+

 class CacheConfig:
     """Configuration for the KV cache.
@@ -1978,13 +1983,12 @@ def maybe_create_spec_config(
             if num_speculative_tokens is None:
                 # Default to max value defined in draft model config.
                 num_speculative_tokens = n_predict
-            elif num_speculative_tokens > n_predict:
-                # Verify provided value doesn't exceed the maximum
-                # supported by the draft model.
+            elif num_speculative_tokens > n_predict and \
+                    num_speculative_tokens % n_predict != 0:
+                # Ensure divisibility for MTP module reuse.
                 raise ValueError(
-                    "This speculative model supports a maximum of "
-                    f"num_speculative_tokens={n_predict}, but "
-                    f"{num_speculative_tokens=} was provided.")
+                    f"{num_speculative_tokens=} must be divisible by "
+                    f"{n_predict=}")

         speculative_draft_tensor_parallel_size = \
             SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
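A minimal standalone sketch of the relaxed check (the helper name is illustrative, not part of the diff): values above n_predict are no longer rejected outright, only those that are not an exact multiple, which is what allows the draft model's MTP modules to be reused cyclically.

def validate_num_speculative_tokens(num_speculative_tokens: int,
                                    n_predict: int) -> None:
    # Mirrors the new condition in SpeculativeConfig.maybe_create_spec_config.
    if num_speculative_tokens > n_predict and \
            num_speculative_tokens % n_predict != 0:
        raise ValueError(f"{num_speculative_tokens=} must be divisible by "
                         f"{n_predict=}")

validate_num_speculative_tokens(2, 1)  # accepted: exact multiple of n_predict
validate_num_speculative_tokens(4, 2)  # accepted: the MTP modules are reused twice
try:
    validate_num_speculative_tokens(3, 2)
except ValueError as e:
    print(e)  # num_speculative_tokens=3 must be divisible by n_predict=2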

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 20 additions & 0 deletions
@@ -737,3 +737,23 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
         # If we are in the row parallel case (down_proj)
         else:
             param_data[expert_id] = loaded_weight
+
+    def extra_repr(self) -> str:
+
+        s = (
+            f"global_num_experts={self.global_num_experts}, "
+            f"local_num_experts={self.local_num_experts}, "
+            f"top_k={self.top_k}, "
+            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
+            f"tp_size={self.tp_size},\n"
+            f"ep_size={self.ep_size}, "
+            f"reduce_results={self.reduce_results}, "
+            f"renormalize={self.renormalize}, "
+            f"use_grouped_topk={self.use_grouped_topk}")
+
+        if self.use_grouped_topk:
+            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
+
+        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
+
+        return s
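extra_repr is the standard torch.nn.Module hook whose return value repr() places inside the module's parentheses, so print(model) will now show each FusedMoE layer's expert configuration. A minimal sketch of the mechanism with a toy module (not the FusedMoE class itself):

import torch.nn as nn

class ToyMoE(nn.Module):
    def __init__(self, global_num_experts: int, top_k: int):
        super().__init__()
        self.global_num_experts = global_num_experts
        self.top_k = top_k

    def extra_repr(self) -> str:
        # nn.Module.__repr__ inserts this string between the parentheses.
        return f"global_num_experts={self.global_num_experts}, top_k={self.top_k}"

print(ToyMoE(64, 6))  # -> ToyMoE(global_num_experts=64, top_k=6)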

vllm/model_executor/models/__init__.py

Lines changed: 5 additions & 2 deletions
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0

 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
-                         SupportsPP, has_inner_state, supports_lora,
-                         supports_multimodal, supports_pp)
+                         SupportsPP, SupportsV0Only, has_inner_state,
+                         supports_lora, supports_multimodal, supports_pp,
+                         supports_v0_only)
 from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
                               is_pooling_model, is_text_generation_model)
 from .registry import ModelRegistry
@@ -21,4 +22,6 @@
     "supports_multimodal",
     "SupportsPP",
     "supports_pp",
+    "SupportsV0Only",
+    "supports_v0_only",
 ]
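A hedged sketch of how the newly exported marker might be used; it assumes supports_v0_only, like the other supports_* helpers, accepts either a model class or an instance, and the toy classes are illustrative only.

import torch.nn as nn

from vllm.model_executor.models import SupportsV0Only, supports_v0_only

class ToyEncoderDecoder(nn.Module, SupportsV0Only):
    """Stand-in for a model that has not been ported to the V1 engine."""

class ToyDecoderOnly(nn.Module):
    """Stand-in for a model with no V0-only restriction."""

print(supports_v0_only(ToyEncoderDecoder))  # expected: True
print(supports_v0_only(ToyDecoderOnly))     # expected: False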

vllm/model_executor/models/bamba.py

Lines changed: 3 additions & 2 deletions
@@ -32,7 +32,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType

-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
+                         SupportsV0Only)
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -366,7 +367,7 @@ def forward(


 class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid):
+                       IsHybrid, SupportsV0Only):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/bart.py

Lines changed: 2 additions & 1 deletion
@@ -43,6 +43,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

+from .interfaces import SupportsV0Only
 from .utils import maybe_prefix

 logger = logging.get_logger(__name__)
@@ -776,7 +777,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
         return decoder_outputs


-class BartForConditionalGeneration(nn.Module):
+class BartForConditionalGeneration(nn.Module, SupportsV0Only):
     base_model_prefix = "model"

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

vllm/model_executor/models/bert.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 from vllm.transformers_utils.config import (
     get_cross_encoder_activation_function)

-from .interfaces import SupportsCrossEncoding
+from .interfaces import SupportsCrossEncoding, SupportsV0Only
 from .utils import WeightsMapper, maybe_prefix


@@ -385,7 +385,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params


-class BertEmbeddingModel(nn.Module):
+class BertEmbeddingModel(nn.Module, SupportsV0Only):
     """A model that uses Bert to provide embedding functionalities.

    This class encapsulates the BertModel and provides an interface for

vllm/model_executor/models/deepseek_mtp.py

Lines changed: 9 additions & 5 deletions
@@ -87,7 +87,7 @@ def forward(
             hidden_states=hidden_states,
             residual=None)
         hidden_states = residual + hidden_states
-        return self.shared_head(hidden_states)
+        return hidden_states


 class DeepSeekMultiTokenPredictor(nn.Module):
@@ -121,12 +121,13 @@ def forward(
         inputs_embeds: Optional[torch.Tensor] = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)](
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
             input_ids,
             positions,
             previous_hidden_states,
             inputs_embeds,
-            spec_step_idx,
+            current_step_idx,
         )

     def compute_logits(
@@ -135,9 +136,12 @@ def compute_logits(
         sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        mtp_layer = self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        mtp_layer = self.layers[str(self.mtp_start_layer_idx +
+                                    current_step_idx)]
         logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       hidden_states, sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states),
+                                       sampling_metadata)
         return logits

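A minimal illustration of the new index arithmetic in DeepSeekMultiTokenPredictor: with the modulo, speculative steps beyond the number of MTP layers wrap around and reuse the existing layers instead of indexing past the end of self.layers. The layer index 61 below is hypothetical.

num_mtp_layers = 1        # draft model ships a single MTP module
mtp_start_layer_idx = 61  # hypothetical index of that module

for spec_step_idx in range(4):
    current_step_idx = spec_step_idx % num_mtp_layers
    layer_key = str(mtp_start_layer_idx + current_step_idx)
    print(spec_step_idx, "->", layer_key)  # every step resolves to layer "61"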
vllm/model_executor/models/florence2.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsMultiModal
+from .interfaces import SupportsMultiModal, SupportsV0Only
 from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings


@@ -651,7 +651,7 @@ def forward(
         return decoder_outputs


-class Florence2LanguageForConditionalGeneration(nn.Module):
+class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/gritlm.py

Lines changed: 3 additions & 1 deletion
@@ -19,6 +19,8 @@
                                            PoolingSequenceGroupOutput)
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

+from .interfaces import SupportsV0Only
+
 logger = init_logger(__name__)


@@ -177,7 +179,7 @@ def forward(
         return PoolerOutput(outputs=pooled_outputs)


-class GritLM(LlamaForCausalLM):
+class GritLM(LlamaForCausalLM, SupportsV0Only):
     """This class implements the embedding model for parasail-ai/GritLM-7B-vllm.

     The class inherits from LlamaForCausalLM and provides a custom pooling
