Merged

Commits (32)
a742162
Update transformers to `v4.54.1`
hmellor Jul 30, 2025
97d7f25
Use public method to set attn implementation in Transformers backend
hmellor Jul 30, 2025
fa697f5
Fix MPT
hmellor Jul 30, 2025
24bb2c4
Exaone is a remote model
hmellor Jul 30, 2025
ecebd0c
Fix solar
hmellor Jul 30, 2025
ad38ae2
Fix telechat
hmellor Jul 30, 2025
d439137
Fix skywork
hmellor Jul 30, 2025
1dcf9f4
Fix hunyuan
hmellor Jul 30, 2025
30bdcde
spaces
hmellor Jul 30, 2025
3ce689f
Merge branch 'main' into update-transformers-4-54
DarkLight1337 Jul 30, 2025
36621f4
Drop `min_transformers_version="4.53"`
DarkLight1337 Jul 30, 2025
c305846
Fix duplicated code
DarkLight1337 Jul 30, 2025
59cd39e
Revert telechat2 to how it is on main
hmellor Jul 30, 2025
0af4810
Revert public method as it's too brittle to use for our purposes righ…
hmellor Jul 30, 2025
f286825
Merge branch 'main' into update-transformers-4-54
hmellor Jul 31, 2025
a6d56b9
Merge branch 'main' into update-transformers-4-54
hmellor Aug 1, 2025
1c2570d
Merge branch 'main' into update-transformers-4-54
hmellor Aug 4, 2025
d5ab6f9
fix ovis
Isotr0py Aug 4, 2025
ce19024
fix tarsier2 processing
Isotr0py Aug 4, 2025
7f32eb6
Fix type hint in `replace_linear_class`
hmellor Aug 4, 2025
462ebc7
Fix `tp_plan` retrieval in Transformers backend
hmellor Aug 4, 2025
3d9754a
Fix basic Models Test
hmellor Aug 4, 2025
800edac
Fix pipeline parallel test
hmellor Aug 4, 2025
afe9f80
Handle `base_model_tp_plan` being explicitly `None`
hmellor Aug 5, 2025
d819ce4
Cap transformers version for custom models which are now broken
hmellor Aug 5, 2025
ba55881
Merge branch 'main' into update-transformers-4-54
hmellor Aug 5, 2025
f306e75
disable fuyu temporarily
Isotr0py Aug 5, 2025
bfa2f3f
syntax error
hmellor Aug 5, 2025
05240fd
Fix quantization tests
hmellor Aug 5, 2025
3b61cf7
Update to 4.55
hmellor Aug 5, 2025
4104f9d
Remove 4.55 min version as that's what we're on now
hmellor Aug 5, 2025
0a6ff09
Merge branch 'main' into update-transformers-4-54
WoosukKwon Aug 6, 2025
2 changes: 1 addition & 1 deletion requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
-transformers >= 4.53.2
+transformers >= 4.55.0
huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
2 changes: 1 addition & 1 deletion requirements/test.in
@@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.53.2
+transformers==4.55.0
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
6 changes: 3 additions & 3 deletions requirements/test.txt
@@ -214,7 +214,7 @@ fiona==1.10.1
# via torchgeo
flask==3.1.1
# via mlflow
-fonttools==4.54.1
+fonttools==4.55.0
# via matplotlib
fqdn==1.5.1
# via jsonschema
@@ -286,7 +286,7 @@ httpx==0.27.2
# via
# -r requirements/test.in
# schemathesis
-huggingface-hub==0.33.1
+huggingface-hub==0.34.3
# via
# -r requirements/test.in
# accelerate
@@ -1148,7 +1148,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
-transformers==4.53.2
+transformers==4.55.0
# via
# -r requirements/test.in
# genai-perf
4 changes: 4 additions & 0 deletions tests/models/multimodal/generation/test_common.py
@@ -337,6 +337,10 @@
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+# FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
+# should enable this again after the fix is released:
+# https://github.com/huggingface/transformers/pull/39915
+marks=[pytest.mark.skip("HF model is broken")],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
24 changes: 15 additions & 9 deletions tests/models/registry.py
@@ -179,8 +179,7 @@ def check_available_online(
min_transformers_version="4.54"),
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
min_transformers_version="4.53"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
@@ -223,7 +222,10 @@ def check_available_online(
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
extras={
"tiny": "ai21labs/Jamba-tiny-dev",
"random": "ai21labs/Jamba-tiny-random", # noqa: E501
}),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501
"hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501
@@ -239,8 +241,7 @@
trust_remote_code=True),
"MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
trust_remote_code=True),
"MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf",
min_transformers_version="4.53"),
"MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
"MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01",
trust_remote_code=True,
revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501
@@ -272,6 +273,8 @@ def check_available_online(
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
max_transformers_version="4.53",
transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501
trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True),
@@ -299,8 +302,7 @@
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst",
min_transformers_version="4.53"),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
@@ -326,8 +328,12 @@
"NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
trust_remote_code=True, v0_only=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501
Comment on lines +331 to +336

Collaborator:

My point is that vLLM correctly loads and executes these models, but when performing accuracy tests against HF it requires the HF implementation from Transformers 4.53.

@hmellor (Member, Author), Aug 5, 2025:

I understand, but vLLM's CI is not set up to arbitrarily change dependency versions at test time for reference models which are unmaintained.

The best course of action would be to make PRs to Qwen/Qwen2.5-Math-RM-72B and Qwen/Qwen2.5-Math-PRM-7B to fix them, but we're not going to block upgrading the Transformers pin waiting for those.

"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501
4 changes: 4 additions & 0 deletions tests/quantization/test_experts_int8.py
@@ -9,6 +9,8 @@

from tests.quantization.utils import is_quant_method_supported

+from ..models.registry import HF_EXAMPLE_MODELS
+
MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]


@@ -25,6 +27,8 @@ def test_model_experts_int8_startup(
dtype: str,
max_tokens: int,
) -> None:
+model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+model_info.check_transformers_version(on_fail="skip")

with vllm_runner(model, dtype=dtype,
quantization="experts_int8") as vllm_model:
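The review thread above and this test change lean on the same mechanism: registry entries now carry `max_transformers_version` and `transformers_version_reason`, and tests call `check_transformers_version(on_fail="skip")` before spinning up the engine. A rough sketch of how such a gate can work is below; only the field and method names come from this PR, while the class body and version comparison are illustrative assumptions.

```python
# Illustrative sketch only: the real _HfExamplesInfo lives in
# tests/models/registry.py and may differ in its details.
from dataclasses import dataclass
from typing import Optional

import pytest
from packaging.version import Version
from transformers import __version__ as tf_version


@dataclass
class ExampleInfoSketch:
    model_id: str
    max_transformers_version: Optional[str] = None
    transformers_version_reason: Optional[str] = None

    def check_transformers_version(self, *, on_fail: str = "error") -> None:
        """Skip or error when the installed Transformers is newer than allowed."""
        max_ver = self.max_transformers_version
        if max_ver is None or Version(tf_version) <= Version(max_ver):
            return
        msg = (f"{self.model_id} requires transformers<={max_ver}: "
               f"{self.transformers_version_reason}")
        if on_fail == "skip":
            pytest.skip(msg)
        raise RuntimeError(msg)


def test_qwen_reward_model_sketch():
    # Mirrors the entry added to tests/models/registry.py in this diff.
    info = ExampleInfoSketch(
        "Qwen/Qwen2.5-Math-RM-72B",
        max_transformers_version="4.53",
        transformers_version_reason=(
            "HF model uses remote code that is not compatible with latest "
            "Transformers"),
    )
    info.check_transformers_version(on_fail="skip")
    # ... the rest of the test only runs on transformers <= 4.53
```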
12 changes: 8 additions & 4 deletions vllm/model_executor/models/interfaces_base.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
+from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
Union, overload, runtime_checkable)

import torch
@@ -14,6 +14,10 @@
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import Pooler
from vllm.model_executor.sampling_metadata import SamplingMetadata
+else:
+VllmConfig = Any
+Pooler = Any
+SamplingMetadata = Any

logger = init_logger(__name__)

@@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]):

def __init__(
self,
vllm_config: "VllmConfig",
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
...
@@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
def compute_logits(
self,
hidden_states: T,
sampling_metadata: "SamplingMetadata",
sampling_metadata: SamplingMetadata,
) -> Optional[T]:
"""Return `None` if TP rank > 0."""
...
@@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
MRO of your model class.
"""

pooler: "Pooler"
pooler: Pooler
"""The pooler is only called on TP rank 0."""


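The quoted annotations above could be dropped because of the new `else` branch: when `TYPE_CHECKING` is false, `VllmConfig`, `Pooler` and `SamplingMetadata` are bound to `Any`, so the now-unquoted annotations still evaluate at import time. A minimal, self-contained sketch of the pattern follows; `some_heavy_package` and `HeavyConfig` are made-up stand-ins, not vLLM names.

```python
# Minimal sketch of the TYPE_CHECKING + Any fallback used in interfaces_base.py.
from typing import TYPE_CHECKING, Any, get_type_hints

if TYPE_CHECKING:
    from some_heavy_package import HeavyConfig  # only type checkers import this
else:
    HeavyConfig = Any  # runtime fallback so the bare name still resolves


class Model:
    # Without the fallback, this unquoted annotation would raise NameError
    # at class-definition time because HeavyConfig would be undefined.
    def __init__(self, config: HeavyConfig, prefix: str = "") -> None:
        self.config = config
        self.prefix = prefix


print(get_type_hints(Model.__init__)["config"])  # typing.Any at runtime
```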
11 changes: 6 additions & 5 deletions vllm/model_executor/models/qwen2_vl.py
@@ -1395,11 +1395,12 @@ def __init__(
**kwargs,
):
self.image_processor = Tarsier2ImageProcessor(**vision_config)
-super().__init__(image_processor=self.image_processor,
-tokenizer=tokenizer,
-video_processor=Qwen2VLVideoProcessor(),
-chat_template=None,
-**kwargs)
+super().__init__(
+image_processor=self.image_processor,
+tokenizer=tokenizer,
+video_processor=Qwen2VLVideoProcessor(**vision_config),
+chat_template=None,
+**kwargs)


class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
17 changes: 10 additions & 7 deletions vllm/model_executor/models/transformers.py
@@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
def replace_linear_class(
linear: nn.Linear, style: Literal["colwise", "rowwise"],
quant_config: QuantizationConfig
-) -> Union[ColumnParallelLinear, RowParallelLinear]:
+) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
"""
Replace nn.Linear with one of vLLM's tensor parallel linear classes.

@@ -445,7 +445,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

# Set correct attn and init on "meta" to delay allocating GPU tensors
# TODO: @raushan, use the public `model.set_attn_implementation()`
-# method after v4.54.0 is released
+# method once its checks are fixed in Transformers.
self.text_config._attn_implementation = "vllm"
with init_on_device_without_buffers("meta"), config_override:
self.model: PreTrainedModel = AutoModel.from_config(
@@ -520,7 +520,7 @@ def pipeline_parallel(self):
for i in range(len(layers)):
if start_layer <= i and i < end_layer:
continue
-layers[i] = PPMissingLayer(return_tuple=True)
+layers[i] = PPMissingLayer()

# Layers after module list
for name in pp_plan[module_list_idx + 1:]:
@@ -533,14 +533,16 @@ def tensor_parallel(self):
Apply the model's tensor parallelization plan.
Currently only supports linear layers.
"""
-if not self.model.supports_tp_plan:
-if self.tp_size <= 1:
-return
+tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}

+if not tp_plan and self.tp_size > 1:
raise ValueError(
f"{type(self.model)} does not support tensor parallel yet!")

-tp_plan = self.model._tp_plan
+# Some weight loaders expect linear layers to inherit from vLLM's
+# LinearBase class, so we set a default style which causes any
+# unspecified linear layers to be replaced with ReplicatedLinear
+tp_plan[".*"] = "replicated"

def _tensor_parallel(module: nn.Module, prefix: str = ""):
for child_name, child_module in module.named_children():
@@ -552,6 +554,7 @@ def _tensor_parallel(module: nn.Module, prefix: str = ""):
child_module, style, self.quant_config)
setattr(module, child_name, new_module)
log_replacement(qual_name, child_module, new_module)
+break
else:
_tensor_parallel(child_module, prefix=qual_name)

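The reworked `tensor_parallel` reads `base_model_tp_plan` from the config (tolerating an explicit `None`), appends a catch-all `".*": "replicated"` entry, and then walks the module tree replacing each `nn.Linear` whose qualified name matches a plan entry, with the new `break` making the first matching rule win. A standalone sketch of that matching loop is below; the plan keys and `replace_linear_sketch` are illustrative placeholders, not vLLM's `replace_linear_class` or a real model's plan.

```python
# Standalone sketch of applying a regex-keyed tensor-parallel plan.
import re

import torch.nn as nn

tp_plan = {
    r"layers\.\d+\.q_proj": "colwise",
    r"layers\.\d+\.o_proj": "rowwise",
    r".*": "replicated",  # fallback for any linear layer not named in the plan
}


def replace_linear_sketch(linear: nn.Linear, style: str) -> nn.Module:
    # Placeholder: vLLM would build ColumnParallelLinear, RowParallelLinear
    # or ReplicatedLinear here depending on `style`.
    print(f"would replace with {style}")
    return linear


def apply_tp_plan(module: nn.Module, prefix: str = "") -> None:
    for child_name, child_module in module.named_children():
        qual_name = f"{prefix}.{child_name}" if prefix else child_name
        if isinstance(child_module, nn.Linear):
            for pattern, style in tp_plan.items():
                if re.fullmatch(pattern, qual_name):
                    setattr(module, child_name,
                            replace_linear_sketch(child_module, style))
                    break  # first matching rule wins, mirroring the new `break`
        else:
            apply_tp_plan(child_module, prefix=qual_name)


# Tiny usage example with hypothetical module names matching the plan above.
class TinyBlock(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.q_proj = nn.Linear(8, 8)
        self.o_proj = nn.Linear(8, 8)


class TinyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.layers = nn.ModuleList(TinyBlock() for _ in range(2))
        self.lm_head = nn.Linear(8, 8)


apply_tp_plan(TinyModel())  # q/o projections get colwise/rowwise, lm_head replicated
```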
10 changes: 2 additions & 8 deletions vllm/model_executor/models/utils.py
@@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity):

def __init__(self, *args, **kwargs):
super().__init__()
-self.return_tuple = kwargs.get("return_tuple", False)

def forward(self, *args, **kwargs):
"""
Return the first arg from args or the first value from kwargs.

Wraps the input in a tuple if `self.return_tuple` is True.
"""
input = args[0] if args else next(iter(kwargs.values()))
return (input, ) if self.return_tuple else input
"""Return the first arg from args or the first value from kwargs."""
return args[0] if args else next(iter(kwargs.values()))


_CPU_OFFLOAD_BYTES = 0
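With `return_tuple` gone, `PPMissingLayer` is again a plain identity placeholder for decoder layers owned by other pipeline-parallel ranks (see the `pipeline_parallel` hunk above). A toy sketch of how a stage's layer list might be populated follows; the layer counts and `nn.Linear` stand-ins are hypothetical, while the `PPMissingLayer` body matches the diff.

```python
# Toy sketch: placeholder layers for pipeline stages this rank does not own.
import torch
import torch.nn as nn


class PPMissingLayer(nn.Identity):
    """Identity placeholder, as defined in vllm/model_executor/models/utils.py."""

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, *args, **kwargs):
        """Return the first arg from args or the first value from kwargs."""
        return args[0] if args else next(iter(kwargs.values()))


# Hypothetical 4-layer model where this rank owns layers [0, 2).
num_layers, start_layer, end_layer = 4, 0, 2
layers = nn.ModuleList([
    nn.Linear(8, 8) if start_layer <= i < end_layer else PPMissingLayer()
    for i in range(num_layers)
])

hidden_states = torch.randn(1, 8)
for layer in layers:
    # Placeholder layers simply pass the hidden states through unchanged.
    hidden_states = layer(hidden_states)
print(hidden_states.shape)  # torch.Size([1, 8])
```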
4 changes: 3 additions & 1 deletion vllm/transformers_utils/config.py
@@ -35,7 +35,8 @@
MllamaConfig, MLPSpeculatorConfig,
Nemotron_Nano_VL_Config,
NemotronConfig, NVLM_D_Config,
-RWConfig, SpeculatorsConfig,
+OvisConfig, RWConfig,
+SpeculatorsConfig,
Step3TextConfig, Step3VLConfig,
UltravoxConfig)
# yapf: enable
@@ -85,6 +86,7 @@ def _get_hf_token() -> Optional[str]:
"speculators": SpeculatorsConfig,
"nemotron": NemotronConfig,
"NVLM_D": NVLM_D_Config,
"ovis": OvisConfig,
"ultravox": UltravoxConfig,
"step3_vl": Step3VLConfig,
"step3_text": Step3TextConfig,
2 changes: 2 additions & 0 deletions vllm/transformers_utils/configs/__init__.py
@@ -24,6 +24,7 @@
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
Step3VisionEncoderConfig,
@@ -45,6 +46,7 @@
"NemotronHConfig",
"Nemotron_Nano_VL_Config",
"NVLM_D_Config",
"OvisConfig",
"SpeculatorsConfig",
"UltravoxConfig",
"Step3VLConfig",