Merged

Commits (32)
a742162
Update transformers to `v4.54.1`
hmellor Jul 30, 2025
97d7f25
Use public method to set attn implementation in Transformers backend
hmellor Jul 30, 2025
fa697f5
Fix MPT
hmellor Jul 30, 2025
24bb2c4
Exaone is a remote model
hmellor Jul 30, 2025
ecebd0c
Fix solar
hmellor Jul 30, 2025
ad38ae2
Fix telechat
hmellor Jul 30, 2025
d439137
Fix skywork
hmellor Jul 30, 2025
1dcf9f4
Fix hunyuan
hmellor Jul 30, 2025
30bdcde
spaces
hmellor Jul 30, 2025
3ce689f
Merge branch 'main' into update-transformers-4-54
DarkLight1337 Jul 30, 2025
36621f4
Drop `min_transformers_version="4.53"`
DarkLight1337 Jul 30, 2025
c305846
Fix duplicated code
DarkLight1337 Jul 30, 2025
59cd39e
Revert telechat2 to how it is on main
hmellor Jul 30, 2025
0af4810
Revert public method as it's too brittle to use for our purposes righ…
hmellor Jul 30, 2025
f286825
Merge branch 'main' into update-transformers-4-54
hmellor Jul 31, 2025
a6d56b9
Merge branch 'main' into update-transformers-4-54
hmellor Aug 1, 2025
1c2570d
Merge branch 'main' into update-transformers-4-54
hmellor Aug 4, 2025
d5ab6f9
fix ovis
Isotr0py Aug 4, 2025
ce19024
fix tarsier2 processing
Isotr0py Aug 4, 2025
7f32eb6
Fix type hint in `replace_linear_class`
hmellor Aug 4, 2025
462ebc7
Fix `tp_plan` retrieval in Transformers backend
hmellor Aug 4, 2025
3d9754a
Fix basic Models Test
hmellor Aug 4, 2025
800edac
Fix pipeline parallel test
hmellor Aug 4, 2025
afe9f80
Handle `base_model_tp_plan` being explicitly `None`
hmellor Aug 5, 2025
d819ce4
Cap transformers version for custom models which are now broken
hmellor Aug 5, 2025
ba55881
Merge branch 'main' into update-transformers-4-54
hmellor Aug 5, 2025
f306e75
disable fuyu temporarily
Isotr0py Aug 5, 2025
bfa2f3f
syntax error
hmellor Aug 5, 2025
05240fd
Fix quantization tests
hmellor Aug 5, 2025
3b61cf7
Update to 4.55
hmellor Aug 5, 2025
4104f9d
Remove 4.55 min version as that's what we're on now
hmellor Aug 5, 2025
0a6ff09
Merge branch 'main' into update-transformers-4-54
WoosukKwon Aug 6, 2025
2 changes: 1 addition & 1 deletion requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
-transformers >= 4.53.2
+transformers >= 4.55.0
huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
2 changes: 1 addition & 1 deletion requirements/test.in
@@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.53.2
+transformers==4.55.0
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
6 changes: 3 additions & 3 deletions requirements/test.txt
@@ -214,7 +214,7 @@ fiona==1.10.1
# via torchgeo
flask==3.1.1
# via mlflow
-fonttools==4.54.1
+fonttools==4.55.0
# via matplotlib
fqdn==1.5.1
# via jsonschema
@@ -286,7 +286,7 @@ httpx==0.27.2
# via
# -r requirements/test.in
# schemathesis
-huggingface-hub==0.33.1
+huggingface-hub==0.34.3
# via
# -r requirements/test.in
# accelerate
@@ -1148,7 +1148,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
-transformers==4.53.2
+transformers==4.55.0
# via
# -r requirements/test.in
# genai-perf
4 changes: 4 additions & 0 deletions tests/models/multimodal/generation/test_common.py
@@ -337,6 +337,10 @@
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+# FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
+# should enable this again after the fix is released:
+# https://github.com/huggingface/transformers/pull/39915
+marks=[pytest.mark.skip("HF model is broken")],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
24 changes: 15 additions & 9 deletions tests/models/registry.py
@@ -179,8 +179,7 @@ def check_available_online(
min_transformers_version="4.54"),
"Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base",
min_transformers_version="4.53"),
"FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
@@ -223,7 +222,10 @@ def check_available_online(
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
extras={
"tiny": "ai21labs/Jamba-tiny-dev",
"random": "ai21labs/Jamba-tiny-random", # noqa: E501
}),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501
"hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501
@@ -239,8 +241,7 @@
trust_remote_code=True),
"MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
trust_remote_code=True),
"MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf",
min_transformers_version="4.53"),
"MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
"MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01",
trust_remote_code=True,
revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501
@@ -272,6 +273,8 @@ def check_available_online(
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True),
"Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
max_transformers_version="4.53",
transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501
trust_remote_code=True),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True),
@@ -299,8 +302,7 @@
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
trust_remote_code=True),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst",
min_transformers_version="4.53"),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
@@ -326,8 +328,12 @@
"NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe",
trust_remote_code=True, v0_only=True), # noqa: E501
"Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"),
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"),
"Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B",
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501
"Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53",
transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501
Comment on lines +331 to +336

Collaborator:

My point is that vLLM correctly loads and executes these models, but when performing accuracy tests against HF it requires the HF implementation from Transformers 4.53.

@hmellor (Member, Author), Aug 5, 2025:

I understand, but vLLM's CI is not set up to arbitrarily change dependency versions at test time for reference models which are unmaintained.

The best course of action would be to make PRs to Qwen/Qwen2.5-Math-RM-72B and Qwen/Qwen2.5-Math-PRM-7B to fix them, but we're not going to block upgrading the Transformers pin waiting for those.

"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501
4 changes: 4 additions & 0 deletions tests/quantization/test_experts_int8.py
@@ -9,6 +9,8 @@

from tests.quantization.utils import is_quant_method_supported

+from ..models.registry import HF_EXAMPLE_MODELS
+
MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]


@@ -25,6 +27,8 @@ def test_model_experts_int8_startup(
dtype: str,
max_tokens: int,
) -> None:
+model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+model_info.check_transformers_version(on_fail="skip")

with vllm_runner(model, dtype=dtype,
quantization="experts_int8") as vllm_model:
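The review thread above and this test change lean on the same mechanism: registry entries now carry `max_transformers_version` and `transformers_version_reason`, and tests call `check_transformers_version(on_fail="skip")` before spinning up the engine. A rough sketch of how such a gate can work is below; only the field and method names come from this PR, while the class body and version comparison are illustrative assumptions.

```python
# Illustrative sketch only: the real _HfExamplesInfo lives in
# tests/models/registry.py and may differ in its details.
from dataclasses import dataclass
from typing import Optional

import pytest
from packaging.version import Version
from transformers import __version__ as tf_version


@dataclass
class ExampleInfoSketch:
    model_id: str
    max_transformers_version: Optional[str] = None
    transformers_version_reason: Optional[str] = None

    def check_transformers_version(self, *, on_fail: str = "error") -> None:
        """Skip or error when the installed Transformers is newer than allowed."""
        max_ver = self.max_transformers_version
        if max_ver is None or Version(tf_version) <= Version(max_ver):
            return
        msg = (f"{self.model_id} requires transformers<={max_ver}: "
               f"{self.transformers_version_reason}")
        if on_fail == "skip":
            pytest.skip(msg)
        raise RuntimeError(msg)


def test_qwen_reward_model_sketch():
    # Mirrors the entry added to tests/models/registry.py in this diff.
    info = ExampleInfoSketch(
        "Qwen/Qwen2.5-Math-RM-72B",
        max_transformers_version="4.53",
        transformers_version_reason=(
            "HF model uses remote code that is not compatible with latest "
            "Transformers"),
    )
    info.check_transformers_version(on_fail="skip")
    # ... the rest of the test only runs on transformers <= 4.53
```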
12 changes: 8 additions & 4 deletions vllm/model_executor/models/interfaces_base.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
+from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
Union, overload, runtime_checkable)

import torch
@@ -14,6 +14,10 @@
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import Pooler
from vllm.model_executor.sampling_metadata import SamplingMetadata
+else:
+VllmConfig = Any
+Pooler = Any
+SamplingMetadata = Any

logger = init_logger(__name__)

@@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]):

def __init__(
self,
vllm_config: "VllmConfig",
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
...
@@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
def compute_logits(
self,
hidden_states: T,
sampling_metadata: "SamplingMetadata",
sampling_metadata: SamplingMetadata,
) -> Optional[T]:
"""Return `None` if TP rank > 0."""
...
@@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
MRO of your model class.
"""

pooler: "Pooler"
pooler: Pooler
"""The pooler is only called on TP rank 0."""


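The quoted annotations above could be dropped because of the new `else` branch: when `TYPE_CHECKING` is false, `VllmConfig`, `Pooler` and `SamplingMetadata` are bound to `Any`, so the now-unquoted annotations still evaluate at import time. A minimal, self-contained sketch of the pattern follows; `some_heavy_package` and `HeavyConfig` are made-up stand-ins, not vLLM names.

```python
# Minimal sketch of the TYPE_CHECKING + Any fallback used in interfaces_base.py.
from typing import TYPE_CHECKING, Any, get_type_hints

if TYPE_CHECKING:
    from some_heavy_package import HeavyConfig  # only type checkers import this
else:
    HeavyConfig = Any  # runtime fallback so the bare name still resolves


class Model:
    # Without the fallback, this unquoted annotation would raise NameError
    # at class-definition time because HeavyConfig would be undefined.
    def __init__(self, config: HeavyConfig, prefix: str = "") -> None:
        self.config = config
        self.prefix = prefix


print(get_type_hints(Model.__init__)["config"])  # typing.Any at runtime
```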
11 changes: 6 additions & 5 deletions vllm/model_executor/models/qwen2_vl.py
@@ -1395,11 +1395,12 @@ def __init__(
**kwargs,
):
self.image_processor = Tarsier2ImageProcessor(**vision_config)
-super().__init__(image_processor=self.image_processor,
-tokenizer=tokenizer,
-video_processor=Qwen2VLVideoProcessor(),
-chat_template=None,
-**kwargs)
+super().__init__(
+image_processor=self.image_processor,
+tokenizer=tokenizer,
+video_processor=Qwen2VLVideoProcessor(**vision_config),
+chat_template=None,
+**kwargs)


class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
17 changes: 10 additions & 7 deletions vllm/model_executor/models/transformers.py
@@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
def replace_linear_class(
linear: nn.Linear, style: Literal["colwise", "rowwise"],
quant_config: QuantizationConfig
-) -> Union[ColumnParallelLinear, RowParallelLinear]:
+) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
"""
Replace nn.Linear with one of vLLM's tensor parallel linear classes.

@@ -445,7 +445,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

# Set correct attn and init on "meta" to delay allocating GPU tensors
# TODO: @raushan, use the public `model.set_attn_implementation()`
-# method after v4.54.0 is released
+# method once its checks are fixed in Transformers.
self.text_config._attn_implementation = "vllm"
with init_on_device_without_buffers("meta"), config_override:
self.model: PreTrainedModel = AutoModel.from_config(
@@ -520,7 +520,7 @@ def pipeline_parallel(self):
for i in range(len(layers)):
if start_layer <= i and i < end_layer:
continue
-layers[i] = PPMissingLayer(return_tuple=True)
+layers[i] = PPMissingLayer()

# Layers after module list
for name in pp_plan[module_list_idx + 1:]:
@@ -533,14 +533,16 @@ def tensor_parallel(self):
Apply the model's tensor parallelization plan.
Currently only supports linear layers.
"""
-if not self.model.supports_tp_plan:
-if self.tp_size <= 1:
-return
+tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}

+if not tp_plan and self.tp_size > 1:
raise ValueError(
f"{type(self.model)} does not support tensor parallel yet!")

-tp_plan = self.model._tp_plan
+# Some weight loaders expect linear layers to inherit from vLLM's
+# LinearBase class, so we set a default style which causes any
+# unspecified linear layers to be replaced with ReplicatedLinear
+tp_plan[".*"] = "replicated"

def _tensor_parallel(module: nn.Module, prefix: str = ""):
for child_name, child_module in module.named_children():
@@ -552,6 +554,7 @@ def _tensor_parallel(module: nn.Module, prefix: str = ""):
child_module, style, self.quant_config)
setattr(module, child_name, new_module)
log_replacement(qual_name, child_module, new_module)
+break
else:
_tensor_parallel(child_module, prefix=qual_name)

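The reworked `tensor_parallel` reads `base_model_tp_plan` from the config (tolerating an explicit `None`), appends a catch-all `".*": "replicated"` entry, and then walks the module tree replacing each `nn.Linear` whose qualified name matches a plan entry, with the new `break` making the first matching rule win. A standalone sketch of that matching loop is below; the plan keys and `replace_linear_sketch` are illustrative placeholders, not vLLM's `replace_linear_class` or a real model's plan.

```python
# Standalone sketch of applying a regex-keyed tensor-parallel plan.
import re

import torch.nn as nn

tp_plan = {
    r"layers\.\d+\.q_proj": "colwise",
    r"layers\.\d+\.o_proj": "rowwise",
    r".*": "replicated",  # fallback for any linear layer not named in the plan
}


def replace_linear_sketch(linear: nn.Linear, style: str) -> nn.Module:
    # Placeholder: vLLM would build ColumnParallelLinear, RowParallelLinear
    # or ReplicatedLinear here depending on `style`.
    print(f"would replace with {style}")
    return linear


def apply_tp_plan(module: nn.Module, prefix: str = "") -> None:
    for child_name, child_module in module.named_children():
        qual_name = f"{prefix}.{child_name}" if prefix else child_name
        if isinstance(child_module, nn.Linear):
            for pattern, style in tp_plan.items():
                if re.fullmatch(pattern, qual_name):
                    setattr(module, child_name,
                            replace_linear_sketch(child_module, style))
                    break  # first matching rule wins, mirroring the new `break`
        else:
            apply_tp_plan(child_module, prefix=qual_name)


# Tiny usage example with hypothetical module names matching the plan above.
class TinyBlock(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.q_proj = nn.Linear(8, 8)
        self.o_proj = nn.Linear(8, 8)


class TinyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.layers = nn.ModuleList(TinyBlock() for _ in range(2))
        self.lm_head = nn.Linear(8, 8)


apply_tp_plan(TinyModel())  # q/o projections get colwise/rowwise, lm_head replicated
```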
10 changes: 2 additions & 8 deletions vllm/model_executor/models/utils.py
@@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity):

def __init__(self, *args, **kwargs):
super().__init__()
-self.return_tuple = kwargs.get("return_tuple", False)

def forward(self, *args, **kwargs):
"""
Return the first arg from args or the first value from kwargs.

Wraps the input in a tuple if `self.return_tuple` is True.
"""
input = args[0] if args else next(iter(kwargs.values()))
return (input, ) if self.return_tuple else input
"""Return the first arg from args or the first value from kwargs."""
return args[0] if args else next(iter(kwargs.values()))


_CPU_OFFLOAD_BYTES = 0
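With `return_tuple` gone, `PPMissingLayer` is again a plain identity placeholder for decoder layers owned by other pipeline-parallel ranks (see the `pipeline_parallel` hunk above). A toy sketch of how a stage's layer list might be populated follows; the layer counts and `nn.Linear` stand-ins are hypothetical, while the `PPMissingLayer` body matches the diff.

```python
# Toy sketch: placeholder layers for pipeline stages this rank does not own.
import torch
import torch.nn as nn


class PPMissingLayer(nn.Identity):
    """Identity placeholder, as defined in vllm/model_executor/models/utils.py."""

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, *args, **kwargs):
        """Return the first arg from args or the first value from kwargs."""
        return args[0] if args else next(iter(kwargs.values()))


# Hypothetical 4-layer model where this rank owns layers [0, 2).
num_layers, start_layer, end_layer = 4, 0, 2
layers = nn.ModuleList([
    nn.Linear(8, 8) if start_layer <= i < end_layer else PPMissingLayer()
    for i in range(num_layers)
])

hidden_states = torch.randn(1, 8)
for layer in layers:
    # Placeholder layers simply pass the hidden states through unchanged.
    hidden_states = layer(hidden_states)
print(hidden_states.shape)  # torch.Size([1, 8])
```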
4 changes: 3 additions & 1 deletion vllm/transformers_utils/config.py
@@ -35,7 +35,8 @@
MllamaConfig, MLPSpeculatorConfig,
Nemotron_Nano_VL_Config,
NemotronConfig, NVLM_D_Config,
-RWConfig, SpeculatorsConfig,
+OvisConfig, RWConfig,
+SpeculatorsConfig,
Step3TextConfig, Step3VLConfig,
UltravoxConfig)
# yapf: enable
@@ -85,6 +86,7 @@ def _get_hf_token() -> Optional[str]:
"speculators": SpeculatorsConfig,
"nemotron": NemotronConfig,
"NVLM_D": NVLM_D_Config,
"ovis": OvisConfig,
"ultravox": UltravoxConfig,
"step3_vl": Step3VLConfig,
"step3_text": Step3TextConfig,
2 changes: 2 additions & 0 deletions vllm/transformers_utils/configs/__init__.py
@@ -24,6 +24,7 @@
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
+from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
Step3VisionEncoderConfig,
@@ -45,6 +46,7 @@
"NemotronHConfig",
"Nemotron_Nano_VL_Config",
"NVLM_D_Config",
"OvisConfig",
"SpeculatorsConfig",
"UltravoxConfig",
"Step3VLConfig",