12 changes: 6 additions & 6 deletions tests/models/registry.py
@@ -886,27 +886,27 @@ def check_available_online(

 _TRANSFORMERS_BACKEND_MODELS = {
     "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
     ),
     "TransformersForSequenceClassification": _HfExamplesInfo(
         "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
     ),
     "TransformersForCausalLM": _HfExamplesInfo(
         "hmellor/Ilama-3.2-1B", trust_remote_code=True
     ),
     "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
     "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
     "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
2 changes: 1 addition & 1 deletion tests/models/test_transformers.py
@@ -82,7 +82,7 @@ def test_models(
     from packaging.version import Version

     installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
     if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
         pytest.skip(
             "MoE models with the Transformers backend require "
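The move from "5.0.0" to "5.0.0.dev" in these pins follows PEP 440 ordering: a development release sorts before the final release it precedes, so a minimum of "5.0.0" would still skip tests on a Transformers 5.0 dev build. A minimal sketch of the ordering, using only packaging (the version strings are illustrative):

from packaging.version import Version

# PEP 440: a dev release sorts before the corresponding final release.
assert Version("5.0.0.dev0") < Version("5.0.0")

# With the relaxed pin, a hypothetical installed dev build now satisfies the check.
installed = Version("5.0.0.dev0")
required = Version("5.0.0.dev")  # normalizes to 5.0.0.dev0
assert not installed < required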
10 changes: 8 additions & 2 deletions vllm/model_executor/models/transformers/base.py
@@ -28,6 +28,7 @@
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

 from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
@@ -317,7 +318,7 @@ def create_attention_instances(self) -> dict[int, Attention]:
         # vLLM does not support encoder-decoder models, so if any encoder layer is
         # found in a text only model, we assume the whole model is an encoder model
         if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("4.57.0.dev0", "encoder models support")
+            self.check_version("5.0.0.dev0", "encoder models support")
             attn_type = AttentionType.ENCODER_ONLY
         else:
             attn_type = AttentionType.DECODER

@@ -336,7 +337,12 @@
             ):
                 per_layer_sliding_window = self.config.sliding_window

-            attention_instances[i] = Attention(
+            attn_cls = (
NickLucche (Collaborator) commented on Nov 4, 2025:
@heheda12345 do you want to handle it inside the Attention class init to signal deprecation with a warning?

heheda12345 (Collaborator) replied on Nov 4, 2025:
We need to handle it like this now. For the deprecation warning, just add one line of warning in Attention class? (not necessary in this PR)

+                EncoderOnlyAttention
+                if attn_type == AttentionType.ENCODER_ONLY
+                else Attention
+            )
+            attention_instances[i] = attn_cls(
A reviewer (Member) asked:
Why does passing attn_type not work? Are the two not equivalent?

                 num_heads=num_heads,
                 head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
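On the thread above: the deprecation route heheda12345 floats (a warning inside the Attention constructor, rather than only switching classes at the call site) could look roughly like the following. This is a hypothetical sketch, not part of this PR; the constructor signature is simplified and merely stands in for vllm.attention.Attention.

import warnings


class Attention:  # simplified stand-in for vllm.attention.Attention
    def __init__(self, num_heads: int, head_size: int, attn_type: str = "decoder", **kwargs):
        if attn_type == "encoder_only":
            # Hypothetical warning: callers would construct EncoderOnlyAttention
            # directly rather than passing an encoder-only attn_type here.
            warnings.warn(
                "attn_type='encoder_only' is deprecated; use EncoderOnlyAttention instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        self.num_heads = num_heads
        self.head_size = head_size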
2 changes: 1 addition & 1 deletion vllm/model_executor/models/transformers/moe.py
@@ -115,7 +115,7 @@ def transformers_moe_forward_fake(

 class MoEMixin(MixtureOfExperts):
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("4.57.0.dev0", "MoE models support")
+        self.check_version("5.0.0.dev0", "MoE models support")
         # Skip MixtureOfExperts.__init__ and call the next class in MRO
         super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)

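For context, check_version(min_version, feature) gates a Transformers-backend feature on the installed transformers version; the real helper is defined elsewhere in the backend base class, so the following is only an assumed sketch of the pattern, not its actual implementation.

from packaging.version import Version

import transformers


def check_version(min_version: str, feature: str) -> None:
    # Assumed behavior: refuse to enable `feature` on an older Transformers install.
    installed = Version(transformers.__version__)
    if installed < Version(min_version):
        raise ImportError(
            f"{feature} with the Transformers backend requires "
            f"transformers>={min_version}, but {installed} is installed."
        )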