diff --git a/tests/models/registry.py b/tests/models/registry.py
index 00fe99980500..449e188820d4 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -886,27 +886,27 @@ def check_available_online(
 _TRANSFORMERS_BACKEND_MODELS = {
     "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
     ),
     "TransformersForSequenceClassification": _HfExamplesInfo(
         "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
     ),
     "TransformersForCausalLM": _HfExamplesInfo(
         "hmellor/Ilama-3.2-1B", trust_remote_code=True
     ),
     "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
     "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
     "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 06e51df32d18..a18f5b607763 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -82,7 +82,7 @@ def test_models(
     from packaging.version import Version
 
     installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
     if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
         pytest.skip(
             "MoE models with the Transformers backend require "
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 41d170c9e139..eb992f7bec72 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -28,6 +28,7 @@
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
@@ -317,7 +318,7 @@ def create_attention_instances(self) -> dict[int, Attention]:
         # vLLM does not support encoder-decoder models, so if any encoder layer is
         # found in a text only model, we assume the whole model is an encoder model
         if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("4.57.0.dev0", "encoder models support")
+            self.check_version("5.0.0.dev0", "encoder models support")
             attn_type = AttentionType.ENCODER_ONLY
         else:
             attn_type = AttentionType.DECODER
@@ -336,7 +337,12 @@ def create_attention_instances(self) -> dict[int, Attention]:
             ):
                 per_layer_sliding_window = self.config.sliding_window
 
-            attention_instances[i] = Attention(
+            attn_cls = (
+                EncoderOnlyAttention
+                if attn_type == AttentionType.ENCODER_ONLY
+                else Attention
+            )
+            attention_instances[i] = attn_cls(
                 num_heads=num_heads,
                 head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 5de786f99580..2056ebeb1086 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -115,7 +115,7 @@ def transformers_moe_forward_fake(
 
 class MoEMixin(MixtureOfExperts):
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("4.57.0.dev0", "MoE models support")
+        self.check_version("5.0.0.dev0", "MoE models support")
         # Skip MixtureOfExperts.__init__ and call the next class in MRO
         super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
 