
Commit 0ff05e3

Isotr0py and hmellor authored
[Bugfix] Fix encoder-only model support for transformers backend (#28021)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent 428bc7b commit 0ff05e3

File tree: 4 files changed, +16 -10 lines


tests/models/registry.py

Lines changed: 6 additions & 6 deletions
@@ -899,27 +899,27 @@ def check_available_online(
 
 _TRANSFORMERS_BACKEND_MODELS = {
     "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
     ),
     "TransformersForSequenceClassification": _HfExamplesInfo(
         "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
     ),
     "TransformersForCausalLM": _HfExamplesInfo(
         "hmellor/Ilama-3.2-1B", trust_remote_code=True
     ),
     "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
     "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
     "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(

tests/models/test_transformers.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def test_models(
     from packaging.version import Version
 
     installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
     if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
         pytest.skip(
             "MoE models with the Transformers backend require "

vllm/model_executor/models/transformers/base.py

Lines changed: 8 additions & 2 deletions
@@ -28,6 +28,7 @@
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
@@ -317,7 +318,7 @@ def create_attention_instances(self) -> dict[int, Attention]:
         # vLLM does not support encoder-decoder models, so if any encoder layer is
         # found in a text only model, we assume the whole model is an encoder model
         if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("4.57.0.dev0", "encoder models support")
+            self.check_version("5.0.0.dev0", "encoder models support")
             attn_type = AttentionType.ENCODER_ONLY
         else:
             attn_type = AttentionType.DECODER
@@ -336,7 +337,12 @@ def create_attention_instances(self) -> dict[int, Attention]:
            ):
                per_layer_sliding_window = self.config.sliding_window
 
-            attention_instances[i] = Attention(
+            attn_cls = (
+                EncoderOnlyAttention
+                if attn_type == AttentionType.ENCODER_ONLY
+                else Attention
+            )
+            attention_instances[i] = attn_cls(
                 num_heads=num_heads,
                 head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
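
Taken together, these changes route layers of encoder-only text models through EncoderOnlyAttention instead of the decoder-oriented Attention class. Below is a minimal usage sketch of what this enables with the Transformers backend; the engine arguments shown (model_impl, runner) and the embed() call follow vLLM's documented API but are assumptions relative to this commit, so check the docs for your version:

from vllm import LLM

# BAAI/bge-base-en-v1.5 is a BERT-style, encoder-only embedding model.
# model_impl="transformers" forces the Transformers backend and
# runner="pooling" selects the embedding/pooling path (both assumed kwargs).
llm = LLM(
    model="BAAI/bge-base-en-v1.5",
    model_impl="transformers",
    runner="pooling",
)

outputs = llm.embed(["Encoder-only models now run on the Transformers backend."])
print(len(outputs[0].outputs.embedding))  # embedding dimension of the model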

vllm/model_executor/models/transformers/moe.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def transformers_moe_forward_fake(
 
 class MoEMixin(MixtureOfExperts):
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("4.57.0.dev0", "MoE models support")
+        self.check_version("5.0.0.dev0", "MoE models support")
         # Skip MixtureOfExperts.__init__ and call the next class in MRO
         super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
 
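
The check_version calls updated above gate backend features on the installed Transformers version. vLLM's real helper lives on the Transformers-backend base class; the standalone sketch below only illustrates the idea, with a hypothetical function name and error message:

from packaging.version import Version

import transformers


def check_version(min_version: str, feature: str) -> None:
    # Illustrative stand-in for the backend's version gate, not vLLM's code.
    installed = Version(transformers.__version__)
    if installed < Version(min_version):
        raise ImportError(
            f"{feature} requires transformers>={min_version}, "
            f"but transformers=={installed} is installed."
        )


# After this commit, MoE and encoder-only support require a 5.0.0 dev build:
check_version("5.0.0.dev0", "MoE models support")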