From a742162145d89746eca308131a4677c4fe07f4ed Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 14:58:29 +0200 Subject: [PATCH 01/26] Update transformers to `v4.54.1` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/common.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index d29b3e59d35b..9ea2e888aa79 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.53.2 +transformers >= 4.54.1 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. diff --git a/requirements/test.in b/requirements/test.in index 3c5e3c0204bf..27febd163d3f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.53.2 +transformers==4.54.1 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index d45048aae580..4475bf709b9c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -289,7 +289,7 @@ httpx==0.27.2 # via # -r requirements/test.in # schemathesis -huggingface-hub==0.33.1 +huggingface-hub==0.34.3 # via # -r requirements/test.in # accelerate @@ -1140,7 +1140,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.53.2 +transformers==4.54.1 # via # -r requirements/test.in # genai-perf From 97d7f25f857f0d1eecbf48e5ec25b6989646d7d1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:06:10 +0200 Subject: [PATCH 02/26] Use public method to set attn implementation in Transformers backend Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 8cd95605cdfa..e8691570f15e 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -449,15 +449,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): sliding_window=self.config.interleaved_sliding_window) # Set correct attn and init on "meta" to delay allocating GPU tensors - # TODO: @raushan, use the public `model.set_attn_implementation()` - # method after v4.54.0 is released - self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"), config_override: self.model: PreTrainedModel = AutoModel.from_config( self.config, torch_dtype=self.model_config.dtype, trust_remote_code=self.model_config.trust_remote_code, ) + self.model.set_attn_implementation("vllm") self.pipeline_parallel() self.tensor_parallel() From fa697f5e8d6b411efbf960f35363152452f9d33e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:30:21 +0200 Subject: [PATCH 03/26] Fix MPT Signed-off-by: Harry 
Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/mpt.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index c243f575ae54..8db52a69924c 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig +from transformers import MptConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile @@ -50,7 +50,7 @@ class MPTAttention(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -59,15 +59,15 @@ def __init__( self.d_model = config.d_model self.total_num_heads = config.n_heads self.head_dim = self.d_model // self.total_num_heads - self.clip_qkv = config.attn_config["clip_qkv"] - self.qk_ln = config.attn_config["qk_ln"] - self.alibi_bias_max = config.attn_config["alibi_bias_max"] + self.clip_qkv = config.attn_config.clip_qkv + self.qk_ln = config.attn_config.qk_ln + self.alibi_bias_max = config.attn_config.alibi_bias_max if "kv_n_heads" in config.attn_config: - self.total_num_kv_heads = config.attn_config['kv_n_heads'] + self.total_num_kv_heads = config.attn_config.kv_n_heads else: self.total_num_kv_heads = self.total_num_heads - assert not config.attn_config["prefix_lm"] - assert config.attn_config["alibi"] + assert not config.attn_config.prefix_lm + assert config.attn_config.alibi # pylint: disable=invalid-name self.Wqkv = QKVParallelLinear( @@ -144,7 +144,7 @@ class MPTMLP(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -176,7 +176,7 @@ class MPTBlock(nn.Module): def __init__( self, - config: PretrainedConfig, + config: MptConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", From 24bb2c401843ae736f8515527d83c181d50b6b23 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:39:39 +0200 Subject: [PATCH 04/26] Exaone is a remote model Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index caa691039fce..01ad9da8aa01 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -170,7 +170,8 @@ def check_available_online( min_transformers_version="4.54"), "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", min_transformers_version="4.54"), - "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", + trust_remote_code=True), "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), From ecebd0c9fe9fb86ea07dc71131fe37f3759538f6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:46:44 +0200 Subject: [PATCH 05/26] Fix solar Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 01ad9da8aa01..dc8e1a383e42 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -276,7 +276,8 @@ def check_available_online( "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), # noqa: E501 "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), - "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct", + trust_remote_code=True), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", trust_remote_code=True), "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407", From ad38ae2afdcad614dff56494b42fb9877451ba8b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:59:36 +0200 Subject: [PATCH 06/26] Fix telechat Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/telechat2.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index f0b31b1332fb..85d4e5b3e8bd 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -123,6 +123,18 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): }, ) + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + + vllm_config.model_config.hf_config.attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "intermediate_size": "ffn_hidden_size", + "rms_norm_eps": "layer_norm_epsilon" + } + vllm_config.model_config.hf_config.hidden_act = "silu" + + super().__init__(vllm_config=vllm_config, prefix=prefix) + def _init_model(self, vllm_config: VllmConfig, prefix: str = "", From d439137354b11e92cdf886aa2b0a2ff049e86d63 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:13:38 +0200 Subject: [PATCH 07/26] Fix skywork Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index dc8e1a383e42..7bdce8f27bc4 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -451,7 +451,8 @@ def check_available_online( max_model_len=4096), "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 - "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), + "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B", + trust_remote_code=True), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), From 1dcf9f43d2e2f8960196661d8f77bc3cb7ce9e58 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:15:20 +0200 Subject: [PATCH 08/26] Fix hunyuan Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/tests/models/registry.py b/tests/models/registry.py index 7bdce8f27bc4..815906d06836 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -200,8 +200,10 @@ def check_available_online( trust_remote_code=True), "HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct", trust_remote_code=True), + # TODO: Remove is_available_online once their config.json is fixed "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", - trust_remote_code=True), + trust_remote_code=True, + is_available_online=False), "HCXVisionForCausalLM": _HfExamplesInfo( "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", trust_remote_code=True), From 30bdcde1b745bcd624cd91564c61199d5333f271 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:25:06 +0200 Subject: [PATCH 09/26] spaces Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 815906d06836..3426c09b4d95 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -202,8 +202,8 @@ def check_available_online( trust_remote_code=True), # TODO: Remove is_available_online once their config.json is fixed "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124", - trust_remote_code=True, - is_available_online=False), + trust_remote_code=True, + is_available_online=False), "HCXVisionForCausalLM": _HfExamplesInfo( "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", trust_remote_code=True), From 36621f483aac560afbaa477d43ef6a7c06d27b05 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 30 Jul 2025 16:14:07 +0000 Subject: [PATCH 10/26] Drop `min_transformers_version="4.53"` Signed-off-by: DarkLight1337 --- tests/models/registry.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 8fcff5a8c511..956716a2ab85 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -176,13 +176,11 @@ def check_available_online( min_transformers_version="4.54"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), - "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", - min_transformers_version="4.53"), + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), - "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 - min_transformers_version="4.53"), + "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"), # noqa: E501 "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", @@ -234,8 +232,7 @@ def check_available_online( trust_remote_code=True), "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", trust_remote_code=True), - "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf", - min_transformers_version="4.53"), + "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"), "MiniMaxText01ForCausalLM": 
_HfExamplesInfo("MiniMaxAI/MiniMax-Text-01", trust_remote_code=True, revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"), # noqa: E501 @@ -291,8 +288,7 @@ def check_available_online( "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), - "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst", - min_transformers_version="4.53"), + "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), @@ -374,7 +370,7 @@ def check_available_online( "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 - "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 + "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"), # noqa: E501 "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5", min_transformers_version="4.54", is_available_online=False), # noqa: E501 From c305846e95c391d6b07d10c38cffde9dbbc6367f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 30 Jul 2025 16:15:26 +0000 Subject: [PATCH 11/26] Fix duplicated code Signed-off-by: DarkLight1337 --- vllm/model_executor/models/telechat2.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 109e38dbc9f9..e263391f5690 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -39,14 +39,6 @@ class TeleChat2Model(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): hf_config = vllm_config.model_config.hf_config - vllm_config.model_config.hf_config.attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - "intermediate_size": "ffn_hidden_size", - "rms_norm_eps": "layer_norm_epsilon" - } - vllm_config.model_config.hf_config.hidden_act = "silu" - # 1. Initialize the LlamaModel with bias hf_config.bias = True hf_config.mlp_bias = True From 59cd39e426d86cacd51194531475e0621e11ec8a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 18:53:14 +0200 Subject: [PATCH 12/26] Revert telechat2 to how it is on main Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/telechat2.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index e263391f5690..49a7677151a9 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -39,6 +39,14 @@ class TeleChat2Model(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): hf_config = vllm_config.model_config.hf_config + vllm_config.model_config.hf_config.attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "intermediate_size": "ffn_hidden_size", + "rms_norm_eps": "layer_norm_epsilon" + } + vllm_config.model_config.hf_config.hidden_act = "silu" + # 1. 
Initialize the LlamaModel with bias hf_config.bias = True hf_config.mlp_bias = True @@ -126,18 +134,6 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): }, ) - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - - vllm_config.model_config.hf_config.attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - "intermediate_size": "ffn_hidden_size", - "rms_norm_eps": "layer_norm_epsilon" - } - vllm_config.model_config.hf_config.hidden_act = "silu" - - super().__init__(vllm_config=vllm_config, prefix=prefix) - def _init_model(self, vllm_config: VllmConfig, prefix: str = "", From 0af4810b825ddb52f6bc5fdae48637f144450555 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 30 Jul 2025 19:04:59 +0200 Subject: [PATCH 13/26] Revert public method as it's too brittle to use for our purposes right now Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index e8691570f15e..57849bf0ffb6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -449,13 +449,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): sliding_window=self.config.interleaved_sliding_window) # Set correct attn and init on "meta" to delay allocating GPU tensors + # TODO: @raushan, use the public `model.set_attn_implementation()` + # method once its checks are fixed in Transformers. + self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"), config_override: self.model: PreTrainedModel = AutoModel.from_config( self.config, torch_dtype=self.model_config.dtype, trust_remote_code=self.model_config.trust_remote_code, ) - self.model.set_attn_implementation("vllm") self.pipeline_parallel() self.tensor_parallel() From d5ab6f9399a4bff92c5a81a5bf2f2c1c9e7c4fcb Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 4 Aug 2025 20:57:17 +0800 Subject: [PATCH 14/26] fix ovis Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/ovis.py | 176 ++++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 vllm/transformers_utils/configs/ovis.py diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index cc41a771d06c..7a5ac0a2ae62 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,7 +35,8 @@ MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, SpeculatorsConfig, + OvisConfig, RWConfig, + SpeculatorsConfig, Step3TextConfig, Step3VLConfig, UltravoxConfig) # yapf: enable @@ -85,6 +86,7 @@ def _get_hf_token() -> Optional[str]: "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "ovis": OvisConfig, "ultravox": UltravoxConfig, "step3_vl": Step3VLConfig, "step3_text": Step3TextConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 64ace167a5a0..82d24bb16ba5 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from 
vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.ovis import OvisConfig from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, @@ -45,6 +46,7 @@ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", + "OvisConfig", "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py new file mode 100644 index 000000000000..550f5e15dbcc --- /dev/null +++ b/vllm/transformers_utils/configs/ovis.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# yapf: disable +# ruff: noqa: E501 +# adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py +# and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py +# Ovis Config with AimV2 config registration removed for Transformers compatibility +from typing import Any, Optional, Union + +from transformers import AutoConfig, PretrainedConfig + + +class AIMv2Config(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. 
+ """ + + model_type: str = "aimv2" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + + self.projection_dropout = projection_dropout + self.qkv_bias = qkv_bias + self.use_bias = use_bias + + +# ---------------------------------------------------------------------- +# Visual Tokenizer Configuration +# ---------------------------------------------------------------------- +class BaseVisualTokenizerConfig(PretrainedConfig): + + def __init__(self, + vocab_size=16384, + tokenize_function="softmax", + tau=1.0, + depths=None, + drop_cls_token=False, + backbone_config: Optional[Union[PretrainedConfig, + dict]] = None, + hidden_stride: int = 1, + **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.tokenize_function = tokenize_function + self.tau = tau + if isinstance(depths, str): + depths = [int(x) for x in depths.split('|')] + self.depths = depths + self.backbone_kwargs = dict[str, Any]() + self.drop_cls_token = drop_cls_token + if backbone_config is not None: + assert isinstance(backbone_config, (PretrainedConfig, dict)), \ + f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type" + if not isinstance(backbone_config, PretrainedConfig): + model_type = backbone_config['model_type'] + if model_type != "aimv2": + backbone_config.pop('model_type') + backbone_config = AutoConfig.for_model(model_type, **backbone_config) + else: + backbone_config = AIMv2Config(**backbone_config) + self.backbone_config = backbone_config + self.hidden_stride = hidden_stride + + +class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "aimv2_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig): + model_type = "siglip_visual_tokenizer" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.drop_cls_token: + self.drop_cls_token = False + if self.depths: + assert len(self.depths) == 1 + self.backbone_kwargs['num_hidden_layers'] = self.depths[0] + + +AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig) +AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig) + + +# ---------------------------------------------------------------------- +# Ovis Configuration +# ---------------------------------------------------------------------- +class OvisConfig(PretrainedConfig): + model_type = "ovis" + + def __init__(self, + llm_config: Optional[Union[PretrainedConfig, dict]] = None, + visual_tokenizer_config: Optional[Union[PretrainedConfig, + dict]] = None, + multimodal_max_length=8192, + hidden_size=None, + 
conversation_formatter_class=None, + llm_attn_implementation=None, + disable_tie_weight=False, + **kwargs): + super().__init__(**kwargs) + if llm_config is not None: + assert isinstance(llm_config, (PretrainedConfig, dict)), \ + f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type" + if not isinstance(llm_config, PretrainedConfig): + model_type = llm_config['model_type'] + llm_config.pop('model_type') + llm_config = AutoConfig.for_model(model_type, **llm_config) + + # map llm_config to text_config + self.text_config = llm_config + if visual_tokenizer_config is not None: + assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \ + f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type" + if not isinstance(visual_tokenizer_config, PretrainedConfig): + model_type = visual_tokenizer_config['model_type'] + visual_tokenizer_config.pop('model_type') + visual_tokenizer_config = AutoConfig.for_model( + model_type, **visual_tokenizer_config) + + self.visual_tokenizer_config = visual_tokenizer_config + self.multimodal_max_length = multimodal_max_length + self.hidden_size = hidden_size + self.conversation_formatter_class = conversation_formatter_class + self.llm_attn_implementation = llm_attn_implementation + self.disable_tie_weight = disable_tie_weight From ce190243a996a70a029e8ec11c9f986ef48b9767 Mon Sep 17 00:00:00 2001 From: isotr0py <2037008807@qq.com> Date: Mon, 4 Aug 2025 22:08:09 +0800 Subject: [PATCH 15/26] fix tarsier2 processing Signed-off-by: isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_vl.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 40d77312b72c..633f8598e879 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1395,11 +1395,12 @@ def __init__( **kwargs, ): self.image_processor = Tarsier2ImageProcessor(**vision_config) - super().__init__(image_processor=self.image_processor, - tokenizer=tokenizer, - video_processor=Qwen2VLVideoProcessor(), - chat_template=None, - **kwargs) + super().__init__( + image_processor=self.image_processor, + tokenizer=tokenizer, + video_processor=Qwen2VLVideoProcessor(**vision_config), + chat_template=None, + **kwargs) class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo): From 7f32eb64f97a1cf2750bcd6bdae291e6ddd105e2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 4 Aug 2025 18:00:37 +0200 Subject: [PATCH 16/26] Fix type hint in `replace_linear_class` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index aa445baffe8f..5b13f414206e 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): def replace_linear_class( linear: nn.Linear, style: Literal["colwise", "rowwise"], quant_config: QuantizationConfig -) -> Union[ColumnParallelLinear, RowParallelLinear]: +) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]: """ Replace nn.Linear with one of vLLM's tensor parallel linear classes. 
From 462ebc737e4a56c476ecb78c1e41a97ec181eec1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 4 Aug 2025 18:15:33 +0200 Subject: [PATCH 17/26] Fix `tp_plan` retrieval in Transformers backend Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5b13f414206e..6a591a032871 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -533,14 +533,16 @@ def tensor_parallel(self): Apply the model's tensor parallelization plan. Currently only supports linear layers. """ - if not self.model.supports_tp_plan: - if self.tp_size <= 1: - return + tp_plan = getattr(self.model.config, "base_model_tp_plan", {}) + if not tp_plan and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - tp_plan = self.model._tp_plan + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear + tp_plan[".*"] = "replicated" def _tensor_parallel(module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): @@ -552,6 +554,7 @@ def _tensor_parallel(module: nn.Module, prefix: str = ""): child_module, style, self.quant_config) setattr(module, child_name, new_module) log_replacement(qual_name, child_module, new_module) + break else: _tensor_parallel(child_module, prefix=qual_name) From 3d9754a716cd1c80c172925b372fe5a3fe89025f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 4 Aug 2025 19:46:58 +0200 Subject: [PATCH 18/26] Fix basic Models Test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/interfaces_base.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4d68227b2af8..697fa020deb4 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, +from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -14,6 +14,10 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler from vllm.model_executor.sampling_metadata import SamplingMetadata +else: + VllmConfig = Any + Pooler = Any + SamplingMetadata = Any logger = init_logger(__name__) @@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]): def __init__( self, - vllm_config: "VllmConfig", + vllm_config: VllmConfig, prefix: str = "", ) -> None: ... @@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): def compute_logits( self, hidden_states: T, - sampling_metadata: "SamplingMetadata", + sampling_metadata: SamplingMetadata, ) -> Optional[T]: """Return `None` if TP rank > 0.""" ... @@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. 
""" - pooler: "Pooler" + pooler: Pooler """The pooler is only called on TP rank 0.""" From 800edacfe81d571a596ccd54750533cfae60ed5a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 4 Aug 2025 21:02:46 +0200 Subject: [PATCH 19/26] Fix pipeline parallel test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 2 +- vllm/model_executor/models/utils.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 6a591a032871..5426aa1a905b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -520,7 +520,7 @@ def pipeline_parallel(self): for i in range(len(layers)): if start_layer <= i and i < end_layer: continue - layers[i] = PPMissingLayer(return_tuple=True) + layers[i] = PPMissingLayer() # Layers after module list for name in pp_plan[module_list_idx + 1:]: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 28508e1bac1e..fecd14dde4a8 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity): def __init__(self, *args, **kwargs): super().__init__() - self.return_tuple = kwargs.get("return_tuple", False) def forward(self, *args, **kwargs): - """ - Return the first arg from args or the first value from kwargs. - - Wraps the input in a tuple if `self.return_tuple` is True. - """ - input = args[0] if args else next(iter(kwargs.values())) - return (input, ) if self.return_tuple else input + """Return the first arg from args or the first value from kwargs.""" + return args[0] if args else next(iter(kwargs.values())) _CPU_OFFLOAD_BYTES = 0 From afe9f80ee0c0ce29df56f36030a15d761fd8986a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 09:53:39 +0200 Subject: [PATCH 20/26] Handle `base_model_tp_plan` being explicitly `None` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5426aa1a905b..0c3df267edb1 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -533,7 +533,7 @@ def tensor_parallel(self): Apply the model's tensor parallelization plan. Currently only supports linear layers. 
""" - tp_plan = getattr(self.model.config, "base_model_tp_plan", {}) + tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} if not tp_plan and self.tp_size > 1: raise ValueError( From d819ce4ba5aeb10826a599d6bd112a00b95b87a1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:42:30 +0200 Subject: [PATCH 21/26] Cap transformers version for custom models which are now broken Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 9b57af19b146..6a6a11ef2ec0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -271,6 +271,8 @@ def check_available_online( "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", + max_transformers_version="4.53", + transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", trust_remote_code=True), @@ -324,8 +326,12 @@ def check_available_online( "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True, v0_only=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), - "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), - "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B", + max_transformers_version="4.53", + transformers_version_reason="HF model uses remote code that is not compatible with latest Transformers"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 @@ -454,7 +460,7 @@ def check_available_online( extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 trust_remote_code=True, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501 - "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct", min_transformers_version=4.55, transformers_version_reason="HF impl is broken in 4.54"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 max_model_len=4096), From f306e75f7f451d795d50f09ba5a0b0447cc37f21 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 5 Aug 2025 20:28:48 +0800 Subject: [PATCH 22/26] disable fuyu temporarily Signed-off-by: Isotr0py --- tests/models/multimodal/generation/test_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 
8cb826c1144d..2a65d7e244d7 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -337,6 +337,10 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we + # should enable this again after the fix is released: + # https://github.com/huggingface/transformers/pull/39915 + marks=[pytest.mark.skip("HF model is broken")], ), "gemma3": VLMTestInfo( models=["google/gemma-3-4b-it"], From bfa2f3f4cc5c39ec025e9c37952ee7be6f0d5d6b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 14:43:43 +0200 Subject: [PATCH 23/26] Fix syntax error Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 1da35e33e930..bd0db8bd9fb8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -459,7 +459,7 @@ def check_available_online( extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 trust_remote_code=True, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501 - "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct", min_transformers_version=4.55, transformers_version_reason="HF impl is broken in 4.54"), # noqa: E501 + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct", min_transformers_version="4.55", transformers_version_reason="HF impl is broken in 4.54"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 max_model_len=4096), From 05240fde12c59ace58f02e79556c73e049aff90e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:37:43 +0200 Subject: [PATCH 24/26] Fix quantization tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 5 ++++- tests/quantization/test_experts_int8.py | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index bd0db8bd9fb8..ef3808bcfbc4 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -222,7 +222,10 @@ def check_available_online( trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", - extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501 + extras={ + "tiny": "ai21labs/Jamba-tiny-dev", + "random": "ai21labs/Jamba-tiny-random", # noqa: E501 + }), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct", extras={"guard": "meta-llama/Llama-Guard-3-1B", # noqa: E501 "hermes": "NousResearch/Hermes-3-Llama-3.1-8B", # noqa: E501 diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 84a656a3b9da..1e3e69e008bd 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -9,6 +9,8 @@ from tests.quantization.utils import is_quant_method_supported +from ..models.registry import HF_EXAMPLE_MODELS + MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"] @@ -25,6 +27,8 @@ def 
test_model_experts_int8_startup( dtype: str, max_tokens: int, ) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: From 3b61cf77378819c17fe6daa2e7b8752084b3144b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:22:04 +0200 Subject: [PATCH 25/26] Update to 4.55 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/common.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 0d8a927d13b2..1fffc8d1fa70 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.54.1 +transformers >= 4.55.0 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. diff --git a/requirements/test.in b/requirements/test.in index 13e36b18fa61..9c8c75dd6f70 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -35,7 +35,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.54.1 +transformers==4.55.0 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index dd41ea0552cf..08ba964f22a4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -214,7 +214,7 @@ fiona==1.10.1 # via torchgeo flask==3.1.1 # via mlflow -fonttools==4.54.1 +fonttools==4.55.0 # via matplotlib fqdn==1.5.1 # via jsonschema @@ -1148,7 +1148,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.54.1 +transformers==4.55.0 # via # -r requirements/test.in # genai-perf From 4104f9d4ed370384d6212f118f619afecdcf0ddd Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:25:09 +0200 Subject: [PATCH 26/26] Remove 4.55 min version as that's what we're on now Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index ef3808bcfbc4..92a719d7a92d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -462,7 +462,7 @@ def check_available_online( extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 trust_remote_code=True, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501 - "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct", min_transformers_version="4.55", transformers_version_reason="HF impl is broken in 4.54"), # noqa: E501 + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 max_model_len=4096),