From fcf09f2412145c64709bb55442db034f76f306e4 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Thu, 5 Jun 2025 18:11:55 +0800
Subject: [PATCH 1/5] fix cross_encoding

---
 vllm/model_executor/models/roberta.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 8fa8b89798d0..a8efa93dc022 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -178,6 +178,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
 
+        if vllm_config.model_config.task == "score":
+            # fix cross_encoding. See #19201
+            padding_idx = getattr(config,
+                                  "pad_token_id", 0)
+            vllm_config.model_config.max_model_len -= padding_idx * 2
+
         self.default_activation_function = \
             get_cross_encoder_activation_function(config)

From e3fab630d8f6196f3e0b2a60651e54f4eaf8ba7f Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Thu, 5 Jun 2025 18:56:20 +0800
Subject: [PATCH 2/5] fix

---
 vllm/model_executor/models/roberta.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index a8efa93dc022..767181806089 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -180,8 +180,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if vllm_config.model_config.task == "score":
             # fix cross_encoding. See #19201
-            padding_idx = getattr(config,
-                                  "pad_token_id", 0)
+            padding_idx = getattr(config, "pad_token_id", 0)
             vllm_config.model_config.max_model_len -= padding_idx * 2
 
         self.default_activation_function = \
             get_cross_encoder_activation_function(config)

From 7ad41a83811ef8e38ffbfe7ccef5d1e65eaebc7e Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Fri, 6 Jun 2025 13:55:18 +0800
Subject: [PATCH 3/5] model_max_length in tokenizer_config

---
 vllm/config.py                        | 11 +++++++++++
 vllm/model_executor/models/roberta.py |  5 -----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index a07c41ddab19..6ff25296cdc7 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -28,6 +28,7 @@
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from typing_extensions import deprecated, runtime_checkable
 
 import vllm.envs as envs
@@ -1427,6 +1428,16 @@ def get_and_verify_max_len(self, max_model_len: int):
             sliding_window_len=self.get_hf_config_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
             encoder_config=self.encoder_config)
+        try:
+            tokenizer_config = get_tokenizer_config(
+                self.tokenizer,
+                trust_remote_code=self.trust_remote_code,
+                revision=self.tokenizer_revision)
+            model_max_length = getattr(tokenizer_config, "model_max_length",
+                                       max_model_len)
+            max_model_len = min(max_model_len, model_max_length)
+        except ValueError:
+            pass
         return max_model_len
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 767181806089..8fa8b89798d0 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -178,11 +178,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
 
-        if vllm_config.model_config.task == "score":
-            # fix cross_encoding. See #19201
-            padding_idx = getattr(config, "pad_token_id", 0)
-            vllm_config.model_config.max_model_len -= padding_idx * 2
-
         self.default_activation_function = \
             get_cross_encoder_activation_function(config)

From bd2ed3c6b0f1c0d5dcba3dfbf5c09d65c37126b2 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Fri, 6 Jun 2025 14:12:57 +0800
Subject: [PATCH 4/5] + try_get_tokenizer_config

---
 vllm/config.py                    | 26 ++++++++++++++------------
 vllm/transformers_utils/config.py | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 6ff25296cdc7..1abe9f22a749 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -28,7 +28,6 @@
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
-from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from typing_extensions import deprecated, runtime_checkable
 
 import vllm.envs as envs
@@ -45,7 +44,8 @@
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    try_get_generation_config, try_get_safetensors_metadata, uses_mrope)
+    try_get_generation_config, try_get_safetensors_metadata,
+    try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
@@ -1428,16 +1428,18 @@ def get_and_verify_max_len(self, max_model_len: int):
             sliding_window_len=self.get_hf_config_sliding_window(),
             spec_target_max_model_len=self.spec_target_max_model_len,
             encoder_config=self.encoder_config)
-        try:
-            tokenizer_config = get_tokenizer_config(
-                self.tokenizer,
-                trust_remote_code=self.trust_remote_code,
-                revision=self.tokenizer_revision)
-            model_max_length = getattr(tokenizer_config, "model_max_length",
-                                       max_model_len)
-            max_model_len = min(max_model_len, model_max_length)
-        except ValueError:
-            pass
+
+        tokenizer_config = try_get_tokenizer_config(
+            self.tokenizer,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.tokenizer_revision)
+
+        if tokenizer_config is None:
+            return max_model_len
+
+        model_max_length = tokenizer_config.get("model_max_length",
+                                                max_model_len)
+        max_model_len = min(max_model_len, model_max_length)
 
         return max_model_len
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 9bc3b8e09ada..d99290dd690e 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -23,6 +23,7 @@
     get_image_processor_config)
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
+from transformers.models.auto.tokenization_auto import get_tokenizer_config
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
@@ -867,3 +868,18 @@ def try_get_safetensors_metadata(
             "Error retrieving safetensors")
     except Exception:
         return None
+
+
+def try_get_tokenizer_config(
+    pretrained_model_name_or_path: Union[str, os.PathLike],
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+) -> Optional[dict[str, Any]]:
+    try:
+        tokenizer_config = get_tokenizer_config(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision)
+        return tokenizer_config
+    except Exception:
+        return None

From f83380dc1ebf0b9b952117e83439e18099122a0f Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Fri, 6 Jun 2025 16:56:16 +0800
Subject: [PATCH 5/5] fix

---
 vllm/transformers_utils/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index d99290dd690e..d66e296fdbba 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -876,10 +876,10 @@ def try_get_tokenizer_config(
     revision: Optional[str] = None,
 ) -> Optional[dict[str, Any]]:
     try:
-        tokenizer_config = get_tokenizer_config(
+        return get_tokenizer_config(
             pretrained_model_name_or_path,
             trust_remote_code=trust_remote_code,
-            revision=revision)
-        return tokenizer_config
+            revision=revision,
+        )
     except Exception:
         return None