From 1e0edcc08ca73b4b42f0c30562ce0bc4b32f4480 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 16:25:03 +0800 Subject: [PATCH 01/19] fix nomic max_model_len --- vllm/config.py | 5 +++++ vllm/model_executor/models/bert_with_rope.py | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 4196684639ee..64f527c6ffa6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4467,6 +4467,11 @@ def _set_cudagraph_sizes(self): self.compilation_config.init_with_cudagraph_sizes( batch_size_capture_list) + def reset_max_model_len(self, max_model_len: int): + self.model_config.max_model_len = max_model_len + self.scheduler_config.max_model_len = max_model_len + self.compute_hash() + def __str__(self): return ( f"model={self.model_config.model!r}," diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index af6deb3bf072..b69535a42c26 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -10,6 +10,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -27,6 +28,8 @@ from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors +logger = init_logger(__name__) + class BertWithRopeEmbedding(nn.Module): @@ -526,6 +529,23 @@ def config_verify(self, vllm_config): # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. # See #17785 + if vllm_config.model_config.hf_overrides is not None: + # We need to allow users to manually change max_model_len. + from vllm.config import _get_and_verify_max_len + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + max_model_len=self.max_model_len, + disable_sliding_window=self.disable_sliding_window) + vllm_config.reset_max_model_len(max_model_len) + else: + # Reset max_model_len to config.max_trained_positions. + vllm_config.reset_max_model_len(config.max_trained_positions) + logger.warning( + "We did not use the nomic context extension method, " + "current max_model_len is %s. " + "The context extension uses vllm style " + "rope_theta and rope_scaling. 
", + vllm_config.model_config.max_model_len) return config From 994ac46796b49ced80cfa956dae13ce8ff440a04 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 17:12:56 +0800 Subject: [PATCH 02/19] + examples --- .../context_extension/chat.py | 59 +++++++++++++++++++ .../context_extension/embed.py | 42 +++++++++++++ vllm/model_executor/models/bert_with_rope.py | 15 +++-- 3 files changed, 110 insertions(+), 6 deletions(-) create mode 100644 examples/offline_inference/context_extension/chat.py create mode 100644 examples/offline_inference/context_extension/embed.py diff --git a/examples/offline_inference/context_extension/chat.py b/examples/offline_inference/context_extension/chat.py new file mode 100644 index 000000000000..a6079d187e2a --- /dev/null +++ b/examples/offline_inference/context_extension/chat.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams + +rope_theta = 1000000 +original_max_position_embeddings = 32768 +factor = 4.0 + +# Use yarn to extend context +hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": original_max_position_embeddings + }, + "max_model_len": int(original_max_position_embeddings * factor) +} + +llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides) + +sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=128, +) + +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] +outputs = llm.chat(conversation, sampling_params, use_tqdm=False) + + +def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\n") + print(f"Generated text: {generated_text!r}") + print("-" * 80) + + +print_outputs(outputs) diff --git a/examples/offline_inference/context_extension/embed.py b/examples/offline_inference/context_extension/embed.py new file mode 100644 index 000000000000..184609e4a713 --- /dev/null +++ b/examples/offline_inference/context_extension/embed.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM + +rope_theta = 1000 +factor = 4.0 +original_max_position_embeddings = 2048 + +# Use yarn to extend context +hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": original_max_position_embeddings + }, + "max_model_len": int(original_max_position_embeddings * factor) +} + +llm = LLM(model="nomic-ai/nomic-embed-text-v1", + trust_remote_code=True, + task="embed", + hf_overrides=hf_overrides) + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +outputs = llm.embed(prompts) + +print("\nGenerated Outputs:\n" + "-" * 60) +for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print( + f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})" + ) + print("-" * 60) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py 
index b69535a42c26..8153da502f55 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -516,10 +516,11 @@ def config_verify(self, vllm_config): head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = head_dim * config.rotary_emb_fraction + max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, - "max_position": config.max_trained_positions, + "max_position": max_trained_positions, "base": getattr(config, "rope_theta", config.rotary_emb_base), "rope_scaling": getattr(config, "rope_scaling", None) } @@ -533,13 +534,15 @@ def config_verify(self, vllm_config): # We need to allow users to manually change max_model_len. from vllm.config import _get_and_verify_max_len max_model_len = _get_and_verify_max_len( - hf_config=self.hf_text_config, - max_model_len=self.max_model_len, - disable_sliding_window=self.disable_sliding_window) + hf_config=vllm_config.model_config.hf_text_config, + max_model_len=vllm_config.model_config.max_model_len, + disable_sliding_window=False, + sliding_window_len=None + ) vllm_config.reset_max_model_len(max_model_len) else: - # Reset max_model_len to config.max_trained_positions. - vllm_config.reset_max_model_len(config.max_trained_positions) + # Reset max_model_len to max_trained_positions + vllm_config.reset_max_model_len(max_trained_positions) logger.warning( "We did not use the nomic context extension method, " "current max_model_len is %s. " From 6333d8027f7a9a179434be12f026642dbb764976 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 18:16:45 +0800 Subject: [PATCH 03/19] fix --- .../chat.py => context_extension.py} | 23 +++------- .../context_extension/embed.py | 42 ------------------- vllm/model_executor/models/bert_with_rope.py | 7 ++-- 3 files changed, 8 insertions(+), 64 deletions(-) rename examples/offline_inference/{context_extension/chat.py => context_extension.py} (72%) delete mode 100644 examples/offline_inference/context_extension/embed.py diff --git a/examples/offline_inference/context_extension/chat.py b/examples/offline_inference/context_extension.py similarity index 72% rename from examples/offline_inference/context_extension/chat.py rename to examples/offline_inference/context_extension.py index a6079d187e2a..1a70446c30a0 100644 --- a/examples/offline_inference/context_extension/chat.py +++ b/examples/offline_inference/context_extension.py @@ -12,9 +12,9 @@ "rope_scaling": { "rope_type": "yarn", "factor": factor, - "original_max_position_embeddings": original_max_position_embeddings + "original_max_position_embeddings": original_max_position_embeddings, }, - "max_model_len": int(original_max_position_embeddings * factor) + "max_model_len": int(original_max_position_embeddings * factor), } llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides) @@ -26,22 +26,9 @@ ) conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! 
How can I assist you today?"}, ] outputs = llm.chat(conversation, sampling_params, use_tqdm=False) diff --git a/examples/offline_inference/context_extension/embed.py b/examples/offline_inference/context_extension/embed.py deleted file mode 100644 index 184609e4a713..000000000000 --- a/examples/offline_inference/context_extension/embed.py +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM - -rope_theta = 1000 -factor = 4.0 -original_max_position_embeddings = 2048 - -# Use yarn to extend context -hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { - "rope_type": "yarn", - "factor": factor, - "original_max_position_embeddings": original_max_position_embeddings - }, - "max_model_len": int(original_max_position_embeddings * factor) -} - -llm = LLM(model="nomic-ai/nomic-embed-text-v1", - trust_remote_code=True, - task="embed", - hf_overrides=hf_overrides) - -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -outputs = llm.embed(prompts) - -print("\nGenerated Outputs:\n" + "-" * 60) -for prompt, output in zip(prompts, outputs): - embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print( - f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})" - ) - print("-" * 60) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 8153da502f55..d5bd5853ea56 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -530,18 +530,17 @@ def config_verify(self, vllm_config): # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. # See #17785 - if vllm_config.model_config.hf_overrides is not None: + if vllm_config.model_config.hf_overrides: # We need to allow users to manually change max_model_len. from vllm.config import _get_and_verify_max_len max_model_len = _get_and_verify_max_len( hf_config=vllm_config.model_config.hf_text_config, max_model_len=vllm_config.model_config.max_model_len, disable_sliding_window=False, - sliding_window_len=None - ) + sliding_window_len=None) vllm_config.reset_max_model_len(max_model_len) else: - # Reset max_model_len to max_trained_positions + # Reset max_model_len to max_trained_positions. 
vllm_config.reset_max_model_len(max_trained_positions) logger.warning( "We did not use the nomic context extension method, " From b2846e2aa694f003d1ce453a8e07d3a5b3ea2955 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 20:28:21 +0800 Subject: [PATCH 04/19] fix --- .../pooling/test_nomic_max_model_len.py | 144 ++++++++++++++++++ vllm/config.py | 1 + vllm/model_executor/models/bert_with_rope.py | 44 ++++-- 3 files changed, 176 insertions(+), 13 deletions(-) create mode 100644 tests/models/language/pooling/test_nomic_max_model_len.py diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py new file mode 100644 index 000000000000..31217610dd73 --- /dev/null +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# ruff: noqa: SIM117 +import pytest + +from ...utils import EmbedModelInfo + +MODELS = [ + EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), + #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), + #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), +] + +rope_theta = 1000 +factor = 4.0 +original_max_position_embeddings = 2048 +max_model_len = int(original_max_position_embeddings * factor) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_default(model_info, vllm_runner): + hf_overrides = {} + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. + assert model_config.max_model_len == 512 + else: + assert model_config.max_model_len == 2048 + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_legal(model_info, vllm_runner): + hf_overrides = {} + + # set max_model_len < 512 + with vllm_runner(model_info.name, + task="embed", + max_model_len=256, + hf_overrides=hf_overrides) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 256 + + # set 512 < max_model_len < 2048 + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. 
+ with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=1024, + hf_overrides=hf_overrides): + pass + else: + with vllm_runner(model_info.name, + task="embed", + max_model_len=1024, + hf_overrides=hf_overrides) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 1024 + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_illegal(model_info, vllm_runner): + # set max_model_len > 2048 + hf_overrides = {} + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=4096, + hf_overrides=hf_overrides): + pass + + # set max_model_len > 2048 by hf_overrides + hf_overrides = {"max_model_len": 4096} + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_legal(model_info, vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + } + + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_illegal(model_info, vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + } + } + # illegal max_model_len + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=max_model_len + 1, + hf_overrides=hf_overrides): + pass + + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + 1 + } + # illegal max_model_len by hf_overrides + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass diff --git a/vllm/config.py b/vllm/config.py index 64f527c6ffa6..254cc6c63e0a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -571,6 +571,7 @@ def __post_init__(self) -> None: sliding_window = None + self.original_max_model_len = self.max_model_len self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, max_model_len=self.max_model_len, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index d5bd5853ea56..819476e8dd91 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -529,26 +529,44 @@ def config_verify(self, vllm_config): # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. - # See #17785 - if vllm_config.model_config.hf_overrides: - # We need to allow users to manually change max_model_len. 
- from vllm.config import _get_and_verify_max_len - max_model_len = _get_and_verify_max_len( - hf_config=vllm_config.model_config.hf_text_config, - max_model_len=vllm_config.model_config.max_model_len, - disable_sliding_window=False, - sliding_window_len=None) - vllm_config.reset_max_model_len(max_model_len) - else: + # See #17785 #18755 + if (not vllm_config.model_config.hf_overrides and + vllm_config.model_config.original_max_model_len is None): + # Default # Reset max_model_len to max_trained_positions. - vllm_config.reset_max_model_len(max_trained_positions) + # nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. + max_model_len = min(vllm_config.model_config.max_model_len, + max_trained_positions) + + vllm_config.reset_max_model_len(max_model_len) logger.warning( "We did not use the nomic context extension method, " "current max_model_len is %s. " "The context extension uses vllm style " - "rope_theta and rope_scaling. ", + "rope_theta and rope_scaling. See: " + "examples/offline_inference/context_extension.html", vllm_config.model_config.max_model_len) + else: + # We need to re-verify max_model_len to avoid lengths greater than position_embedding. + from vllm.config import _get_and_verify_max_len + + model_config = vllm_config.model_config + max_model_len = model_config.hf_overrides.get("max_model_len", + vllm_config.model_config.max_model_len) + + # reset hf_text_config for _get_and_verify_max_len. + if hasattr(model_config.hf_text_config, "max_model_len"): + delattr(model_config.hf_text_config, "max_model_len") + model_config.hf_text_config.max_position_embeddings = max_trained_positions + model_config.hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + max_model_len = _get_and_verify_max_len( + hf_config=model_config.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=False, + sliding_window_len=None) + vllm_config.reset_max_model_len(max_model_len) return config From e1e920d397448f6e78ea6856e56def225228d8d6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 20:35:19 +0800 Subject: [PATCH 05/19] fix --- .../language/pooling/test_nomic_max_model_len.py | 6 +++--- vllm/model_executor/models/bert_with_rope.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 31217610dd73..9f6f0c4c5808 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -6,10 +6,10 @@ MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), - #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), - #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + EmbedModelInfo("nomic-ai/CodeRankEmbed"), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), - #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), ] rope_theta = 1000 diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 819476e8dd91..14c988f48953 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -548,21 +548,23 @@ def config_verify(self, vllm_config): "examples/offline_inference/context_extension.html", vllm_config.model_config.max_model_len) else: - # We need to re-verify max_model_len to avoid lengths greater than 
position_embedding. + # We need to re-verify max_model_len to avoid lengths + # greater than position_embedding. from vllm.config import _get_and_verify_max_len model_config = vllm_config.model_config + hf_text_config = model_config.hf_text_config max_model_len = model_config.hf_overrides.get("max_model_len", vllm_config.model_config.max_model_len) # reset hf_text_config for _get_and_verify_max_len. - if hasattr(model_config.hf_text_config, "max_model_len"): - delattr(model_config.hf_text_config, "max_model_len") - model_config.hf_text_config.max_position_embeddings = max_trained_positions - model_config.hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + if hasattr(hf_text_config, "max_model_len"): + delattr(hf_text_config, "max_model_len") + hf_text_config.max_position_embeddings = max_trained_positions + hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] max_model_len = _get_and_verify_max_len( - hf_config=model_config.hf_text_config, + hf_config=hf_text_config, max_model_len=max_model_len, disable_sliding_window=False, sliding_window_len=None) From 75ed7bebb4f97547ca712f38ea05db7e5e79035f Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 20:36:17 +0800 Subject: [PATCH 06/19] fix --- tests/models/language/pooling/test_nomic_max_model_len.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 9f6f0c4c5808..31217610dd73 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -6,10 +6,10 @@ MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), - EmbedModelInfo("nomic-ai/CodeRankEmbed"), + #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + #EmbedModelInfo("nomic-ai/CodeRankEmbed"), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), + #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), ] rope_theta = 1000 From 52d9ce7c6f49f10df0d2c9fba18cbeb5f946ffab Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 20:43:59 +0800 Subject: [PATCH 07/19] fix --- .../pooling/test_nomic_max_model_len.py | 19 +++++-------------- vllm/model_executor/models/bert_with_rope.py | 8 ++++---- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 31217610dd73..df6961e48ffd 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -20,11 +20,9 @@ @pytest.mark.parametrize("model_info", MODELS) def test_default(model_info, vllm_runner): - hf_overrides = {} with vllm_runner(model_info.name, task="embed", - max_model_len=None, - hf_overrides=hf_overrides) as vllm_model: + max_model_len=None) as vllm_model: model_config = vllm_model.model.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 @@ -36,13 +34,10 @@ def test_default(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_set_max_model_len_legal(model_info, vllm_runner): - hf_overrides = {} - # set max_model_len < 512 with vllm_runner(model_info.name, task="embed", - max_model_len=256, - hf_overrides=hf_overrides) as vllm_model: + 
max_model_len=256) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 256 @@ -53,14 +48,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner): with pytest.raises(ValueError): with vllm_runner(model_info.name, task="embed", - max_model_len=1024, - hf_overrides=hf_overrides): + max_model_len=1024): pass else: with vllm_runner(model_info.name, task="embed", - max_model_len=1024, - hf_overrides=hf_overrides) as vllm_model: + max_model_len=1024) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 1024 @@ -68,12 +61,10 @@ def test_set_max_model_len_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_set_max_model_len_illegal(model_info, vllm_runner): # set max_model_len > 2048 - hf_overrides = {} with pytest.raises(ValueError): with vllm_runner(model_info.name, task="embed", - max_model_len=4096, - hf_overrides=hf_overrides): + max_model_len=4096): pass # set max_model_len > 2048 by hf_overrides diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 14c988f48953..41db6a7c8a8e 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -530,8 +530,8 @@ def config_verify(self, vllm_config): # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. # See #17785 #18755 - if (not vllm_config.model_config.hf_overrides and - vllm_config.model_config.original_max_model_len is None): + if (not vllm_config.model_config.hf_overrides + and vllm_config.model_config.original_max_model_len is None): # Default # Reset max_model_len to max_trained_positions. # nomic-embed-text-v2-moe the length is set to 512 @@ -554,8 +554,8 @@ def config_verify(self, vllm_config): model_config = vllm_config.model_config hf_text_config = model_config.hf_text_config - max_model_len = model_config.hf_overrides.get("max_model_len", - vllm_config.model_config.max_model_len) + max_model_len = model_config.hf_overrides.get( + "max_model_len", vllm_config.model_config.max_model_len) # reset hf_text_config for _get_and_verify_max_len. if hasattr(hf_text_config, "max_model_len"): From c594729939df517acada048f0bd6f8e9dd555e79 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 20:53:30 +0800 Subject: [PATCH 08/19] fix --- .../pooling/test_nomic_max_model_len.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index df6961e48ffd..68603e62843e 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -20,8 +20,7 @@ @pytest.mark.parametrize("model_info", MODELS) def test_default(model_info, vllm_runner): - with vllm_runner(model_info.name, - task="embed", + with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: model_config = vllm_model.model.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": @@ -29,30 +28,28 @@ def test_default(model_info, vllm_runner): # by sentence_bert_config.json. 
assert model_config.max_model_len == 512 else: - assert model_config.max_model_len == 2048 + assert ( + model_config.max_model_len == original_max_position_embeddings) @pytest.mark.parametrize("model_info", MODELS) def test_set_max_model_len_legal(model_info, vllm_runner): - # set max_model_len < 512 - with vllm_runner(model_info.name, - task="embed", + # set max_model_len <= 512 + with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 256 - # set 512 < max_model_len < 2048 + # set 512 < max_model_len <= 2048 if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. with pytest.raises(ValueError): - with vllm_runner(model_info.name, - task="embed", + with vllm_runner(model_info.name, task="embed", max_model_len=1024): pass else: - with vllm_runner(model_info.name, - task="embed", + with vllm_runner(model_info.name, task="embed", max_model_len=1024) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 1024 @@ -62,9 +59,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): def test_set_max_model_len_illegal(model_info, vllm_runner): # set max_model_len > 2048 with pytest.raises(ValueError): - with vllm_runner(model_info.name, - task="embed", - max_model_len=4096): + with vllm_runner(model_info.name, task="embed", max_model_len=4096): pass # set max_model_len > 2048 by hf_overrides From 3ab82d660c7bf4eac57a028fd1f6725a06b2d8ff Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 21:04:58 +0800 Subject: [PATCH 09/19] fix --- vllm/model_executor/models/bert_with_rope.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 41db6a7c8a8e..3c4e5be9f7a4 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -554,7 +554,8 @@ def config_verify(self, vllm_config): model_config = vllm_config.model_config hf_text_config = model_config.hf_text_config - max_model_len = model_config.hf_overrides.get( + hf_overrides = model_config.hf_overrides or {} + max_model_len = hf_overrides.get( "max_model_len", vllm_config.model_config.max_model_len) # reset hf_text_config for _get_and_verify_max_len. 
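
For reference, the override path these commits exercise can be driven offline much like the embed example that PATCH 03 removed — a minimal sketch, reusing the model and constants from the test file above (treat the exact model name and values as illustrative, not part of the patch):

from vllm import LLM

# Constants mirror tests/models/language/pooling/test_nomic_max_model_len.py.
rope_theta = 1000
factor = 4.0
original_max_position_embeddings = 2048

# vLLM-style yarn overrides; max_model_len is extended to 2048 * 4.0 = 8192.
hf_overrides = {
    "rope_theta": rope_theta,
    "rope_scaling": {
        "rope_type": "yarn",
        "factor": factor,
        "original_max_position_embeddings": original_max_position_embeddings,
    },
    "max_model_len": int(original_max_position_embeddings * factor),
}

llm = LLM(
    model="nomic-ai/nomic-embed-text-v1",
    task="embed",
    trust_remote_code=True,
    hf_overrides=hf_overrides,
)

outputs = llm.embed(["Hello, my name is"])
print(len(outputs[0].outputs.embedding))

Without hf_overrides (and without an explicit max_model_len), the series instead clamps max_model_len to max_trained_positions, as the warning added in bert_with_rope.py describes.
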
From 54669206d0e901dc03327bfe9f861bb848554539 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 21:39:46 +0800 Subject: [PATCH 10/19] fix --- vllm/config.py | 10 +++++++++- vllm/model_executor/models/bert_with_rope.py | 18 +++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 254cc6c63e0a..fb50348bd75d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4468,7 +4468,15 @@ def _set_cudagraph_sizes(self): self.compilation_config.init_with_cudagraph_sizes( batch_size_capture_list) - def reset_max_model_len(self, max_model_len: int): + def recalculate_max_model_len(self, max_model_len): + model_config = self.model_config + max_model_len = _get_and_verify_max_len( + hf_config=model_config.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=model_config.disable_sliding_window, + sliding_window_len=model_config.get_hf_config_sliding_window(), + spec_target_max_model_len=model_config.spec_target_max_model_len, + encoder_config=model_config.encoder_config) self.model_config.max_model_len = max_model_len self.scheduler_config.max_model_len = max_model_len self.compute_hash() diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 3c4e5be9f7a4..05cee54bb0e6 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -536,40 +536,36 @@ def config_verify(self, vllm_config): # Reset max_model_len to max_trained_positions. # nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. + max_model_len_before = vllm_config.model_config.max_model_len max_model_len = min(vllm_config.model_config.max_model_len, max_trained_positions) - vllm_config.reset_max_model_len(max_model_len) + vllm_config.recalculate_max_model_len(max_model_len) logger.warning( "We did not use the nomic context extension method, " + "max_model_len before is %s." "current max_model_len is %s. " "The context extension uses vllm style " "rope_theta and rope_scaling. See: " "examples/offline_inference/context_extension.html", - vllm_config.model_config.max_model_len) + max_model_len_before, vllm_config.model_config.max_model_len) else: # We need to re-verify max_model_len to avoid lengths # greater than position_embedding. - from vllm.config import _get_and_verify_max_len - model_config = vllm_config.model_config hf_text_config = model_config.hf_text_config hf_overrides = model_config.hf_overrides or {} max_model_len = hf_overrides.get( "max_model_len", vllm_config.model_config.max_model_len) - # reset hf_text_config for _get_and_verify_max_len. + # reset hf_text_config for recalculate_max_model_len. 
if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + model_config.encoder_config = None - max_model_len = _get_and_verify_max_len( - hf_config=hf_text_config, - max_model_len=max_model_len, - disable_sliding_window=False, - sliding_window_len=None) - vllm_config.reset_max_model_len(max_model_len) + vllm_config.recalculate_max_model_len(max_model_len) return config From 99667396551b8114db0e0c88e82dc5cf4b04b112 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 21:41:17 +0800 Subject: [PATCH 11/19] fix --- vllm/model_executor/models/bert_with_rope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 05cee54bb0e6..e154bc31898f 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -542,8 +542,8 @@ def config_verify(self, vllm_config): vllm_config.recalculate_max_model_len(max_model_len) logger.warning( - "We did not use the nomic context extension method, " - "max_model_len before is %s." + "We did not use the nomic context extension method." + "Max_model_len before is %s, " "current max_model_len is %s. " "The context extension uses vllm style " "rope_theta and rope_scaling. See: " From d06275a3903ef80e8e0a36e453743f56164eb2f3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 22:02:50 +0800 Subject: [PATCH 12/19] fix --- vllm/config.py | 2 +- vllm/model_executor/models/bert_with_rope.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index fb50348bd75d..72830a71f401 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4468,7 +4468,7 @@ def _set_cudagraph_sizes(self): self.compilation_config.init_with_cudagraph_sizes( batch_size_capture_list) - def recalculate_max_model_len(self, max_model_len): + def recalculate_max_model_len(self, max_model_len: int): model_config = self.model_config max_model_len = _get_and_verify_max_len( hf_config=model_config.hf_text_config, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index e154bc31898f..7a9b56e8c9a5 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -542,12 +542,10 @@ def config_verify(self, vllm_config): vllm_config.recalculate_max_model_len(max_model_len) logger.warning( - "We did not use the nomic context extension method." - "Max_model_len before is %s, " - "current max_model_len is %s. " - "The context extension uses vllm style " - "rope_theta and rope_scaling. See: " - "examples/offline_inference/context_extension.html", + "Nomic context extension is disabled. " + "Changing max_model_len from %s to %s. 
" + "To enable context extension, see: " + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html", max_model_len_before, vllm_config.model_config.max_model_len) else: # We need to re-verify max_model_len to avoid lengths From 653d573b2fe9d886068db6e5127317e93aa8aa18 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 22:21:36 +0800 Subject: [PATCH 13/19] fix --- .../models/language/pooling/test_nomic_max_model_len.py | 6 +++--- vllm/model_executor/models/bert_with_rope.py | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 68603e62843e..12fe75e94ac1 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -6,10 +6,10 @@ MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), - #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), - #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + EmbedModelInfo("nomic-ai/CodeRankEmbed"), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), - #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), ] rope_theta = 1000 diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 7a9b56e8c9a5..df8c0714e28d 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -559,9 +559,16 @@ def config_verify(self, vllm_config): # reset hf_text_config for recalculate_max_model_len. if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") + hf_text_config.max_position_embeddings = max_trained_positions hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] - model_config.encoder_config = None + + # The priority of sentence_bert_config.json is higher + # than max_position_embeddings + encoder_config = model_config.encoder_config + if hasattr(encoder_config, "max_seq_length"): + delattr(encoder_config, "max_seq_length") + model_config.encoder_config = encoder_config vllm_config.recalculate_max_model_len(max_model_len) return config From c04050a1cf9ee1a4109938a98544202176871cc2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 22:23:16 +0800 Subject: [PATCH 14/19] fix --- vllm/model_executor/models/bert_with_rope.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index df8c0714e28d..aee96b5f38b5 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -566,8 +566,7 @@ def config_verify(self, vllm_config): # The priority of sentence_bert_config.json is higher # than max_position_embeddings encoder_config = model_config.encoder_config - if hasattr(encoder_config, "max_seq_length"): - delattr(encoder_config, "max_seq_length") + encoder_config.pop("max_seq_length", None) model_config.encoder_config = encoder_config vllm_config.recalculate_max_model_len(max_model_len) From b5fa6bdaa3b37fbf16f3a5102334d2ed7605aeef Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 22:41:58 +0800 Subject: [PATCH 15/19] fix --- tests/models/language/pooling/test_nomic_max_model_len.py | 5 ++++- vllm/model_executor/models/bert_with_rope.py | 5 +---- 2 files changed, 5 
insertions(+), 5 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 12fe75e94ac1..c5a480c040e6 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -33,15 +33,18 @@ def test_default(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) -def test_set_max_model_len_legal(model_info, vllm_runner): +def test_set_max_model_len_legal1(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 256 +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_legal2(model_info, vllm_runner): # set 512 < max_model_len <= 2048 if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + pytest.skip("model_config.encoder_config has some kind of cache?") # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. with pytest.raises(ValueError): diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index aee96b5f38b5..8cdc0634003a 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -559,15 +559,12 @@ def config_verify(self, vllm_config): # reset hf_text_config for recalculate_max_model_len. if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") - hf_text_config.max_position_embeddings = max_trained_positions hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings - encoder_config = model_config.encoder_config - encoder_config.pop("max_seq_length", None) - model_config.encoder_config = encoder_config + model_config.encoder_config.pop("max_seq_length", None) vllm_config.recalculate_max_model_len(max_model_len) return config From c746de9fd19c6b2b16d7c08a221f6e3b2a189eea Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 22:58:35 +0800 Subject: [PATCH 16/19] fix --- .../pooling/test_nomic_max_model_len.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index c5a480c040e6..e446821a096d 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,15 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 import pytest +import vllm.transformers_utils.config as config from ...utils import EmbedModelInfo + +def cache_clear(): + config.get_sentence_transformer_tokenizer_config.cache_clear() + + MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), - EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), - EmbedModelInfo("nomic-ai/CodeRankEmbed"), + #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + #EmbedModelInfo("nomic-ai/CodeRankEmbed"), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), - EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), + #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), ] rope_theta = 1000 @@ -40,11 +46,10 @@ def test_set_max_model_len_legal1(model_info, vllm_runner): model_config = vllm_model.model.llm_engine.model_config assert model_config.max_model_len == 256 
-@pytest.mark.parametrize("model_info", MODELS) -def test_set_max_model_len_legal2(model_info, vllm_runner): # set 512 < max_model_len <= 2048 if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": - pytest.skip("model_config.encoder_config has some kind of cache?") + cache_clear() + # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. with pytest.raises(ValueError): From 886eb32d0a8be52e4c74f247aaf6c77ac241acc4 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 23:05:18 +0800 Subject: [PATCH 17/19] fix --- tests/models/language/pooling/test_nomic_max_model_len.py | 8 -------- vllm/model_executor/models/bert_with_rope.py | 5 ++++- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index e446821a096d..3d3bcac7381e 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,15 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 import pytest -import vllm.transformers_utils.config as config from ...utils import EmbedModelInfo - -def cache_clear(): - config.get_sentence_transformer_tokenizer_config.cache_clear() - - MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), @@ -48,8 +42,6 @@ def test_set_max_model_len_legal1(model_info, vllm_runner): # set 512 < max_model_len <= 2048 if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": - cache_clear() - # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. with pytest.raises(ValueError): diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 8cdc0634003a..d4721c13a245 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable +from copy import deepcopy from typing import Optional import torch @@ -564,7 +565,9 @@ def config_verify(self, vllm_config): # The priority of sentence_bert_config.json is higher # than max_position_embeddings - model_config.encoder_config.pop("max_seq_length", None) + encoder_config = deepcopy(model_config.encoder_config) + encoder_config.pop("max_seq_length", None) + model_config.encoder_config = encoder_config vllm_config.recalculate_max_model_len(max_model_len) return config From de07c5d8a08d2eb3a76b8952826d7442c1b15f3b Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 27 May 2025 23:07:47 +0800 Subject: [PATCH 18/19] fix --- tests/models/language/pooling/test_nomic_max_model_len.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 3d3bcac7381e..68603e62843e 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -33,7 +33,7 @@ def test_default(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) -def test_set_max_model_len_legal1(model_info, vllm_runner): +def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: From 961ef1a3f8612c8a33d9e658656334a28407ceca Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 28 
May 2025 09:55:47 +0800 Subject: [PATCH 19/19] fix --- vllm/model_executor/models/bert_with_rope.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index d4721c13a245..8a387d71f1cb 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -553,9 +553,15 @@ def config_verify(self, vllm_config): # greater than position_embedding. model_config = vllm_config.model_config hf_text_config = model_config.hf_text_config - hf_overrides = model_config.hf_overrides or {} - max_model_len = hf_overrides.get( - "max_model_len", vllm_config.model_config.max_model_len) + + if isinstance(model_config.hf_overrides, dict): + # hf_overrides_kw + max_model_len = model_config.hf_overrides.get( + "max_model_len", vllm_config.model_config.max_model_len) + else: + # hf_overrides_fn + # This might be overridden by sentence_bert_config.json. + max_model_len = vllm_config.model_config.max_model_len # reset hf_text_config for recalculate_max_model_len. if hasattr(hf_text_config, "max_model_len"):