
Commit 817ca57

DarkLight1337 authored and Yuqi Zhang committed

[Frontend] Chat template fallbacks for multimodal models (vllm-project#17805)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
1 parent 1233d6f commit 817ca57

File tree: 18 files changed, +219 −52 lines

docs/source/serving/multimodal_inputs.md

Lines changed: 6 additions & 3 deletions

@@ -213,10 +213,13 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions

 :::{important}
 A chat template is **required** to use Chat Completions API.
+For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.

-Although most models come with a chat template, for others you have to define one yourself.
-The chat template can be inferred based on the documentation on the model's HuggingFace repo.
-For example, DeepSeek-VL2 requires a chat template that can be found here: <gh-file:examples/template_deepseek_vl2.jinja>
+If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
+If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
+
+For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
+For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
 :::

 ### Image Inputs
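The new documentation text describes a three-step lookup. The following is an illustrative sketch only; the function and its arguments are hypothetical stand-ins, not the actual API of <gh-file:vllm/transformers_utils/chat_templates/registry.py>.

```python
# Illustrative sketch of the documented lookup order; not vLLM's real code.
from typing import Optional


def pick_chat_template(
    tokenizer_template: Optional[str],  # from chat_template.json / tokenizer_config.json
    fallback_template: Optional[str],   # built-in fallback, if one is registered
) -> str:
    if tokenizer_template is not None:  # 1. the model's own default template
        return tokenizer_template
    if fallback_template is not None:   # 2. built-in fallback from the registry
        return fallback_template
    # 3. no template anywhere: an error asks the user for --chat-template
    raise ValueError(
        "No chat template found; provide one via the --chat-template argument")
```

Models such as Fuyu or PaliGemma, which ship no template of their own, now hit the second branch of this sketch instead of the error (see the new fallback test below).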

examples/template_florence2.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

examples/template_paligemma.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

examples/template_qwen_vl.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/entrypoints/openai/test_chat_template.py

Lines changed: 18 additions & 2 deletions

@@ -2,11 +2,13 @@

 import pytest

+from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                          load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer

+from ...models.registry import HF_EXAMPLE_MODELS
 from ...utils import VLLM_PATH

 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"

@@ -91,8 +93,22 @@ def test_no_load_chat_template_literallike():
                          MODEL_TEMPLATE_GENERATON_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
                         continue_final_message, expected_output):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )
+
     # Initialize the tokenizer
-    tokenizer = get_tokenizer(tokenizer_name=model)
+    tokenizer = get_tokenizer(
+        tokenizer_name=model_config.tokenizer,
+        trust_remote_code=model_config.trust_remote_code,
+    )
     template_content = load_chat_template(chat_template=template)

     # Create a mock request object using keyword arguments

@@ -106,8 +122,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,

     # Call the function and get the result
     result = apply_hf_chat_template(
+        model_config,
         tokenizer,
-        trust_remote_code=True,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
         tools=None,
tests/entrypoints/test_chat_utils.py

Lines changed: 96 additions & 19 deletions

@@ -4,8 +4,6 @@
 from typing import Optional

 import pytest
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig

@@ -19,6 +17,7 @@
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup

+from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import VLLM_PATH

 EXAMPLES_DIR = VLLM_PATH / "examples"

@@ -772,6 +771,7 @@ def get_conversation(is_hf: bool):
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

@@ -793,8 +793,8 @@ def get_conversation(is_hf: bool):
     )

     vllm_result = apply_hf_chat_template(
+        model_config,
         tokenizer,
-        trust_remote_code=model_config.trust_remote_code,
         conversation=conversation,
         chat_template=None,
         tools=None,

@@ -813,13 +813,24 @@ def get_conversation(is_hf: bool):
 @pytest.mark.parametrize("use_tools", [True, False])
 def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     """checks that chat_template is a dict type for HF models."""
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )

     # Build the tokenizer group and grab the underlying tokenizer
     tokenizer_group = TokenizerGroup(
         model,
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

@@ -834,10 +845,10 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):

     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
+        model_config,
         tokenizer,
         chat_template=None,
         tools=tools,
-        trust_remote_code=True,
     )
     assert isinstance(chat_template, str)

@@ -857,24 +868,32 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
 )
 # yapf: enable
 def test_resolve_content_format_hf_defined(model, expected_format):
-    if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
-            "4.49.0"):
-        pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )

     tokenizer_group = TokenizerGroup(
         model,
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
+        model_config,
         tokenizer,
         chat_template=None,
         tools=None,
-        trust_remote_code=True,
     )
     assert isinstance(chat_template, str)

@@ -884,11 +903,70 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     print(_try_extract_ast(chat_template))

     resolved_format = resolve_chat_template_content_format(
+        model_config,
+        None,  # Test detecting the tokenizer's chat_template
+        None,
+        "auto",
+        tokenizer,
+    )
+
+    assert resolved_format == expected_format
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("model", "expected_format"),
+    [("Salesforce/blip2-opt-2.7b", "string"),
+     ("facebook/chameleon-7b", "string"),
+     ("deepseek-ai/deepseek-vl2-tiny", "string"),
+     ("microsoft/Florence-2-base", "string"),
+     ("adept/fuyu-8b", "string"),
+     ("google/paligemma-3b-mix-224", "string"),
+     ("Qwen/Qwen-VL", "string"),
+     ("Qwen/Qwen-VL-Chat", "string")],
+)
+# yapf: enable
+def test_resolve_content_format_fallbacks(model, expected_format):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )
+
+    tokenizer_group = TokenizerGroup(
+        model_config.tokenizer,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
+    )
+    tokenizer = tokenizer_group.tokenizer
+
+    # Test detecting the tokenizer's chat_template
+    chat_template = resolve_hf_chat_template(
+        model_config,
+        tokenizer,
+        chat_template=None,
+        tools=None,
+    )
+    assert isinstance(chat_template, str)
+
+    print("[TEXT]")
+    print(chat_template)
+    print("[AST]")
+    print(_try_extract_ast(chat_template))
+
+    resolved_format = resolve_chat_template_content_format(
+        model_config,
         None,  # Test detecting the tokenizer's chat_template
         None,
         "auto",
         tokenizer,
-        trust_remote_code=True,
     )

     assert resolved_format == expected_format

@@ -899,22 +977,14 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     ("template_path", "expected_format"),
     [("template_alpaca.jinja", "string"),
      ("template_baichuan.jinja", "string"),
-     ("template_blip2.jinja", "string"),
-     ("template_chameleon.jinja", "string"),
      ("template_chatglm.jinja", "string"),
      ("template_chatglm2.jinja", "string"),
      ("template_chatml.jinja", "string"),
-     ("template_deepseek_vl2.jinja", "string"),
      ("template_dse_qwen2_vl.jinja", "openai"),
      ("template_falcon_180b.jinja", "string"),
      ("template_falcon.jinja", "string"),
-     ("template_florence2.jinja", "string"),
-     ("template_fuyu.jinja", "string"),
      ("template_inkbot.jinja", "string"),
-     ("template_paligemma.jinja", "string"),
      ("template_teleflm.jinja", "string"),
-     ("template_qwen_vl.jinja", "string"),
-     ("template_qwen_vl_chat.jinja", "string"),
      ("template_vlm2vec.jinja", "openai"),
      ("tool_chat_template_granite_20b_fc.jinja", "string"),
      ("tool_chat_template_hermes.jinja", "string"),

@@ -926,11 +996,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
 )
 # yapf: enable
 def test_resolve_content_format_examples(template_path, expected_format):
+    model_config = ModelConfig(
+        PHI3V_MODEL_ID,  # Dummy
+        tokenizer=PHI3V_MODEL_ID,  # Dummy
+        trust_remote_code=True,
+    )
+
     tokenizer_group = TokenizerGroup(
-        PHI3V_MODEL_ID,
+        PHI3V_MODEL_ID,  # Dummy
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     dummy_tokenizer = tokenizer_group.tokenizer
     dummy_tokenizer.chat_template = None

@@ -944,11 +1021,11 @@ def test_resolve_content_format_examples(template_path, expected_format):
     print(_try_extract_ast(chat_template))

     resolved_format = resolve_chat_template_content_format(
+        model_config,
         chat_template,
         None,
         "auto",
         dummy_tokenizer,
-        trust_remote_code=True,
     )

     assert resolved_format == expected_format
tests/models/registry.py

Lines changed: 4 additions & 2 deletions

@@ -182,7 +182,9 @@ def check_available_online(
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
                                         extras={"tiny": "ai21labs/Jamba-tiny-dev"}),  # noqa: E501
-    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
+    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
+                                        extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
+                                                "hermes": "NousResearch/Hermes-3-Llama-3.1-8B"}),  # noqa: E501
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),

@@ -378,7 +380,7 @@ def check_available_online(
     # Therefore, we borrow the BartTokenizer from the original Bart model
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
                                                          tokenizer="Isotr0py/Florence-2-tokenizer",
-                                                         trust_remote_code=True),  # noqa: E501
+                                                         trust_remote_code=True,),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }