
Commit 817ca57

DarkLight1337 authored and Yuqi Zhang committed

[Frontend] Chat template fallbacks for multimodal models (vllm-project#17805)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
1 parent 1233d6f commit 817ca57

File tree: 18 files changed, +219 −52 lines

docs/source/serving/multimodal_inputs.md

Lines changed: 6 additions & 3 deletions

@@ -213,10 +213,13 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions

 :::{important}
 A chat template is **required** to use Chat Completions API.
+For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.

-Although most models come with a chat template, for others you have to define one yourself.
-The chat template can be inferred based on the documentation on the model's HuggingFace repo.
-For example, DeepSeek-VL2 requires a chat template that can be found here: <gh-file:examples/template_deepseek_vl2.jinja>
+If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
+If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
+
+For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
+For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
 :::

 ### Image Inputs
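The new documentation text describes a three-step lookup. The following is an illustrative sketch only; the function and its arguments are hypothetical stand-ins, not the actual API of <gh-file:vllm/transformers_utils/chat_templates/registry.py>.

```python
# Illustrative sketch of the documented lookup order; not vLLM's real code.
from typing import Optional


def pick_chat_template(
    tokenizer_template: Optional[str],  # from chat_template.json / tokenizer_config.json
    fallback_template: Optional[str],   # built-in fallback, if one is registered
) -> str:
    if tokenizer_template is not None:  # 1. the model's own default template
        return tokenizer_template
    if fallback_template is not None:   # 2. built-in fallback from the registry
        return fallback_template
    # 3. no template anywhere: an error asks the user for --chat-template
    raise ValueError(
        "No chat template found; provide one via the --chat-template argument")
```

Models such as Fuyu or PaliGemma, which ship no template of their own, now hit the second branch of this sketch instead of the error (see the new fallback test below).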

examples/template_florence2.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

examples/template_paligemma.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

examples/template_qwen_vl.jinja

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/entrypoints/openai/test_chat_template.py

Lines changed: 18 additions & 2 deletions

@@ -2,11 +2,13 @@

 import pytest

+from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                          load_chat_template)
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer

+from ...models.registry import HF_EXAMPLE_MODELS
 from ...utils import VLLM_PATH

 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"

@@ -91,8 +93,22 @@ def test_no_load_chat_template_literallike():
                          MODEL_TEMPLATE_GENERATON_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
                         continue_final_message, expected_output):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )
+
     # Initialize the tokenizer
-    tokenizer = get_tokenizer(tokenizer_name=model)
+    tokenizer = get_tokenizer(
+        tokenizer_name=model_config.tokenizer,
+        trust_remote_code=model_config.trust_remote_code,
+    )
     template_content = load_chat_template(chat_template=template)

     # Create a mock request object using keyword arguments

@@ -106,8 +122,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt,

     # Call the function and get the result
     result = apply_hf_chat_template(
+        model_config,
         tokenizer,
-        trust_remote_code=True,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
         tools=None,
tests/entrypoints/test_chat_utils.py

Lines changed: 96 additions & 19 deletions

@@ -4,8 +4,6 @@
 from typing import Optional

 import pytest
-from packaging.version import Version
-from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig

@@ -19,6 +17,7 @@
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup

+from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import VLLM_PATH

 EXAMPLES_DIR = VLLM_PATH / "examples"

@@ -772,6 +771,7 @@ def get_conversation(is_hf: bool):
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

@@ -793,8 +793,8 @@ def get_conversation(is_hf: bool):
     )

     vllm_result = apply_hf_chat_template(
+        model_config,
         tokenizer,
-        trust_remote_code=model_config.trust_remote_code,
         conversation=conversation,
         chat_template=None,
         tools=None,

@@ -813,13 +813,24 @@ def get_conversation(is_hf: bool):
 @pytest.mark.parametrize("use_tools", [True, False])
 def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     """checks that chat_template is a dict type for HF models."""
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )

     # Build the tokenizer group and grab the underlying tokenizer
     tokenizer_group = TokenizerGroup(
         model,
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

@@ -834,10 +845,10 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):

     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
+        model_config,
         tokenizer,
         chat_template=None,
         tools=tools,
-        trust_remote_code=True,
     )
     assert isinstance(chat_template, str)

@@ -857,24 +868,32 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
 )
 # yapf: enable
 def test_resolve_content_format_hf_defined(model, expected_format):
-    if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
-            "4.49.0"):
-        pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )

     tokenizer_group = TokenizerGroup(
         model,
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     tokenizer = tokenizer_group.tokenizer

     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
+        model_config,
         tokenizer,
         chat_template=None,
         tools=None,
-        trust_remote_code=True,
     )
     assert isinstance(chat_template, str)

@@ -884,11 +903,70 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     print(_try_extract_ast(chat_template))

     resolved_format = resolve_chat_template_content_format(
+        model_config,
+        None,  # Test detecting the tokenizer's chat_template
+        None,
+        "auto",
+        tokenizer,
+    )
+
+    assert resolved_format == expected_format
+
+
+# yapf: disable
+@pytest.mark.parametrize(
+    ("model", "expected_format"),
+    [("Salesforce/blip2-opt-2.7b", "string"),
+     ("facebook/chameleon-7b", "string"),
+     ("deepseek-ai/deepseek-vl2-tiny", "string"),
+     ("microsoft/Florence-2-base", "string"),
+     ("adept/fuyu-8b", "string"),
+     ("google/paligemma-3b-mix-224", "string"),
+     ("Qwen/Qwen-VL", "string"),
+     ("Qwen/Qwen-VL-Chat", "string")],
+)
+# yapf: enable
+def test_resolve_content_format_fallbacks(model, expected_format):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+
+    model_config = ModelConfig(
+        model,
+        tokenizer=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        hf_overrides=model_info.hf_overrides,
+    )
+
+    tokenizer_group = TokenizerGroup(
+        model_config.tokenizer,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
+    )
+    tokenizer = tokenizer_group.tokenizer
+
+    # Test detecting the tokenizer's chat_template
+    chat_template = resolve_hf_chat_template(
+        model_config,
+        tokenizer,
+        chat_template=None,
+        tools=None,
+    )
+    assert isinstance(chat_template, str)
+
+    print("[TEXT]")
+    print(chat_template)
+    print("[AST]")
+    print(_try_extract_ast(chat_template))
+
+    resolved_format = resolve_chat_template_content_format(
+        model_config,
         None,  # Test detecting the tokenizer's chat_template
         None,
         "auto",
         tokenizer,
-        trust_remote_code=True,
     )

     assert resolved_format == expected_format

@@ -899,22 +977,14 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     ("template_path", "expected_format"),
     [("template_alpaca.jinja", "string"),
      ("template_baichuan.jinja", "string"),
-     ("template_blip2.jinja", "string"),
-     ("template_chameleon.jinja", "string"),
      ("template_chatglm.jinja", "string"),
      ("template_chatglm2.jinja", "string"),
      ("template_chatml.jinja", "string"),
-     ("template_deepseek_vl2.jinja", "string"),
      ("template_dse_qwen2_vl.jinja", "openai"),
      ("template_falcon_180b.jinja", "string"),
      ("template_falcon.jinja", "string"),
-     ("template_florence2.jinja", "string"),
-     ("template_fuyu.jinja", "string"),
      ("template_inkbot.jinja", "string"),
-     ("template_paligemma.jinja", "string"),
      ("template_teleflm.jinja", "string"),
-     ("template_qwen_vl.jinja", "string"),
-     ("template_qwen_vl_chat.jinja", "string"),
      ("template_vlm2vec.jinja", "openai"),
      ("tool_chat_template_granite_20b_fc.jinja", "string"),
      ("tool_chat_template_hermes.jinja", "string"),

@@ -926,11 +996,18 @@ def test_resolve_content_format_hf_defined(model, expected_format):
 )
 # yapf: enable
 def test_resolve_content_format_examples(template_path, expected_format):
+    model_config = ModelConfig(
+        PHI3V_MODEL_ID,  # Dummy
+        tokenizer=PHI3V_MODEL_ID,  # Dummy
+        trust_remote_code=True,
+    )
+
     tokenizer_group = TokenizerGroup(
-        PHI3V_MODEL_ID,
+        PHI3V_MODEL_ID,  # Dummy
         enable_lora=False,
         max_num_seqs=5,
         max_input_length=None,
+        trust_remote_code=model_config.trust_remote_code,
     )
     dummy_tokenizer = tokenizer_group.tokenizer
     dummy_tokenizer.chat_template = None

@@ -944,11 +1021,11 @@ def test_resolve_content_format_examples(template_path, expected_format):
     print(_try_extract_ast(chat_template))

     resolved_format = resolve_chat_template_content_format(
+        model_config,
         chat_template,
         None,
         "auto",
         dummy_tokenizer,
-        trust_remote_code=True,
     )

     assert resolved_format == expected_format
tests/models/registry.py

Lines changed: 4 additions & 2 deletions

@@ -182,7 +182,9 @@ def check_available_online(
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
                                         extras={"tiny": "ai21labs/Jamba-tiny-dev"}),  # noqa: E501
-    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
+    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct",
+                                        extras={"guard": "meta-llama/Llama-Guard-3-1B",  # noqa: E501
+                                                "hermes": "NousResearch/Hermes-3-Llama-3.1-8B"}),  # noqa: E501
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),

@@ -378,7 +380,7 @@ def check_available_online(
     # Therefore, we borrow the BartTokenizer from the original Bart model
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
                                                          tokenizer="Isotr0py/Florence-2-tokenizer",
-                                                         trust_remote_code=True),  # noqa: E501
+                                                         trust_remote_code=True,),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 }