diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index 48ede50e98f7..f18fbb0a9c71 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -122,10 +122,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
 
     # Call the function and get the result
     result = apply_hf_chat_template(
-        model_config,
-        tokenizer,
+        tokenizer=tokenizer,
         conversation=mock_request.messages,
         chat_template=mock_request.chat_template or template_content,
+        model_config=model_config,
         tools=None,
         add_generation_prompt=mock_request.add_generation_prompt,
         continue_final_message=mock_request.continue_final_message,
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index bcb25ed99062..43ad091971a7 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -793,10 +793,10 @@ def get_conversation(is_hf: bool):
     )
 
     vllm_result = apply_hf_chat_template(
-        model_config,
-        tokenizer,
+        tokenizer=tokenizer,
         conversation=conversation,
         chat_template=None,
+        model_config=model_config,
         tools=None,
         add_generation_prompt=True,
     )
@@ -903,11 +903,11 @@ def test_resolve_content_format_hf_defined(model, expected_format):
     print(_try_extract_ast(chat_template))
 
     resolved_format = resolve_chat_template_content_format(
-        model_config,
         None,  # Test detecting the tokenizer's chat_template
         None,
         "auto",
         tokenizer,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
@@ -962,11 +962,11 @@ def test_resolve_content_format_fallbacks(model, expected_format):
     print(_try_extract_ast(chat_template))
 
     resolved_format = resolve_chat_template_content_format(
-        model_config,
         None,  # Test detecting the tokenizer's chat_template
         None,
         "auto",
         tokenizer,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
@@ -1021,11 +1021,11 @@ def test_resolve_content_format_examples(template_path, expected_format):
     print(_try_extract_ast(chat_template))
 
     resolved_format = resolve_chat_template_content_format(
-        model_config,
         chat_template,
         None,
         "auto",
         dummy_tokenizer,
+        model_config=model_config,
     )
 
     assert resolved_format == expected_format
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 183b5bf68311..6f5514a6f801 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -44,7 +44,7 @@
 # yapf: enable
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import random_uuid
+from vllm.utils import deprecate_kwargs, random_uuid
 
 logger = init_logger(__name__)
 
@@ -329,11 +329,17 @@ def resolve_mistral_chat_template(
        "so it will be ignored.")
    return None
 
+@deprecate_kwargs(
+    "trust_remote_code",
+    additional_message="Please use `model_config.trust_remote_code` instead.",
+)
 def resolve_hf_chat_template(
-    model_config: ModelConfig,
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
+    *,
+    model_config: ModelConfig,
+    trust_remote_code: Optional[bool] = None,
 ) -> Optional[str]:
     # 1st priority: The given chat template
     if chat_template is not None:
@@ -379,18 +385,19 @@
 
 
 def _resolve_chat_template_content_format(
-    model_config: ModelConfig,
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
     given_format: ChatTemplateContentFormatOption,
     tokenizer: AnyTokenizer,
+    *,
+    model_config: ModelConfig,
 ) -> _ChatTemplateContentFormat:
     if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
         hf_chat_template = resolve_hf_chat_template(
-            model_config,
             tokenizer,
             chat_template=chat_template,
             tools=tools,
+            model_config=model_config,
         )
     else:
         hf_chat_template = None
@@ -428,19 +435,25 @@ def _log_chat_template_content_format(
     )
 
 
+@deprecate_kwargs(
+    "trust_remote_code",
+    additional_message="Please use `model_config.trust_remote_code` instead.",
+)
 def resolve_chat_template_content_format(
-    model_config: ModelConfig,
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
     given_format: ChatTemplateContentFormatOption,
     tokenizer: AnyTokenizer,
+    *,
+    model_config: ModelConfig,
+    trust_remote_code: Optional[bool] = None,
 ) -> _ChatTemplateContentFormat:
     detected_format = _resolve_chat_template_content_format(
-        model_config,
         chat_template,
         tools,
         given_format,
         tokenizer,
+        model_config=model_config,
     )
 
     _log_chat_template_content_format(
@@ -1191,21 +1204,27 @@ def parse_chat_messages_futures(
     return conversation, mm_tracker.all_mm_data()
 
 
+@deprecate_kwargs(
+    "trust_remote_code",
+    additional_message="Please use `model_config.trust_remote_code` instead.",
+)
 def apply_hf_chat_template(
-    model_config: ModelConfig,
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     conversation: list[ConversationMessage],
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
     *,
+    model_config: ModelConfig,
     tokenize: bool = False,  # Different from HF's default
+    # Deprecated, explicitly capture here so it doesn't slip into kwargs.
+    trust_remote_code: Optional[bool] = None,
     **kwargs: Any,
 ) -> str:
     hf_chat_template = resolve_hf_chat_template(
-        model_config,
         tokenizer,
         chat_template=chat_template,
         tools=tools,
+        model_config=model_config,
     )
 
     if hf_chat_template is None:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index cebddcc8e6aa..053ee55bb6a8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -731,11 +731,11 @@ def chat(
         tokenizer = self.get_tokenizer(lora_request)
         model_config = self.llm_engine.get_model_config()
         resolved_content_format = resolve_chat_template_content_format(
-            model_config,
             chat_template,
             tools,
             chat_template_content_format,
             tokenizer,
+            model_config=model_config,
         )
 
         _chat_template_kwargs: dict[str, Any] = dict(
@@ -767,9 +767,9 @@
                 )
             else:
                 prompt_str = apply_hf_chat_template(
-                    model_config,
-                    tokenizer,
+                    tokenizer=tokenizer,
                     conversation=conversation,
+                    model_config=model_config,
                     **_chat_template_kwargs,
                 )
                 # Special tokens are already included in chat templates so
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index a954a9ff90bc..5b3df0faccf6 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -971,10 +971,10 @@ async def init_app_state(
                 chat_template=resolved_chat_template)
         else:
             hf_chat_template = resolve_hf_chat_template(
-                vllm_config.model_config,
-                tokenizer,
+                tokenizer=tokenizer,
                 chat_template=None,
                 tools=None,
+                model_config=vllm_config.model_config,
             )
 
             if hf_chat_template != resolved_chat_template:
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index f1d907f519c5..f9eebde37181 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -670,11 +670,11 @@ async def _preprocess_chat(
         model_config = self.model_config
 
         resolved_content_format = resolve_chat_template_content_format(
-            model_config,
             chat_template,
             tool_dicts,
             chat_template_content_format,
             tokenizer,
+            model_config=model_config,
         )
         conversation, mm_data_future = parse_chat_messages_futures(
             messages,
@@ -701,9 +701,9 @@
             )
         else:
             request_prompt = apply_hf_chat_template(
-                model_config,
-                tokenizer,
+                tokenizer=tokenizer,
                 conversation=conversation,
+                model_config=model_config,
                 **_chat_template_kwargs,
             )
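
For downstream callers, the change amounts to passing `model_config` as a keyword-only argument instead of the leading positional argument. Below is a minimal sketch of the updated call pattern, mirroring the test changes in this diff; the model name, the conversation literal, and the use of `LLM` to obtain the tokenizer and `ModelConfig` are illustrative assumptions, not part of the change itself.

```python
# Sketch of the new keyword-only call pattern (mirrors the updated tests).
# Assumptions for illustration: the model name is a placeholder, and the
# tokenizer/ModelConfig are obtained from an LLM instance, as llm.py does.
from vllm import LLM
from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                         resolve_chat_template_content_format)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder model
tokenizer = llm.get_tokenizer()
model_config = llm.llm_engine.get_model_config()

# `model_config` is no longer the first positional argument; pass it by keyword.
content_format = resolve_chat_template_content_format(
    None,    # chat_template: fall back to the tokenizer's own template
    None,    # tools
    "auto",  # given_format
    tokenizer,
    model_config=model_config,
)

prompt = apply_hf_chat_template(
    tokenizer=tokenizer,
    conversation=[{"role": "user", "content": "Hello!"}],
    chat_template=None,
    tools=None,
    add_generation_prompt=True,
    model_config=model_config,
)
```

The deprecated `trust_remote_code` keyword is still accepted by the decorated functions, but it only triggers a `deprecate_kwargs` warning pointing callers at `model_config.trust_remote_code`.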