diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 61e5f5eae4ef..2883c37ca236 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple):
 def run_florence2():
     engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
         max_num_seqs=8,
         trust_remote_code=True,
         limit_mm_per_prompt={"image": 1},
@@ -165,6 +165,7 @@ def main(args):
         temperature=0,
         top_p=1.0,
         max_tokens=64,
+        skip_special_tokens=False,
     )
 
     start = time.time()
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 6b533346ac31..4476009fd271 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
 
     engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
-        tokenizer="facebook/bart-large",
+        tokenizer="Isotr0py/Florence-2-tokenizer",
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
diff --git a/tests/conftest.py b/tests/conftest.py
index d272f448f61f..25e70319e2cc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -925,6 +925,7 @@ def generate_encoder_decoder_greedy_logprobs(
         max_tokens: int,
         num_logprobs: int,
         num_prompt_logprobs: Optional[int] = None,
+        skip_special_tokens: bool = True,
     ) -> Union[list[TokensTextLogprobs],
                list[TokensTextLogprobsPromptLogprobs]]:
         greedy_logprobs_params = SamplingParams(
@@ -932,6 +933,7 @@
             max_tokens=max_tokens,
             logprobs=num_logprobs,
             prompt_logprobs=(num_prompt_logprobs),
+            skip_special_tokens=skip_special_tokens,
         )
         '''
         Greedy logprobs generation for vLLM encoder/decoder models
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py
index a6ec333e2e9b..14b64393bf52 100644
--- a/tests/models/encoder_decoder/vision_language/test_florence2.py
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
@@ -13,12 +13,12 @@
 from ...utils import check_logprobs_close
 
 MODELS = ["microsoft/Florence-2-base"]
-# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
-# Therefore, we borrow the BartTokenizer from the original Bart model
-TOKENIZER = "facebook/bart-base"
+# Florence-2 model repo's tokenizer config is missing some special tokens.
+# Therefore, we use a converted tokenizer from a forked repo.
+TOKENIZER = "Isotr0py/Florence-2-tokenizer"
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
-    "<CAPTION>",  # special task token
+    "<CAPTION>",  # special task token which will output special tokens
     "cherry_blossom":
     "Describe in detail what is shown in the image.",
@@ -45,7 +45,6 @@ def hf_to_vllm_output(hf_output: tuple[list[int], str,
     output_ids, output_str, out_logprobs = hf_output
 
     output_str = output_str.replace("</s>", "").replace("<s>", "")
-    output_ids = [ids for ids in output_ids if ids not in [0, 2]]
 
     return output_ids, output_str, out_logprobs
@@ -71,8 +70,11 @@ def run_test(
                      enforce_eager=True) as vllm_model:
         vllm_outputs_per_case = [
             vllm_model.generate_encoder_decoder_greedy_logprobs(
-                prompts, max_tokens, num_logprobs=num_logprobs)
-            for prompts in inputs
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                skip_special_tokens=False,
+            ) for prompts in inputs
         ]
 
     hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
@@ -93,6 +95,7 @@ def run_test(
         outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
+        num_outputs_0_skip_tokens=1,
     )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 1599b1da07ca..8d50644a8652 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -366,7 +366,7 @@ def check_available_online(
-    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
-    # Therefore, we borrow the BartTokenizer from the original Bart model
+    # Florence-2 model repo's tokenizer config is missing some special tokens,
+    # so we use a converted tokenizer from a forked repo
     "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base",  # noqa: E501
-                                                         tokenizer="facebook/bart-base",
+                                                         tokenizer="Isotr0py/Florence-2-tokenizer",
                                                          trust_remote_code=True),  # noqa: E501
 "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
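
Note: below is a minimal sketch of how the updated example configuration would be exercised, assuming vLLM's offline LLM/SamplingParams API and the vllm.assets.image.ImageAsset test helper; the "<CAPTION>" prompt and the dict-style multimodal prompt are illustrative, not taken from this patch.

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

llm = LLM(
    model="microsoft/Florence-2-large",
    tokenizer="Isotr0py/Florence-2-tokenizer",  # converted tokenizer used above
    max_num_seqs=8,
    trust_remote_code=True,
    limit_mm_per_prompt={"image": 1},
)

params = SamplingParams(
    temperature=0,
    top_p=1.0,
    max_tokens=64,
    # Florence-2 emits its task markers as special tokens, so decoding
    # has to keep them rather than strip them.
    skip_special_tokens=False,
)

image = ImageAsset("stop_sign").pil_image
outputs = llm.generate(
    {"prompt": "<CAPTION>", "multi_modal_data": {"image": image}},
    params,
)
print(outputs[0].outputs[0].text)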
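
Note: the test-side counterpart, sketching what hf_to_vllm_output reduces to after this change; this mirrors the hunk above, with the wrapper's type annotations simplified. Only the decoded string is sanitized now; token ids are kept so they line up one-to-one with vLLM ids decoded under skip_special_tokens=False.

def hf_to_vllm_output(hf_output):
    output_ids, output_str, out_logprobs = hf_output
    # Strip the BART-style BOS/EOS markers from the text only; the ids stay,
    # since vLLM now also reports special-token ids.
    output_str = output_str.replace("</s>", "").replace("<s>", "")
    return output_ids, output_str, out_logprobs

The one remaining offset, HF's leading decoder-start token, is absorbed by num_outputs_0_skip_tokens=1 in check_logprobs_close.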