30 changes: 17 additions & 13 deletions vllm/engine/llm_engine.py
@@ -2046,27 +2046,31 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
-        if prompt_type == "encoder" and self.tokenizer is not None:
-            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
-            model_config = self.model_config
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
 
-            if model_config.is_multimodal_model:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
                 assert isinstance(mm_processor, EncDecMultiModalProcessor)
 
                 if mm_processor.pad_dummy_encoder_prompt:
                     return  # Skip encoder length check for Whisper
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-
-        if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
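Net effect of the llm_engine.py hunk: an empty encoder prompt is tolerated for multimodal models (Mllama produces no encoder tokens for text-only requests), and the encoder length check is skipped when the processor pads a dummy encoder prompt (Whisper). A minimal, self-contained sketch of that control flow, with the engine plumbing replaced by plain parameters; the function and parameter names below are illustrative, not vLLM APIs:

    from typing import Literal, Sequence

    def validate_prompt(
        prompt_token_ids: Sequence[int],
        *,
        prompt_type: Literal["encoder", "decoder"],
        is_multimodal_model: bool,
        max_model_len: int,
        pad_dummy_encoder_prompt: bool = False,
    ) -> None:
        # Empty prompts are only tolerated on the encoder side of a
        # multimodal model (text-only Mllama requests).
        if not prompt_token_ids:
            if prompt_type == "encoder" and is_multimodal_model:
                pass
            else:
                raise ValueError(f"The {prompt_type} prompt cannot be empty")

        if len(prompt_token_ids) >= max_model_len:
            # Whisper-style models pad a dummy encoder prompt, so their
            # encoder length is not checked against max_model_len.
            if (prompt_type == "encoder" and is_multimodal_model
                    and pad_dummy_encoder_prompt):
                return
            raise ValueError(
                f"The {prompt_type} prompt ({len(prompt_token_ids)} tokens) "
                f"does not fit in max_model_len={max_model_len}")

    # Text-only Mllama request: the empty encoder prompt now passes.
    validate_prompt([], prompt_type="encoder", is_multimodal_model=True,
                    max_model_len=4096)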
5 changes: 4 additions & 1 deletion vllm/model_executor/models/mllama.py
@@ -211,6 +211,9 @@ def apply(
         # }
 
         if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
             # Since only the last group of consecutive images
             # are attended by the decoded tokens, we only need to
             # get the number of tokens for those images.
@@ -227,7 +230,7 @@
             num_tokens = decode_tiles * token_per_chunk
             mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                      ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens
 
         return mm_inputs
 
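The mllama.py hunk stops hardcoding the "<|image|>" literal and instead reuses whatever placeholder string the HF processor reports, keeping encoder_prompt consistent with encoder_prompt_token_ids. A rough sketch of the idea; FakeHfProcessor and the token id below are stand-ins for illustration, not the real HF classes or vocabulary:

    from dataclasses import dataclass

    @dataclass
    class FakeHfProcessor:
        # Stand-in for the HF Mllama processor, which exposes its image
        # placeholder string as `image_token` ("<|image|>").
        image_token: str = "<|image|>"

    def build_encoder_inputs(hf_processor: FakeHfProcessor,
                             image_token_id: int, num_tokens: int) -> dict:
        # Derive the placeholder from the processor instead of a hardcoded
        # literal, so both encoder fields stay in sync.
        image_token = hf_processor.image_token
        return {
            "encoder_prompt_token_ids": [image_token_id] * num_tokens,
            "encoder_prompt": image_token * num_tokens,
        }

    inputs = build_encoder_inputs(
        FakeHfProcessor(),
        image_token_id=128256,  # illustrative id, not from the real vocab
        num_tokens=3,
    )
    assert inputs["encoder_prompt"] == "<|image|><|image|><|image|>"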
34 changes: 18 additions & 16 deletions vllm/v1/engine/processor.py
@@ -315,32 +315,34 @@ def _validate_model_input(
         *,
         prompt_type: Literal["encoder", "decoder"],
     ):
+        model_config = self.model_config
         tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
 
-        if prompt_type == "encoder":
-            model_config = self.model_config
-
-            if model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config, tokenizer=tokenizer)
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
         prompt_ids = prompt_inputs["prompt_token_ids"]
-
         if not prompt_ids:
-            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        max_input_id = max(prompt_ids)
+        max_input_id = max(prompt_ids, default=0)
         if max_input_id > tokenizer.max_token_id:
             raise ValueError(f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) >= max_prompt_len:
-            if self.model_config.is_multimodal_model:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer,
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
                 suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
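A subtle knock-on fix in the processor.py hunk: once an empty encoder prompt may reach this point, the vocabulary check can no longer call bare max() on the token list, since max() raises on an empty sequence. Passing default=0 keeps the check well-defined. A plain-Python illustration, independent of vLLM:

    prompt_ids: list[int] = []  # text-only Mllama request: empty encoder prompt

    try:
        max(prompt_ids)  # old behaviour: raises before validation finishes
    except ValueError as exc:
        print(f"bare max() on an empty prompt raises: {exc}")

    max_input_id = max(prompt_ids, default=0)  # new behaviour
    assert max_input_id == 0  # harmlessly below any tokenizer.max_token_id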