@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+                    Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload

@@ -30,7 +30,7 @@
     get_logits_processors as get_openai_logits_processors)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
-                         PromptType)
+                         PromptType, SingletonInputs)
 from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -40,6 +40,7 @@
     get_local_guided_decoding_logits_processor)
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.outputs import (PoolingRequestOutput, RequestOutput,
                           RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
@@ -2029,29 +2030,57 @@ def _validate_model_inputs(self, inputs: ProcessorInputs,
                                lora_request: Optional[LoRARequest]):
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

-        # For encoder-decoder multimodal models, the max_prompt_len
-        # restricts the decoder prompt length
-        if self.model_config.is_multimodal_model:
-            prompt_inputs = decoder_inputs
-        else:
-            prompt_inputs = encoder_inputs or decoder_inputs
+        if encoder_inputs is not None:
+            self._validate_model_input(encoder_inputs,
+                                       lora_request,
+                                       prompt_type="encoder")

-        prompt_ids = prompt_inputs["prompt_token_ids"]
+        self._validate_model_input(decoder_inputs,
+                                   lora_request,
+                                   prompt_type="decoder")

-        if prompt_ids is None or len(prompt_ids) == 0:
-            raise ValueError("Prompt cannot be empty")
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        lora_request: Optional[LoRARequest],
+        *,
+        prompt_type: Literal["encoder", "decoder"],
+    ):
+        if prompt_type == "encoder" and self.tokenizer is not None:
+            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
+            model_config = self.model_config

-        if self.model_config.is_multimodal_model:
-            max_prompt_len = self.model_config.max_model_len
+            if model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config, tokenizer=tokenizer)
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)

-            if len(prompt_ids) > max_prompt_len:
-                raise ValueError(
-                    f"The prompt (total length {len(prompt_ids)}) is too long "
-                    f"to fit into the model (context length {max_prompt_len}). "
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+
+        if not prompt_ids:
+            raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if self.model_config.is_multimodal_model:
+                suggestion = (
                     "Make sure that `max_model_len` is no smaller than the "
                     "number of text tokens plus multimodal tokens. For image "
                     "inputs, the number of image tokens depends on the number "
                     "of images, and possibly their aspect ratios as well.")
+            else:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens.")
+
+            raise ValueError(
+                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"longer than the maximum model length of {max_prompt_len}. "
+                f"{suggestion}")

         # TODO: Find out how many placeholder tokens are there so we can
         # check that chunked prefill does not truncate them
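For reference, here is a minimal sketch (not part of this diff) of how the new per-prompt-type validation surfaces through the public LLMEngine API. The model name, max_model_len value, and prompt are illustrative assumptions; for a decoder-only model, split_enc_dec_inputs() yields no encoder inputs, so only the decoder prompt is checked and the error message names it accordingly.

# Minimal usage sketch (illustrative, not part of this PR): trigger the new
# per-prompt-type length check through the public LLMEngine API.
from vllm import EngineArgs, LLMEngine, SamplingParams

# Assumed, illustrative settings: a small decoder-only model with a tiny
# context window so that an ordinary prompt exceeds max_model_len.
engine = LLMEngine.from_engine_args(
    EngineArgs(model="facebook/opt-125m", max_model_len=32))

try:
    # The prompt tokenizes to well over 32 tokens, so validation fails.
    engine.add_request("req-0", "tell me a story " * 20, SamplingParams())
except ValueError as err:
    # Expected message shape (decoder-only model, so prompt_type="decoder"):
    # "The decoder prompt (length N) is longer than the maximum model length
    #  of 32. Make sure that `max_model_len` is no smaller than the number
    #  of text tokens."
    print(err)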