diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
index 76b2fb95a5009..21bd938be9e89 100644
--- a/docs/source/api/multimodal/inputs.md
+++ b/docs/source/api/multimodal/inputs.md
@@ -43,7 +43,7 @@
 ```

 ```{eval-rst}
-.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
     :members:
     :show-inheritance:
 ```
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index b8163a7acde1d..57e85779dd587 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -9,7 +9,7 @@
 if TYPE_CHECKING:
     from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
                                  MultiModalPlaceholderDict)
-    from vllm.multimodal.inputs import MultiModalInputsV2
+    from vllm.multimodal.inputs import MultiModalInputs


 class TextPrompt(TypedDict):
@@ -207,7 +207,7 @@ def token_inputs(
     return inputs


-DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"]
+DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 The inputs in :class:`~vllm.LLMEngine` before they are passed to
 the model executor.
@@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict):
     This specifies the required data for encoder-decoder models.
     """

-    encoder: Union[TokenInputs, "MultiModalInputsV2"]
+    encoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the encoder portion."""

-    decoder: Union[TokenInputs, "MultiModalInputsV2"]
+    decoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the decoder portion."""


-SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"]
+SingletonInputs = Union[TokenInputs, "MultiModalInputs"]
 """
 A processed :class:`SingletonPrompt` which can be passed to
 :class:`vllm.sequence.Sequence`.
@@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]:
             return inputs.get("multi_modal_hashes", [])

         if inputs["type"] == "multimodal":
-            # only the case when we use MultiModalInputsV2
+            # only the case when we use MultiModalInputs
            return inputs.get("mm_hashes", [])  # type: ignore[return-value]

         assert_never(inputs)  # type: ignore[arg-type]
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 0890883cc984f..70372e0cad22d 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

@@ -247,7 +247,7 @@ def _process_multimodal(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
         returning the corresponding token IDs and metadata.
@@ -271,7 +271,7 @@ async def _process_multimodal_async(
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         tokenizer_group = self.get_tokenizer_group()
         tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 917b88e802071..f5c796b1acae6 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -15,7 +15,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -490,7 +490,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

         # Only <image> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index a6634204699c9..e2207865a693d 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -29,7 +29,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -159,7 +159,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

         # Only <image> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 63e7147f84e03..3f16d3ccbd061 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -31,7 +31,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
@@ -232,7 +232,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

         # Only |SPEAKER| (image) tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 6cceded43a79d..a355ae494afd0 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -24,7 +24,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -746,7 +746,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index

@@ -805,7 +805,7 @@ def get_replacement_mantis(item_idx: int):
             for modality, placeholders in mm_placeholders.items()
         }

-        return MultiModalInputsV2(
+        return MultiModalInputs(
             type="multimodal",
             prompt=prompt,
             prompt_token_ids=prompt_ids,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7a230e5beb367..dd3b0b35c9294 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -31,7 +31,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
@@ -484,7 +484,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

         # Only <|image|> tokens should be considered as placeholders,
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 47d56175261e4..9cb8f83ad7873 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -37,7 +37,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputsV2, MultiModalKwargs,
+                                    MultiModalInputs, MultiModalKwargs,
                                     NestedTensors, PlaceholderRange)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
@@ -245,7 +245,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

         # Only <|AUDIO|> tokens should be considered as placeholders,
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 4b63703585214..b35184f6855ab 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
         """


-class MultiModalInputsV2(TypedDict):
+class MultiModalInputs(TypedDict):
     """
     Represents the outputs of
     :class:`vllm.multimodal.processing.BaseMultiModalProcessor`,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index fa199a07b4cf8..ff02bcc8e1f2d 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -18,8 +18,8 @@

 from .hasher import MultiModalHasher
 from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                     MultiModalInputsV2, MultiModalKwargs,
-                     MultiModalKwargsItem, PlaceholderRange)
+                     MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem,
+                     PlaceholderRange)
 from .parse import MultiModalDataItems, MultiModalDataParser

 if TYPE_CHECKING:
@@ -609,7 +609,7 @@ def __call__(
         prompt: str,
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs)

     def _get_data_parser(self) -> MultiModalDataParser:
@@ -1067,7 +1067,7 @@ def apply(
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.

@@ -1169,7 +1169,7 @@ def apply(
             for modality, placeholders in mm_placeholders.items()
         }

-        return MultiModalInputsV2(
+        return MultiModalInputs(
             type="multimodal",
             prompt=prompt,
             prompt_token_ids=prompt_ids,
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index ec580cd6ecddd..20da0f1d8316f 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -11,7 +11,7 @@
 from vllm.inputs import DummyData
 from vllm.logger import init_logger

-from .inputs import MultiModalDataDict, MultiModalInputsV2
+from .inputs import MultiModalDataDict, MultiModalInputs
 from .processing import BaseMultiModalProcessor, BaseProcessingInfo

 logger = init_logger(__name__)
@@ -131,7 +131,7 @@ def _get_dummy_mm_inputs(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> MultiModalInputsV2:
+    ) -> MultiModalInputs:
         factory = self.dummy_inputs
         processor_inputs = factory.get_dummy_processor_inputs(
             seq_len, mm_counts)
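
For orientation, here is a hedged sketch (not part of the diff) of how downstream code might consume the renamed MultiModalInputs TypedDict after this change. The helper function is hypothetical and only reads keys that the hunks above touch (prompt_token_ids, mm_hashes, mm_placeholders); any other fields of the TypedDict are deliberately left out.

# Hypothetical consumer of the renamed TypedDict -- illustrative only, not
# vLLM source. Only keys visible in this diff are accessed.
from vllm.multimodal.inputs import MultiModalInputs  # formerly MultiModalInputsV2


def summarize_mm_inputs(inputs: MultiModalInputs) -> str:
    # mm_placeholders maps each modality to its placeholder ranges, as seen in
    # the apply() overrides above.
    num_placeholders = sum(
        len(ranges) for ranges in inputs["mm_placeholders"].values())
    # mm_hashes may be absent or None when hashing is disabled (an assumption
    # here); .get() keeps the sketch safe either way, mirroring the
    # inputs.get("mm_hashes", []) access in vllm/inputs/data.py.
    num_hashes = len(inputs.get("mm_hashes") or [])
    return (f"{len(inputs['prompt_token_ids'])} prompt tokens, "
            f"{num_placeholders} placeholder ranges, {num_hashes} mm hashes")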