@@ -989,6 +989,18 @@ def get_mm_max_tokens_per_item(
989989[`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
990990"""
991991
# Type alias: prompt updates grouped per modality key (e.g. "image", "video"),
# mirroring the per-modality grouping used by MultiModalKwargsItems.
MultiModalPromptUpdates = dict[str, Sequence[BoundPromptUpdate]]
"""
A collection of prompt updates with a similar structure as
[`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
"""
997+
998+
class MultiModalProcessingInfo(NamedTuple):
    """
    Immutable bundle of the outputs produced by applying the HF processor
    to the multimodal data of a prompt, returned alongside the prompt
    token IDs.
    """

    # Processed multimodal keyword-argument items.
    kwargs: MultiModalKwargsItems
    # Hashes of the multimodal items; None when hashing was not requested
    # (return_mm_hashes=False).
    hashes: Optional[MultiModalHashes]
    # Prompt updates after binding/grouping, keyed by modality —
    # presumably the same modality keys as `kwargs`; confirm against callers.
    prompt_updates: MultiModalPromptUpdates
1003+
9921004
9931005class BaseMultiModalProcessor (ABC , Generic [_I ]):
9941006 """
@@ -1363,7 +1375,7 @@ def _merge_mm_kwargs(
13631375 cache : ProcessingCache ,
13641376 mm_cache_items_or_hashes : dict [str , list [_CacheItemOrHash ]],
13651377 mm_missing_kwargs : MultiModalKwargsItems ,
1366- ) -> dict [ str , list [ MultiModalKwargsItem ]] :
1378+ ) -> MultiModalKwargsItems :
13671379 mm_missing_next_idx = defaultdict [str , int ](lambda : 0 )
13681380
13691381 merged_items = defaultdict [str , list [MultiModalKwargsItem ]](list )
@@ -1379,7 +1391,7 @@ def _merge_mm_kwargs(
13791391
13801392 merged_items [modality ].append (kw_item )
13811393
1382- return dict (merged_items )
1394+ return MultiModalKwargsItems (merged_items )
13831395
13841396 def _apply_hf_processor (
13851397 self ,
@@ -1389,8 +1401,7 @@ def _apply_hf_processor(
13891401 tokenization_kwargs : Mapping [str , object ],
13901402 * ,
13911403 return_mm_hashes : bool ,
1392- ) -> tuple [list [int ], MultiModalKwargsItems , Optional [MultiModalHashes ],
1393- bool ]:
1404+ ) -> tuple [list [int ], MultiModalProcessingInfo , bool ]:
13941405 (
13951406 prompt_ids ,
13961407 mm_processed_data ,
@@ -1413,7 +1424,21 @@ def _apply_hf_processor(
14131424 tokenization_kwargs )
14141425 if return_mm_hashes else None )
14151426
1416- return prompt_ids , mm_kwargs , mm_hashes , is_update_applied
1427+ unbound_prompt_updates = self ._get_prompt_updates (
1428+ mm_data_items ,
1429+ hf_processor_mm_kwargs ,
1430+ mm_kwargs ,
1431+ )
1432+ mm_prompt_updates = self ._bind_and_group_updates (
1433+ unbound_prompt_updates )
1434+
1435+ mm_info = MultiModalProcessingInfo (
1436+ kwargs = mm_kwargs ,
1437+ hashes = mm_hashes ,
1438+ prompt_updates = mm_prompt_updates ,
1439+ )
1440+
1441+ return prompt_ids , mm_info , is_update_applied
14171442
14181443 def _cached_apply_hf_processor (
14191444 self ,
@@ -1423,8 +1448,7 @@ def _cached_apply_hf_processor(
14231448 tokenization_kwargs : Mapping [str , object ],
14241449 * ,
14251450 return_mm_hashes : bool ,
1426- ) -> tuple [list [int ], MultiModalKwargsItems , Optional [MultiModalHashes ],
1427- bool ]:
1451+ ) -> tuple [list [int ], MultiModalProcessingInfo , bool ]:
14281452 """
14291453 Apply the HF processor on the full prompt text,
14301454 caching the results and reusing cached results.
@@ -1475,18 +1499,27 @@ def _cached_apply_hf_processor(
14751499 hf_processor_mm_kwargs ),
14761500 )
14771501
1478- mm_cache_items_merged = self ._merge_mm_kwargs (
1502+ mm_kwargs = self ._merge_mm_kwargs (
14791503 cache ,
14801504 mm_cache_items_or_hashes = mm_cache_items_or_hashes ,
14811505 mm_missing_kwargs = mm_missing_kwargs ,
14821506 )
14831507
1484- mm_kwargs = MultiModalKwargsItems .from_seq ([
1485- item for cache_items in mm_cache_items_merged .values ()
1486- for item in cache_items
1487- ])
1508+ unbound_prompt_updates = self ._get_prompt_updates (
1509+ mm_data_items ,
1510+ hf_processor_mm_kwargs ,
1511+ mm_kwargs ,
1512+ )
1513+ mm_prompt_updates = self ._bind_and_group_updates (
1514+ unbound_prompt_updates )
1515+
1516+ mm_info = MultiModalProcessingInfo (
1517+ kwargs = mm_kwargs ,
1518+ hashes = mm_hashes_to_return ,
1519+ prompt_updates = mm_prompt_updates ,
1520+ )
14881521
1489- return prompt_ids , mm_kwargs , mm_hashes_to_return , is_update_applied
1522+ return prompt_ids , mm_info , is_update_applied
14901523
14911524 def _bind_and_group_updates (
14921525 self ,
@@ -1626,19 +1659,11 @@ def _validate_mm_placeholders(
16261659 def _maybe_apply_prompt_updates (
16271660 self ,
16281661 mm_items : MultiModalDataItems ,
1629- hf_processor_mm_kwargs : Mapping [str , object ],
16301662 prompt_ids : list [int ],
16311663 mm_kwargs : MultiModalKwargsItems ,
1664+ mm_prompt_updates : MultiModalPromptUpdates ,
16321665 is_update_applied : bool ,
16331666 ) -> tuple [list [int ], str , Mapping [str , list [PlaceholderFeaturesInfo ]]]:
1634- unbound_prompt_updates = self ._get_prompt_updates (
1635- mm_items ,
1636- hf_processor_mm_kwargs ,
1637- mm_kwargs ,
1638- )
1639- mm_prompt_updates = self ._bind_and_group_updates (
1640- unbound_prompt_updates )
1641-
16421667 mm_item_counts = mm_items .get_all_counts ()
16431668 self ._validate_mm_kwargs (mm_kwargs , mm_item_counts )
16441669
@@ -1694,8 +1719,7 @@ def apply(
16941719
16951720 (
16961721 prompt_ids ,
1697- mm_kwargs ,
1698- mm_hashes ,
1722+ mm_info ,
16991723 is_update_applied ,
17001724 ) = self ._cached_apply_hf_processor (
17011725 prompt ,
@@ -1708,9 +1732,9 @@ def apply(
17081732 # NOTE: tokenization_kwargs are not required to init processor
17091733 prompt_ids , prompt , mm_placeholders = self ._maybe_apply_prompt_updates (
17101734 mm_items = mm_items ,
1711- hf_processor_mm_kwargs = hf_processor_mm_kwargs ,
17121735 prompt_ids = prompt_ids ,
1713- mm_kwargs = mm_kwargs ,
1736+ mm_kwargs = mm_info .kwargs ,
1737+ mm_prompt_updates = mm_info .prompt_updates ,
17141738 is_update_applied = is_update_applied ,
17151739 )
17161740
@@ -1723,8 +1747,8 @@ def apply(
17231747 type = "multimodal" ,
17241748 prompt = prompt ,
17251749 prompt_token_ids = prompt_ids ,
1726- mm_kwargs = mm_kwargs ,
1727- mm_hashes = mm_hashes ,
1750+ mm_kwargs = mm_info . kwargs ,
1751+ mm_hashes = mm_info . hashes ,
17281752 mm_placeholders = mm_placeholder_ranges ,
17291753 )
17301754
0 commit comments