vllm-project · DarkLight1337 · Apr 19, 2025 · Apr 19, 2025
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
         tokenizer_mode="mistral" if args.format == "mistral" else "auto",
         config_format="mistral" if args.format == "mistral" else "auto",
         load_format="mistral" if args.format == "mistral" else "auto",
+        limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
         tensor_parallel_size=2,

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
@@ -957,7 +957,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
             "max_pixels": 1280 * 28 * 28,
             "fps": [1],
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     if modality == "image":

diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
@@ -503,26 +503,6 @@ def get_feature_extractor(self) -> SequenceFeatureExtractor:
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None, "image": None}
 
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image": self.get_max_image_tokens(),
-            "audio": self.get_max_audio_tokens(),
-        }
-
-    def get_max_audio_tokens(self) -> int:
-        sr = self.get_feature_extractor().sampling_rate
-        num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
-        return self._compute_audio_embed_size(num_frames)
-
-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-        return self.get_num_image_tokens(image_width=target_width,
-                                         image_height=target_height)
-
     def _find_target_aspect_ratio(
         self,
         orig_width: int,
@@ -764,9 +744,6 @@ def get_dummy_mm_data(
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
 

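diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -172,26 +172,9 @@ def get_feature_extractor(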
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_max_audio_tokens(self) -> int:
-        hf_config = self.get_hf_config()
-        max_source_position = hf_config.audio_config.max_source_positions
-        output_lengths = (max_source_position - 2) // 2 + 1
-        return output_lengths
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None, "image": None, "video": None}
 
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "audio": self.get_max_audio_tokens(),
-            "image": self.get_max_image_tokens(),
-            "video": self.get_max_video_tokens(seq_len, mm_counts),
-        }
-
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
         BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
@@ -210,7 +193,6 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return (audio_token * num_audios + image_token * num_images +
                 video_token * num_videos)
 
-    # TODO: @abstractmethod after transition
     def get_dummy_mm_data(
         self,
         seq_len: int,