diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index af1831bd36d1..37c3181dc5fa 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
         tokenizer_mode="mistral" if args.format == "mistral" else "auto",
         config_format="mistral" if args.format == "mistral" else "auto",
         load_format="mistral" if args.format == "mistral" else "auto",
+        limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
         tensor_parallel_size=2,
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index bd7035b7615a..d02ac17cfdd6 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -957,7 +957,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
             "max_pixels": 1280 * 28 * 28,
             "fps": [1],
         },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
     )
 
     if modality == "image":
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index cdd762f5fec3..03ca143f9c08 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -503,26 +503,6 @@ def get_feature_extractor(self) -> SequenceFeatureExtractor:
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None, "image": None}
 
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image": self.get_max_image_tokens(),
-            "audio": self.get_max_audio_tokens(),
-        }
-
-    def get_max_audio_tokens(self) -> int:
-        sr = self.get_feature_extractor().sampling_rate
-        num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
-        return self._compute_audio_embed_size(num_frames)
-
-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-        return self.get_num_image_tokens(image_width=target_width,
-                                         image_height=target_height)
-
     def _find_target_aspect_ratio(
         self,
         orig_width: int,
@@ -764,9 +744,6 @@ def get_dummy_mm_data(
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
 
-        target_width, target_height = \
-            self.info.get_image_size_with_most_features()
-
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
 
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 517d6eb7d6d0..e978c7f4e217 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -172,26 +172,9 @@ def get_feature_extractor(
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_max_audio_tokens(self) -> int:
-        hf_config = self.get_hf_config()
-        max_source_position = hf_config.audio_config.max_source_positions
-        output_lengths = (max_source_position - 2) // 2 + 1
-        return output_lengths
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"audio": None, "image": None, "video": None}
 
-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "audio": self.get_max_audio_tokens(),
-            "image": self.get_max_image_tokens(),
-            "video": self.get_max_video_tokens(seq_len, mm_counts),
-        }
-
 
 class Qwen2_5OmniThinkerDummyInputsBuilder(
         BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
@@ -210,7 +193,6 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return (audio_token * num_audios + image_token * num_images +
                 video_token * num_videos)
 
-    # TODO: @abstractmethod after transition
     def get_dummy_mm_data(
         self,
         seq_len: int,
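For context, `limit_mm_per_prompt` is the engine argument the updated examples now set explicitly: it caps how many items of each modality a single prompt may carry. Below is a minimal sketch of the same pattern in a standalone script; the model name and the surrounding engine arguments are illustrative (mirroring the examples above), not part of this diff.

```python
from vllm import LLM

# Illustrative checkpoint only; any vLLM-supported multimodal model works here.
llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    max_model_len=4096,
    max_num_seqs=2,
    # Allow at most one image per prompt, matching the examples above.
    limit_mm_per_prompt={"image": 1},
)
```

Prompts that attach more items than the configured limit should be rejected during multimodal input processing rather than failing later inside the model.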