import collections
import collections.abc
from collections.abc import Callable, Iterable, Mapping, Sequence
-from typing import Any, TypeAlias, TypedDict, cast
+from typing import Annotated, Any, TypeAlias, cast

import numpy as np
import torch
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.midashenglm import DashengConfig
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
@@ -508,11 +509,16 @@ def forward(self, x, mask=None):


# === Audio Inputs === #
-class MiDashengLMAudioInputs(TypedDict):
-    input_values: torch.Tensor
-    """Shape: `(num_audios, num_sampling_points)`"""
-    audio_length: torch.Tensor
-    """Shape: `(num_audios, 1)`"""
+class MiDashengLMAudioInputs(TensorSchema):
+    """
+
+    Dimensions:
+        - bn: Batch size * number of audios
+        - p: Number of sampling points
+    """
+
+    input_values: Annotated[torch.Tensor, TensorShape("bn", "p")]
+    audio_length: Annotated[torch.Tensor, TensorShape("bn")]


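TensorSchema (imported above from vllm.utils.tensor_schema) replaces the plain TypedDict and attaches a symbolic TensorShape to each field, so the named dimensions can be checked against the actual tensors. A minimal sketch with made-up sizes, relying only on the keyword construction and dict-style access that the model code below also uses:

    # Illustrative only; the sizes are placeholders, not values from this commit.
    import torch

    example = MiDashengLMAudioInputs(
        input_values=torch.zeros(2, 16000),        # (bn, p): two mono waveforms
        audio_length=torch.tensor([16000, 8000]),  # (bn,): valid samples per item
    )
    assert example["input_values"].shape == (2, 16000)
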
class MiDashengLMProcessingInfo(BaseProcessingInfo):
@@ -676,6 +682,8 @@ def get_replacement_midashenglm(item_idx: int):
    dummy_inputs=MiDashengLMDummyInputsBuilder,
)
class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
@@ -728,26 +736,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            self.decoder.make_empty_intermediate_tensors
        )

-    def _validate_and_reshape_mm_tensor(
-        self, mm_input: object, name: str
-    ) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            return mm_input.reshape(-1, *mm_input.shape[2:])
-
-        if name == "input_values":
-            max_length = max(tensor.shape[1] for tensor in mm_input)
-            padded_mm_input = [
-                torch.nn.functional.pad(tensor, (0, max_length - tensor.shape[1]))
-                if tensor.shape[1] < max_length
-                else tensor
-                for tensor in mm_input
-            ]
-            return torch.concat(padded_mm_input)
-
-        return torch.concat(mm_input)
-
    def _parse_and_validate_audio_input(
        self, **kwargs: object
    ) -> MiDashengLMAudioInputs | None:
@@ -756,24 +744,22 @@ def _parse_and_validate_audio_input(

        if input_values is None:
            return None
-        input_values = self._validate_and_reshape_mm_tensor(
-            input_values, "input_values"
-        )
-        audio_length = self._validate_and_reshape_mm_tensor(
-            audio_length, "audio_length"
-        )
-        if not isinstance(input_values, (torch.Tensor, list)):
-            raise ValueError(
-                "Incorrect type of audio input features. "
-                f"Got type: {type(input_values)}"
+
+        if isinstance(input_values, list):
+            input_values = torch.nn.utils.rnn.pad_sequence(
+                input_values,
+                batch_first=True,
            )

        return MiDashengLMAudioInputs(
            input_values=input_values,
            audio_length=audio_length,
        )

-    def _process_audio_input(self, audio_input: MiDashengLMAudioInputs) -> torch.Tensor:
+    def _process_audio_input(
+        self,
+        audio_input: MiDashengLMAudioInputs,
+    ) -> tuple[torch.Tensor, ...]:
        # Process audio through encoder and projector
        input_values = audio_input["input_values"]
        audio_length = audio_input["audio_length"]
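With merge_by_field_config enabled, the multimodal kwargs may arrive either as one pre-batched tensor or as a list of per-item 1-D waveform tensors; the pad_sequence call above handles the list case by right-padding every waveform to the longest one, which is also why the manual _validate_and_reshape_mm_tensor helper could be dropped. A standalone sketch of that padding behavior, with made-up lengths:

    import torch

    # Three hypothetical waveforms of different lengths (placeholder data).
    waves = [torch.randn(16000), torch.randn(12000), torch.randn(8000)]

    # Right-pad with zeros to the longest waveform; result shape (3, 16000).
    batched = torch.nn.utils.rnn.pad_sequence(waves, batch_first=True)
    assert batched.shape == (3, 16000)
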
@@ -783,17 +769,13 @@ def _process_audio_input(self, audio_input: MiDashengLMAudioInputs) -> torch.Ten
        audio_embeddings = audio_embeddings.to(audio_input["input_values"].dtype)
        batch_size, max_audio_tokens, embed_dim = audio_embeddings.shape

-        audio_length_np = (
-            audio_length.cpu().numpy()
-            if isinstance(audio_length, torch.Tensor)
-            else audio_length
-        )
        audio_output_lengths = [
            max(1, calculate_mel_frames_dasheng(int(length)))  # at least one frame
-            for length in audio_length_np
+            for length in audio_length.tolist()
        ]
-        audio_output_lengths = torch.tensor(audio_output_lengths).to(
-            audio_embeddings.device
+        audio_output_lengths = torch.tensor(
+            audio_output_lengths,
+            device=audio_embeddings.device,
        )

        audio_feature_mask = torch.arange(
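The audio_feature_mask being built above follows the standard arange-based length-mask pattern: broadcast a [0, max_len) range against the per-item lengths so that padded positions come out False. A generic sketch of the pattern (not the exact code from this file):

    import torch

    # Hypothetical per-audio token counts with a padded length of 6.
    lengths = torch.tensor([3, 6, 2])
    max_len = 6

    # mask[i, j] is True only where position j is a real token of item i; shape (3, 6).
    mask = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)
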
@@ -826,14 +808,6 @@ def forward(
    ) -> torch.Tensor | IntermediateTensors:
        if intermediate_tensors is not None:
            inputs_embeds = None
-        elif inputs_embeds is None:
-            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                multimodal_embeddings,
-                is_multimodal=input_ids == self.config.audio_token_id,
-            )
-            input_ids = None

        return self.decoder.model(
            input_ids,