7171from vllm .multimodal .profiling import BaseDummyInputsBuilder
7272from vllm .platforms import _Backend
7373from vllm .sequence import IntermediateTensors
74- from vllm .transformers_utils .config import uses_mrope
7574from vllm .utils .tensor_schema import TensorSchema , TensorShape
7675
7776from ..layers .activation import SiluAndMul
8079from .qwen2_vl import (_create_qwen2vl_field_factory ,
8180 apply_rotary_pos_emb_vision )
8281from .utils import (AutoWeightsLoader , WeightsMapper ,
83- init_vllm_registered_model , maybe_prefix ,
84- merge_multimodal_embeddings )
82+ init_vllm_registered_model , maybe_prefix )
8583from .vision import get_vit_attn_backend , run_dp_sharded_mrope_vision_model
8684
8785logger = init_logger (__name__ )
@@ -1552,32 +1550,6 @@ def get_multimodal_embeddings(
15521550 multimodal_embeddings += video_embeddings
15531551 return multimodal_embeddings
15541552
1555- def get_input_embeddings_v0 (
1556- self ,
1557- input_ids : torch .Tensor ,
1558- image_input : Optional [Glm4vImageInputs ] = None ,
1559- video_input : Optional [Glm4vVideoInputs ] = None ,
1560- ) -> torch .Tensor :
1561- inputs_embeds = self .get_input_embeddings (input_ids )
1562- if image_input is not None :
1563- image_embeds = self ._process_image_input (image_input )
1564- inputs_embeds = merge_multimodal_embeddings (
1565- input_ids ,
1566- inputs_embeds ,
1567- image_embeds ,
1568- placeholder_token_id = self .config .image_token_id ,
1569- )
1570-
1571- if video_input is not None :
1572- video_embeds = self ._process_video_input (video_input )
1573- inputs_embeds = merge_multimodal_embeddings (
1574- input_ids ,
1575- inputs_embeds ,
1576- video_embeds ,
1577- placeholder_token_id = self .config .video_token_id ,
1578- )
1579- return inputs_embeds
1580-
15811553 def forward (
15821554 self ,
15831555 input_ids : torch .Tensor ,
@@ -1604,26 +1576,6 @@ def forward(
16041576 if intermediate_tensors is not None :
16051577 inputs_embeds = None
16061578
1607- # NOTE: In v1, inputs_embeds is always generated at model runner from
1608- # `get_multimodal_embeddings` and `get_input_embeddings`, this
1609- # condition is only for v0 compatibility.
1610- elif inputs_embeds is None :
1611- image_input = self ._parse_and_validate_image_input (** kwargs )
1612- video_input = self ._parse_and_validate_video_input (** kwargs )
1613-
1614- if image_input is None and video_input is None :
1615- inputs_embeds = None
1616- else :
1617- if uses_mrope (self .config ):
1618- assert positions .ndim == 2 and positions .size (0 ) == 3 , (
1619- "multimodal section rotary embedding requires "
1620- f"(3, seq_len) positions, but got { positions .size ()} " )
1621- inputs_embeds = self .get_input_embeddings_v0 (
1622- input_ids ,
1623- image_input = image_input ,
1624- video_input = video_input )
1625- input_ids = None
1626-
16271579 hidden_states = self .language_model .model (
16281580 input_ids = input_ids ,
16291581 positions = positions ,
0 commit comments