@@ -41,6 +41,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs,
     Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs,
@@ -66,7 +67,8 @@
 from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, WeightsMapper,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
@@ -726,14 +728,30 @@ def _process_video_input( |
     dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder,
 )
 class Qwen2_5OmniThinkerForConditionalGeneration(
-        nn.Module, SupportsMultiModal, SupportsPP,
+        nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
         Qwen2_5OmniConditionalGenerationMixin):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
             "thinker.model.": "language_model.model.",
             "thinker.": "",
         })
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "attn.qkv": [
+            "attn.q",
+            "attn.k",
+            "attn.v",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
@@ -956,3 +974,12 @@ def load_weights(self, weights: Iterable[tuple[str, |
                                              mapper=self.hf_to_vllm_mapper)
 
         return loaded_weights
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="merger.",
+            tower_model=["visual.", "audio_tower."])
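
Usage note: with SupportsLoRA declared on the thinker, an adapter can be passed at request time through vLLM's standard LoRA path. Below is a minimal offline-inference sketch, not part of this diff; the adapter path, adapter name, and rank are hypothetical placeholders, the Qwen/Qwen2.5-Omni-7B checkpoint is used only as an example target, and the prompt is kept text-only for brevity.

# Minimal sketch of exercising the new LoRA support (adapter path is a placeholder).
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",
    enable_lora=True,   # turns on the LoRA machinery the model now supports
    max_lora_rank=16,   # assumed rank; must cover the adapter being loaded
)

outputs = llm.generate(
    "Describe the Qwen2.5-Omni thinker module.",
    SamplingParams(max_tokens=64),
    # name, integer id, and local path of the adapter (path is hypothetical)
    lora_request=LoRARequest("omni-lora", 1, "/path/to/qwen2_5_omni_lora"),
)
print(outputs[0].outputs[0].text)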