6 changes: 5 additions & 1 deletion vllm/lora/models.py
@@ -337,7 +337,11 @@ def __init__(
         self.packed_modules_mapping = copy.deepcopy(
             self.model.packed_modules_mapping)
         # Used to indicate whether the model is a multimodal model
-        self.supports_mm: bool = supports_multimodal(self.model)
+        self.supports_mm: bool = (
+            supports_multimodal(self.model)
+            # In case the model only supports LoRA for
+            # text modules (e.g. ChatGLM)
+            and hasattr(self.model, "get_mm_mapping"))
         self.packed_modules: Dict[str, List[str]] = {}
         self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
         # Dict instead of a Set for compatibility with LRUCache.
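
For context, a minimal sketch of what the new guard means in practice, using toy classes rather than vLLM's real model interfaces: a model counts as multimodal for LoRA purposes only if it also exposes get_mm_mapping, so a multimodal model that supports LoRA only for its text modules (e.g. ChatGLM) takes the text-only path.

# Minimal sketch of the new guard; toy classes stand in for real vLLM models.

class TextOnlyLoRAMultimodalModel:
    """Multimodal model that only supports LoRA on its text modules."""
    supports_multimodal = True          # stand-in for vLLM's interface check


class FullMultimodalLoRAModel:
    """Multimodal model whose submodules are all LoRA-capable."""
    supports_multimodal = True

    def get_mm_mapping(self):
        # Maps multimodal submodule kinds to module prefixes (toy values).
        return {"connector": "resampler", "tower": "vpm"}


def lora_manager_supports_mm(model) -> bool:
    # Mirrors the change above: multimodal support alone is no longer enough;
    # the model must also describe its multimodal module mapping.
    return getattr(model, "supports_multimodal", False) and hasattr(
        model, "get_mm_mapping")


assert lora_manager_supports_mm(FullMultimodalLoRAModel())
assert not lora_manager_supports_mm(TextOnlyLoRAMultimodalModel())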
29 changes: 13 additions & 16 deletions vllm/model_executor/models/minicpm.py
@@ -474,17 +474,18 @@ def __init__(
         unpadded_vocab_size = config.vocab_size
         if lora_config:
             unpadded_vocab_size += lora_config.lora_extra_vocab_size
-        if not self.config.tie_word_embeddings:
-            self.lm_head = ParallelLMHead(
-                unpadded_vocab_size,
-                config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
-                quant_config=quant_config,
-            )
+        self.lm_head = ParallelLMHead(
+            unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
 
         self.logits_processor = LogitsProcessor(unpadded_vocab_size,
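
For context on the padding_size argument kept above, a small sketch of the rounding it controls. The values 64 and 256 are assumed here as the usual vLLM defaults for DEFAULT_VOCAB_PADDING_SIZE and lora_vocab_padding_size, and the vocabulary sizes are illustrative numbers, not MiniCPM's actual configuration.

def pad_vocab_size(vocab_size: int, pad_to: int) -> int:
    # Round the vocabulary size up to a multiple of `pad_to`.
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to


base_vocab = 122_753          # illustrative unpadded vocab size
lora_extra = 256              # extra rows reserved for LoRA-added tokens

print(pad_vocab_size(base_vocab, 64))                # padding without LoRA
print(pad_vocab_size(base_vocab + lora_extra, 256))  # bigger padding with LoRA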
@@ -517,11 +518,7 @@ def compute_logits(
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         hidden_states = hidden_states / self.scale_width
-        if self.config.tie_word_embeddings:
-            lm_head = self.model.embed_tokens
-        else:
-            lm_head = self.lm_head
-        logits = self.logits_processor(lm_head, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits
 
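The tie_weights call added in __init__ is what lets compute_logits above drop its branch on tie_word_embeddings: the head is tied once at construction time, so callers always go through self.lm_head. A plain-PyTorch sketch of the same idea (not vLLM's ParallelLMHead, just an illustration of sharing the embedding matrix with the LM head):

import torch
import torch.nn as nn


class TinyTiedLM(nn.Module):
    # Toy model: the LM head reuses the embedding matrix when tying is on.

    def __init__(self, vocab_size: int, hidden_size: int, tie: bool):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        if tie:
            # After tying, callers can always go through self.lm_head,
            # mirroring the simplified compute_logits above.
            self.lm_head.weight = self.embed_tokens.weight

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.lm_head(hidden_states)


model = TinyTiedLM(vocab_size=128, hidden_size=16, tie=True)
assert model.lm_head.weight.data_ptr() == model.embed_tokens.weight.data_ptr()
logits = model.compute_logits(torch.zeros(1, 16))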
22 changes: 22 additions & 0 deletions vllm/model_executor/models/minicpm3.py
@@ -216,6 +216,28 @@ def _init_layers(
 
 
 class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
+    packed_modules_mapping = {
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "kv_a_proj_with_mqa",
+        "q_a_proj",
+        "q_b_proj",
+        "kv_b_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+
+    # `embedding_modules` and `embedding_padding_modules`
+    # are inherited from MiniCPMForCausalLM
 
     def _init_model(self):
         self.model = MiniCPM3Model(config=self.config,
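
A rough sketch of how a LoRA loader typically consumes these class attributes; this is simplified toy code under that assumption, not vLLM's actual loading logic. A packed target such as gate_up_proj expands into the per-projection names a LoRA checkpoint actually contains, and anything outside supported_lora_modules is rejected.

from typing import Dict, List

# Copied from the class attributes above.
packed_modules_mapping: Dict[str, List[str]] = {
    "gate_up_proj": ["gate_proj", "up_proj"],
}
supported_lora_modules = [
    "kv_a_proj_with_mqa", "q_a_proj", "q_b_proj", "kv_b_proj", "o_proj",
    "gate_up_proj", "down_proj", "embed_tokens", "lm_head",
]


def expand_lora_targets(modules: List[str]) -> List[str]:
    # Toy helper: reject unsupported modules and unpack fused projections
    # into the names a LoRA checkpoint would actually contain.
    targets: List[str] = []
    for name in modules:
        if name not in supported_lora_modules:
            raise ValueError(f"LoRA is not supported for module {name!r}")
        targets.extend(packed_modules_mapping.get(name, [name]))
    return targets


print(expand_lora_targets(["q_a_proj", "gate_up_proj", "lm_head"]))
# ['q_a_proj', 'gate_proj', 'up_proj', 'lm_head']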