From 86e01f950d53aa3f207d36bd56ff50e751eec0d1 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Fri, 11 Oct 2024 22:37:09 +0800
Subject: [PATCH 1/4] Modify MiniCPM lm_head

---
 vllm/model_executor/models/minicpm.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 6bba1594c270..41c2877194bb 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -474,17 +474,18 @@ def __init__(
         unpadded_vocab_size = config.vocab_size
         if lora_config:
             unpadded_vocab_size += lora_config.lora_extra_vocab_size
-        if not self.config.tie_word_embeddings:
-            self.lm_head = ParallelLMHead(
-                unpadded_vocab_size,
-                config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size,
-                quant_config=quant_config,
-            )
+        self.lm_head = ParallelLMHead(
+            unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
         self.scale_width = self.config.hidden_size / self.config.dim_model_base

         self.logits_processor = LogitsProcessor(unpadded_vocab_size,
@@ -517,11 +518,7 @@ def compute_logits(
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
         hidden_states = hidden_states / self.scale_width
-        if self.config.tie_word_embeddings:
-            lm_head = self.model.embed_tokens
-        else:
-            lm_head = self.lm_head
-        logits = self.logits_processor(lm_head, hidden_states,
+        logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
         return logits

From 4ccacc30bd80b494454064c72d171ff5768485d6 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Sat, 12 Oct 2024 13:19:11 +0800
Subject: [PATCH 2/4] Fix MiniCPM3 LoRA mapping bug

---
 vllm/model_executor/models/minicpm3.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index c37bc5ad7c38..f81c3a650baa 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -216,6 +216,28 @@ def _init_layers(


 class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
+    packed_modules_mapping = {
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "kv_a_proj_with_mqa",
+        "q_a_proj",
+        "q_b_proj",
+        "kv_b_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+
+    # `embedding_modules`` and `embedding_padding_modules`
+    # are inherited from MiniCPMForCausalLM

     def _init_model(self):
         self.model = MiniCPM3Model(config=self.config,

From 03ff616b09212fab2333132578caa8cdd2876f7e Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Sat, 12 Oct 2024 13:23:07 +0800
Subject: [PATCH 3/4] Fix typo

---
 vllm/model_executor/models/minicpm3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index f81c3a650baa..3b5fd95328d7 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -236,7 +236,7 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
         "lm_head",
     ]

-    # `embedding_modules`` and `embedding_padding_modules`
+    # `embedding_modules` and `embedding_padding_modules`
     # are inherited from MiniCPMForCausalLM

     def _init_model(self):
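Note on PATCH 1: the constructor now always builds `self.lm_head` and, when
`config.tie_word_embeddings` is set, ties it to `self.model.embed_tokens`
through `tie_weights`, so `compute_logits` can always pass `self.lm_head`
instead of branching on the config. A minimal sketch of the weight-tying
idea in plain PyTorch (`TiedLMHead` is a hypothetical stand-in, not vLLM's
`ParallelLMHead`):

    import torch
    import torch.nn as nn

    class TiedLMHead(nn.Module):
        """LM head whose projection matrix can be shared with an embedding."""

        def __init__(self, vocab_size: int, hidden_size: int):
            super().__init__()
            self.weight = nn.Parameter(torch.empty(vocab_size, hidden_size))

        def tie_weights(self, embed_tokens: nn.Embedding) -> "TiedLMHead":
            # Reuse the embedding's parameter object so both modules always
            # see the same weights.
            self.weight = embed_tokens.weight
            return self

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            # Project hidden states back onto the vocabulary.
            return hidden_states @ self.weight.t()

    embed = nn.Embedding(100, 16)
    head = TiedLMHead(100, 16).tie_weights(embed)
    assert head.weight is embed.weight  # a single shared parameter

Note on PATCH 2: `packed_modules_mapping` tells the LoRA loader that the
checkpoint weights `gate_proj` and `up_proj` fold into the single fused
`gate_up_proj` module. MiniCPM3 overrides the mapping inherited from
MiniCPMForCausalLM because its MLA attention uses separate low-rank
projections (`q_a_proj`, `q_b_proj`, `kv_a_proj_with_mqa`, `kv_b_proj`)
rather than a fused `qkv_proj`, so the inherited packing does not match its
attention modules.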
From 09ed30695a4f57931f512d15b24536c36a07fe12 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Sat, 12 Oct 2024 23:12:14 +0800
Subject: [PATCH 4/4] Fix ChatGLM LoRA bug

---
 vllm/lora/models.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 0dc54516f867..aaadca9a4d16 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -337,7 +337,11 @@ def __init__(
         self.packed_modules_mapping = copy.deepcopy(
             self.model.packed_modules_mapping)
         # Used to indicate whether the model is a multimodal model
-        self.supports_mm: bool = supports_multimodal(self.model)
+        self.supports_mm: bool = (
+            supports_multimodal(self.model)
+            # In case the model only supports LoRA for
+            # text modules (e.g. ChatGLM)
+            and hasattr(self.model, "get_mm_mapping"))
         self.packed_modules: Dict[str, List[str]] = {}
         self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
         # Dict instead of a Set for compatibility with LRUCache.
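Note on PATCH 4: `supports_multimodal()` is true for ChatGLM-style models
even though their LoRA support only covers the text modules, so the LoRA
manager now additionally requires a `get_mm_mapping` method before treating
the model as multimodal. A self-contained sketch of the guard's effect;
`DummyVLM`, `DummyChatGLM`, and the simplified `supports_multimodal` below
are illustrative stand-ins for vLLM's real interfaces:

    def supports_multimodal(model) -> bool:
        # Simplified placeholder for vLLM's interface check.
        return getattr(model, "is_multimodal", False)

    class DummyVLM:
        is_multimodal = True

        def get_mm_mapping(self):
            # Maps multimodal components to module prefixes.
            return {"language_model": "model"}

    class DummyChatGLM:
        # Advertises multimodal support, but exposes no module mapping
        # because LoRA only applies to its text modules.
        is_multimodal = True

    for model in (DummyVLM(), DummyChatGLM()):
        supports_mm = supports_multimodal(model) and hasattr(
            model, "get_mm_mapping")
        print(type(model).__name__, "->", supports_mm)

    # Output:
    # DummyVLM -> True
    # DummyChatGLM -> False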