From dfa262ad3208313740925be15d90b9208d07d05c Mon Sep 17 00:00:00 2001
From: Amit Garg
Date: Thu, 19 Sep 2024 17:54:02 -0700
Subject: [PATCH] [Bugfix] Fix Phi3.5 mini and MoE LoRA inference (#8571)

Phi3ForCausalLM was registered as an alias of LlamaForCausalLM, whose
packed_modules_mapping assumes separate q_proj/k_proj/v_proj and
gate_proj/up_proj weights. Phi-3 checkpoints ship with qkv_proj and
gate_up_proj already fused, so LoRA adapters targeting those modules
failed to load. Give Phi-3 its own Phi3ForCausalLM subclass with an
identity packed-modules mapping, and add the expert (w1/w2/w3) and
router gate weights to PhiMoE's LoRA-supported modules.

Signed-off-by: Alvant
---
 vllm/model_executor/models/__init__.py |  2 +-
 vllm/model_executor/models/phi3.py     | 17 +++++++++++++++++
 vllm/model_executor/models/phimoe.py   |  4 ++++
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/models/phi3.py

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 591007e787f47..7427060922281 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -50,7 +50,7 @@
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
-    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
new file mode 100644
index 0000000000000..02b2ff01c3832
--- /dev/null
+++ b/vllm/model_executor/models/phi3.py
@@ -0,0 +1,17 @@
+# coding=utf-8
+# Adapted from llama.py
+"""Inference-only Phi-3 model code, inheriting from llama.py."""
+
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "qkv_proj",
+        ],
+        "gate_up_proj": [
+            "gate_up_proj",
+        ],
+    }
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 5036f55803c20..a3555a294bb66 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -491,6 +491,10 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA):
         "o_proj",
         "embed_tokens",
         "lm_head",
+        "w1",
+        "w2",
+        "w3",
+        "gate",
     ]
     embedding_modules = {
         "embed_tokens": "input_embeddings",
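
A minimal sketch of how the fix can be exercised (not part of the patch):
LLM, SamplingParams, and LoRARequest are vLLM's public API, while the
model ID, adapter name, and adapter path below are placeholders.

    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest

    # With this patch, "Phi3ForCausalLM" resolves to phi3.Phi3ForCausalLM,
    # whose packed_modules_mapping matches the checkpoint's fused
    # qkv_proj/gate_up_proj, so LoRA weights load without unpacking errors.
    llm = LLM(model="microsoft/Phi-3.5-mini-instruct", enable_lora=True)

    outputs = llm.generate(
        ["What is the capital of France?"],
        SamplingParams(temperature=0.0, max_tokens=32),
        # Adapter name and path are placeholders for a real Phi-3 LoRA adapter.
        lora_request=LoRARequest("phi3-lora", 1, "/path/to/phi3-lora"),
    )
    print(outputs[0].outputs[0].text)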