From dfa262ad3208313740925be15d90b9208d07d05c Mon Sep 17 00:00:00 2001
From: Amit Garg
Date: Thu, 19 Sep 2024 17:54:02 -0700
Subject: [PATCH] [Bugfix] Fix Phi3.5 mini and MoE LoRA inference (#8571)

Phi3ForCausalLM was registered as an alias of LlamaForCausalLM, whose
packed_modules_mapping assumes separate q_proj/k_proj/v_proj and
gate_proj/up_proj weights. Phi-3 checkpoints ship with qkv_proj and
gate_up_proj already fused, so LoRA adapters targeting those modules
failed to load. Give Phi-3 its own Phi3ForCausalLM subclass with an
identity packed-modules mapping, and add the expert (w1/w2/w3) and
router gate weights to PhiMoE's LoRA-supported modules.

Signed-off-by: Alvant
---
 vllm/model_executor/models/__init__.py |  2 +-
 vllm/model_executor/models/phi3.py     | 17 +++++++++++++++++
 vllm/model_executor/models/phimoe.py   |  4 ++++
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 vllm/model_executor/models/phi3.py

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 591007e787f47..7427060922281 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -50,7 +50,7 @@
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
     "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
-    "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py
new file mode 100644
index 0000000000000..02b2ff01c3832
--- /dev/null
+++ b/vllm/model_executor/models/phi3.py
@@ -0,0 +1,17 @@
+# coding=utf-8
+# Adapted from llama.py
+"""Inference-only Phi-3 model code, inheriting from llama.py."""
+
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "qkv_proj",
+        ],
+        "gate_up_proj": [
+            "gate_up_proj",
+        ],
+    }
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 5036f55803c20..a3555a294bb66 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -491,6 +491,10 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA):
         "o_proj",
         "embed_tokens",
         "lm_head",
+        "w1",
+        "w2",
+        "w3",
+        "gate",
     ]
     embedding_modules = {
         "embed_tokens": "input_embeddings",
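
A minimal sketch of how the fix can be exercised (not part of the patch):
LLM, SamplingParams, and LoRARequest are vLLM's public API, while the
model ID, adapter name, and adapter path below are placeholders.

    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest

    # With this patch, "Phi3ForCausalLM" resolves to phi3.Phi3ForCausalLM,
    # whose packed_modules_mapping matches the checkpoint's fused
    # qkv_proj/gate_up_proj, so LoRA weights load without unpacking errors.
    llm = LLM(model="microsoft/Phi-3.5-mini-instruct", enable_lora=True)

    outputs = llm.generate(
        ["What is the capital of France?"],
        SamplingParams(temperature=0.0, max_tokens=32),
        # Adapter name and path are placeholders for a real Phi-3 LoRA adapter.
        lora_request=LoRARequest("phi3-lora", 1, "/path/to/phi3-lora"),
    )
    print(outputs[0].outputs[0].text)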