
Commit b9528e6

[0.7.3] optimize qwen2_vl and qwen2_5_vl (#702)
### What this PR does / why we need it?
Optimize qwen2_vl and qwen2_5_vl.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tested this PR with a 1080p picture at tp=1, bs=1 on Qwen2-VL and Qwen2.5-VL: every FA op's duration drops from 11 ms to 9 ms, roughly a 22% perf boost.

---------

Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
1 parent 1791113 commit b9528e6

File tree

3 files changed: +187 -35 lines changed


vllm_ascend/models/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@
 def register_model():
     ModelRegistry.register_model(
         "Qwen2VLForConditionalGeneration",
-        "vllm_ascend.models.qwen2_vl:CustomQwen2VLForConditionalGeneration")
+        "vllm_ascend.models.qwen2_vl:AscendQwen2VLForConditionalGeneration")
 
     ModelRegistry.register_model(
         "Qwen2_5_VLForConditionalGeneration",
```

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 14 additions & 10 deletions
```diff
@@ -42,8 +42,8 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
-MIN_PAD_SIZE = 64
-MAX_PAD_SIZE = 128
+MIN_PAD_SIZE = 64  # min_size to pad weight
+MAX_PAD_SIZE = 128  # max_size to pad weight
 
 
 class AscendQwen2_5_VisionAttention(Qwen2_5_VisionAttention):
```
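These constants encode the padding rule used throughout the file: a per-head hidden size strictly between 64 and 128 is padded up to 128 so the NPU fused-attention op works on an aligned head dimension (Qwen2.5-VL's vision tower, 1280 hidden with 16 heads, gives head size 80). A minimal sketch of that rule; the half-pad formula is my reconstruction, since its expression is truncated in a later hunk:

```python
# Sketch of the head-size padding rule, assuming the constants mean the same
# as in the diff above; the half-pad formula is an assumption (its expression
# is truncated in the later hunk).
MIN_PAD_SIZE = 64   # only pad when the head dim is strictly above this
MAX_PAD_SIZE = 128  # pad the head dim up to this value

def padded_head_size(head_size: int) -> tuple[int, int]:
    """Return (possibly padded head size, half-pad amount for cos/sin tables)."""
    if MIN_PAD_SIZE < head_size < MAX_PAD_SIZE:
        return MAX_PAD_SIZE, (MAX_PAD_SIZE - head_size) // 2
    return head_size, 0

# Qwen2.5-VL vision tower: 1280 hidden / 16 heads = 80 -> padded to 128.
assert padded_head_size(80) == (128, 24)
assert padded_head_size(64) == (64, 0)   # already aligned, no padding
```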
```diff
@@ -66,6 +66,7 @@ def __init__(
         self.embed_dim = embed_dim
         self.hidden_size_per_attention_head = dist_utils.divide(
             projection_size, num_heads)
+        self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head
         if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE:
             self.hidden_size_per_attention_head = MAX_PAD_SIZE
```

```diff
@@ -101,7 +102,7 @@ def forward(
             key=k,
             value=v,
             seq_len=cu_seqlens,
-            scale_value=self.hidden_size_per_attention_head**-0.5,
+            scale_value=self.origin_hidden_size_per_attention_head**-0.5,
             num_heads=self.num_attention_heads_per_partition,
             num_kv_heads=self.num_attention_heads_per_partition,
             out=context_layer)
```
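The scale change matters because q, k and v are zero-padded along the head dimension: padding does not change any q·k dot product, so the softmax scale must still come from the original head size rather than the padded one, which is exactly what this hunk restores. A small sketch of that invariant (80 and 128 are the assumed original/padded sizes):

```python
# Sketch: zero-padding the head dim leaves q @ k^T unchanged, so the attention
# scale must use the ORIGINAL head dim, not the padded one.
import torch
import torch.nn.functional as F

orig, padded = 80, 128                     # assumed sizes (see MIN/MAX_PAD_SIZE)
q = torch.randn(2, 4, orig)                # [heads, seq, head_dim]
k = torch.randn(2, 4, orig)

q_pad = F.pad(q, (0, padded - orig))       # zeros appended on the last dim
k_pad = F.pad(k, (0, padded - orig))

scores = q @ k.transpose(-1, -2)
scores_pad = q_pad @ k_pad.transpose(-1, -2)
assert torch.allclose(scores, scores_pad)  # identical logits before scaling

right = scores_pad * orig ** -0.5          # scale by the true head dim (this PR)
wrong = scores_pad * padded ** -0.5        # otherwise the softmax is over-smoothed
```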
```diff
@@ -164,6 +165,7 @@ def __init__(
         super().__init__(vision_config, norm_eps, quant_config, prefix)
         norm_layer = partial(RMSNorm, eps=norm_eps)
         self.interleaved = interleaved
+        self.enable_pad = False
         self.patch_embed = AscendQwen2_5_VisionPatchEmbed(
             patch_size=vision_config.patch_size,
             temporal_patch_size=vision_config.temporal_patch_size,
```
```diff
@@ -187,6 +189,7 @@ def __init__(
             self.hidden_size, self.num_heads)
 
         if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE:
+            self.enable_pad = True
             self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head
             self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2
             self.half_pad_hidden_size_per_attention_head = (
```
```diff
@@ -196,10 +199,11 @@
     def cal_cos_sin(self, rotary_pos_emb):
         cos = rotary_pos_emb.cos()  # [seqlen, rotary_dim / 2]
         sin = rotary_pos_emb.sin()
-        cos = torch.nn.functional.pad(
-            cos, (0, self.half_pad_hidden_size_per_attention_head))
-        sin = torch.nn.functional.pad(
-            sin, (0, self.half_pad_hidden_size_per_attention_head))
+        if self.enable_pad:
+            cos = torch.nn.functional.pad(
+                cos, (0, self.half_pad_hidden_size_per_attention_head))
+            sin = torch.nn.functional.pad(
+                sin, (0, self.half_pad_hidden_size_per_attention_head))
 
         if not self.interleaved:
             cos_new = torch.cat((cos, cos), dim=-1)
```
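Making the cos/sin padding conditional mirrors the head-dim padding: the rotary tables have shape [seqlen, rotary_dim / 2], and when the head dim is padded the half-table is zero-padded by the half-pad amount so that cat((cos, cos)) lines up with the padded head. With the assumed 80 → 128 sizes, 2 * (40 + 24) = 128. A simplified, self-contained sketch of the conditional path:

```python
# Simplified sketch of cal_cos_sin with conditional padding; enable_pad and
# half_pad stand in for the attributes used in the diff above.
import torch
import torch.nn.functional as F

def cal_cos_sin(rotary_pos_emb: torch.Tensor, enable_pad: bool, half_pad: int):
    cos = rotary_pos_emb.cos()   # [seqlen, rotary_dim / 2]
    sin = rotary_pos_emb.sin()
    if enable_pad:               # only pad when the head dim itself was padded
        cos = F.pad(cos, (0, half_pad))
        sin = F.pad(sin, (0, half_pad))
    # non-interleaved layout: duplicate the half-table to cover the full head dim
    return torch.cat((cos, cos), dim=-1), torch.cat((sin, sin), dim=-1)

cos, sin = cal_cos_sin(torch.randn(6, 40), enable_pad=True, half_pad=24)
assert cos.shape == (6, 128)     # matches the padded head dim
```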
```diff
@@ -285,11 +289,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
-                if ("attn.proj.weight" in name):
+                if ("attn.proj.weight" in name) and self.enable_pad:
                     param.data = self.pad_proj_weight(param.data)
-                if ("attn.qkv.weight" in name):
+                if ("attn.qkv.weight" in name) and self.enable_pad:
                     param.data = self.pad_qkv_weight(param.data)
-                if ("attn.qkv.bias" in name):
+                if ("attn.qkv.bias" in name) and self.enable_pad:
                     param.data = self.pad_qkv_bias(param.data)
             loaded_params.add(name)
         return loaded_params
```
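Gating the weight padding on `enable_pad` keeps the one-time, load-time padding of the attention projection and QKV parameters to the case where the head dim was actually enlarged. The `pad_*` helpers themselves are defined elsewhere in `qwen2_5_vl.py` and are not part of this hunk; the sketch below is a hypothetical stand-in for a `pad_qkv_bias`-style helper, assuming a packed `[3, num_heads, head_dim]` bias layout and the 80 → 128 sizes:

```python
# Hypothetical stand-in for a pad_qkv_bias-style helper (the real helper is not
# shown in this hunk); assumes the bias is packed as [3, num_heads, head_dim].
import torch
import torch.nn.functional as F

def pad_qkv_bias_sketch(bias: torch.Tensor,
                        num_heads: int = 16,
                        orig_head: int = 80,
                        padded_head: int = 128) -> torch.Tensor:
    bias = bias.reshape(3, num_heads, orig_head)       # split per head
    bias = F.pad(bias, (0, padded_head - orig_head))   # zero-pad each head slice
    return bias.reshape(-1)                            # flatten back for the kernel

padded = pad_qkv_bias_sketch(torch.randn(3 * 16 * 80))
assert padded.shape == (3 * 16 * 128,)
```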
