@@ -104,15 +104,30 @@ def forward(self, x) -> torch.Tensor:
         return self.fc2(self.act(self.fc1(x)))
 
 
+# copied from vllm
 class VisionRotaryEmbedding(nn.Module):
     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         self.inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (
+                self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) / self.dim)
+            )
+            seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
 
     def forward(self, seqlen: int) -> torch.Tensor:
-        self.seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        self.freqs = torch.outer(self.seq, self.inv_freq)
-        return self.freqs
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
 
 
 class VisionFlashAttention(nn.Module):
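
The new VisionRotaryEmbedding keeps the outer-product frequency table in _freqs_cached and rebuilds it only when a request exceeds the cached length, over-allocating by 2x so the cache rarely regrows. A simplified, standalone sketch of that caching behaviour (plain torch only; the class name below is mine, not the repo's, and device handling is omitted):

import torch

class CachedRotary:
    # Minimal stand-in for the cached rotary table above.
    def __init__(self, dim: int, theta: float = 10000.0):
        self.inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self._seq_len_cached = 0
        self._freqs_cached = None

    def __call__(self, seqlen: int) -> torch.Tensor:
        if seqlen > self._seq_len_cached:
            # Grow to 2x the requested length so slightly larger requests stay cache hits.
            self._seq_len_cached = seqlen * 2
            seq = torch.arange(self._seq_len_cached, dtype=self.inv_freq.dtype)
            self._freqs_cached = torch.outer(seq, self.inv_freq)
        return self._freqs_cached[:seqlen]

rope = CachedRotary(dim=16)
a = rope(1024)    # builds a 2048-row table once
b = rope(1500)    # served from the cache, no recompute
assert b.shape == (1500, 8) and torch.equal(a, b[:1024])
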
@@ -130,17 +145,19 @@ def apply_rotary_pos_emb_vision(self, t: torch.Tensor, freqs: torch.Tensor) -> t
         return output
 
     def forward(
-        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int = 0,
+        rotary_pos_emb: torch.Tensor = None,
     ) -> torch.Tensor:
         seq_length = hidden_states.shape[0]
         q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
-        q = self.apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb.cuda())
-        k = self.apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb.cuda())
+        q = self.apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb)
+        k = self.apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb)
         q = q.squeeze(0)
         k = k.squeeze(0)
 
-        cu_seqlens = cu_seqlens.to(q.device, torch.int32, non_blocking=True)
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
         attn_output = g_cache_manager.alloc_tensor(q.shape, q.dtype, device=q.device)
 
         flash_attention_fwd(q, k, v, attn_output, cu_seqlens, max_seqlen)
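
The attention block now receives max_seqlen as an argument instead of re-uploading cu_seqlens and recomputing the max inside every layer (the per-layer .item() forced a device sync per block); rotary_pos_emb likewise arrives already on the GPU. For reference, a small sketch of the packed varlen layout those two values describe (sizes are illustrative, plain torch):

import torch

# Three images packed along a single token dimension -- no batch axis.
num_heads, head_dim = 16, 80
seq_lens = [64, 144, 100]
q = torch.randn(sum(seq_lens), num_heads, head_dim)   # (total_tokens, heads, head_dim)

cu_seqlens = torch.tensor([0, 64, 208, 308], dtype=torch.int32)  # prefix sums: image i owns q[cu[i]:cu[i+1]]
max_seqlen = max(seq_lens)                                       # 144: longest single image

assert cu_seqlens[-1].item() == q.shape[0]
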
@@ -159,9 +176,9 @@ def __init__(self, embed_dim, mlp_ratio, num_heads, hidden_act) -> None:
         self.attn = VisionFlashAttention(embed_dim, num_heads=num_heads)
         self.mlp = VisionMlp(dim=embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=hidden_act)
 
-    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
+    def forward(self, hidden_states, cu_seqlens, max_seqlen, rotary_pos_emb) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
-            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
+            self.norm1(hidden_states), cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, rotary_pos_emb=rotary_pos_emb
         )
         hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
         return hidden_states
@@ -271,9 +288,8 @@ def rot_pos_emb(self, grid_thw):
             pos_shape = (h // s, s, w // s, s)
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
-            hpos_ids, wpos_ids = hpos_ids.reshape(pos_shape), wpos_ids.reshape(pos_shape)
-            hpos_ids, wpos_ids = hpos_ids.permute(0, 2, 1, 3), wpos_ids.permute(0, 2, 1, 3)
-            hpos_ids, wpos_ids = hpos_ids.flatten(), wpos_ids.flatten()
+            hpos_ids = hpos_ids.reshape(pos_shape).permute(0, 2, 1, 3).flatten()
+            wpos_ids = wpos_ids.reshape(pos_shape).permute(0, 2, 1, 3).flatten()
 
             pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1))
         pos_ids = torch.cat(pos_ids, dim=0)
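
The rewrite above only chains reshape/permute/flatten per tensor; the ordering is unchanged: (hpos_ids, wpos_ids) still enumerate patch coordinates grouped by s x s window (s being the spatial merge size) rather than row-major. A tiny illustration, assuming h = w = 4 and s = 2:

import torch

h, w, s = 4, 4, 2
pos_shape = (h // s, s, w // s, s)
hpos = torch.arange(h).unsqueeze(1).expand(-1, w).reshape(pos_shape).permute(0, 2, 1, 3).flatten()
wpos = torch.arange(w).unsqueeze(0).expand(h, -1).reshape(pos_shape).permute(0, 2, 1, 3).flatten()

print(list(zip(hpos.tolist(), wpos.tolist()))[:8])
# [(0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (0, 3), (1, 2), (1, 3)] -> the first two 2x2 windows
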
@@ -284,14 +300,18 @@ def rot_pos_emb(self, grid_thw):
 
     def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
         hidden_states = self.patch_embed(hidden_states)
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw).to("cuda", non_blocking=True)
         cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
             dim=0, dtype=torch.int32
         )
         cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
 
+        cu_seqlens = cu_seqlens.to("cuda", non_blocking=True)
         for blk in self.blocks:
-            hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
+            hidden_states = blk(
+                hidden_states, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, rotary_pos_emb=rotary_pos_emb
+            )
         return self.merger(hidden_states)
 
     def load_image(self, img: List[ImageItem]):
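
With this hunk, forward() derives max_seqlen from the CPU copy of cu_seqlens before anything is transferred, then uploads rotary_pos_emb and cu_seqlens to the GPU once for all blocks. The same computation with two made-up images (grid_thw rows are [t, h, w] in patches):

import torch
import torch.nn.functional as F

grid_thw = torch.tensor([[1, 16, 24], [1, 8, 12]])   # hypothetical 16x24 and 8x12 patch grids
cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
    dim=0, dtype=torch.int32
)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()

print(cu_seqlens.tolist())   # [0, 384, 480]
print(max_seqlen)            # 384, computed on CPU before cu_seqlens moves to the GPU
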