
Commit f025df0

[Attn][bugfix] fix long seq precision issue (vllm-project#334)

Fixes vllm-project/vllm-ascend#321. This PR is a temporary workaround for the long-sequence precision issue; it will be reverted once the root cause is fixed.

cc @rjg-lyh @wangxiyuan

Co-authored-by: rjg-lyh <1318825571@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: wangxiyuan <wangxiyuan@huawei.com>
1 parent 6d0f1d8 commit f025df0
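Nothing in the commit states the failure mechanism outright; the TODO in the diff points at a torch-npu synchronization issue, and the change itself simply moves the attention output buffer from a function local to an instance attribute (`self.output`). One plausible reading: with asynchronous kernel dispatch, a local buffer's memory can be returned to the allocator when its last Python reference dies, even while a device kernel is still writing into it, and pinning the buffer on `self` extends its lifetime. A minimal sketch of the pattern (simplified and illustrative, not the actual `vllm_ascend` code):

```python
import torch


class AttentionImpl:
    """Simplified stand-in for the Ascend attention backend (illustrative)."""

    def __init__(self, num_heads: int, head_size: int) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        # TODO: FIXME revert me when torch-npu sync issue is solved
        # Keeping a reference on the instance means the buffer survives
        # past forward(), even after the caller drops its result.
        self.output: torch.Tensor = None

    def forward(self, query: torch.Tensor) -> torch.Tensor:
        num_tokens = query.shape[0]
        # Before this commit the buffer was a local: `output = torch.empty(...)`.
        self.output = torch.empty(num_tokens,
                                  self.num_heads,
                                  self.head_size,
                                  dtype=query.dtype,
                                  device=query.device)
        # ... attention kernels write their results into self.output via out= ...
        return self.output.view(num_tokens, self.num_heads * self.head_size)
```

The trade-off is that the module keeps the most recent output alive between calls and `forward()` is no longer re-entrant; both are acceptable for a stopgap explicitly marked for reversion.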

2 files changed: 15 additions, 13 deletions


docs/source/user_guide/release_notes.md

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@
 - Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve: https://github.com/vllm-project/vllm/pull/13807

 ### Known issues
-- In [some cases](https://github.com/vllm-project/vllm-ascend/issues/321), especially when the input/output is very long, the accuracy of output may be incorrect. You may see many `!` in the output. We are working on it. It'll be fixed in the next release.
+- In some cases, especially when the input/output is very long with a VL model, the accuracy of output may be incorrect. You may see many `!` or other unreadable characters in the output. We are working on it. It'll be fixed in the next release.
 - Improved and reduced the garbled code in model output. But if you still hit the issue, try changing generation config values such as `temperature` and try again. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/267) is welcome. [#277](https://github.com/vllm-project/vllm-ascend/pull/277)

 ## v0.7.1rc1
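The second known issue above suggests adjusting generation config values such as `temperature`. As a concrete illustration (the model name and parameter values here are illustrative assumptions, not part of this commit), that can be done through vLLM's offline API:

```python
from vllm import LLM, SamplingParams

# Illustrative model; substitute whatever model you are serving.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")

# Lowering temperature (and tightening top_p) makes sampling less random,
# which can reduce garbled characters in the output.
params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

outputs = llm.generate(["Explain what vLLM is in one paragraph."], params)
print(outputs[0].outputs[0].text)
```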

vllm_ascend/attention.py

Lines changed: 14 additions & 12 deletions

@@ -717,6 +717,8 @@ def __init__(
         self.query_len_cpu_tensor = None
         self.key_cache = None
         self.value_cache = None
+        # TODO: FIXME revert me when torch-npu sync issue is solved
+        self.output: torch.Tensor = None

     def forward(
         self,
@@ -755,11 +757,11 @@ def forward(
         value = value.contiguous()
         attn_type = self.attn_type

-        output = torch.empty(num_tokens,
-                             self.num_heads,
-                             self.head_size,
-                             dtype=query.dtype,
-                             device=query.device)
+        self.output = torch.empty(num_tokens,
+                                  self.num_heads,
+                                  self.head_size,
+                                  dtype=query.dtype,
+                                  device=query.device)

         if kv_cache.numel() > 0:
             if self.key_cache is None:
@@ -792,7 +794,7 @@ def forward(
                 block_tables,
                 isPrefill,
                 attn_metadata,
-                output,
+                self.output,
                 seq_lens_tensor_cpu=self.seq_lens_tensor_cpu)
         else:
             if self.key_cache is not None:
@@ -831,7 +833,7 @@ def forward(
                         is_causal=causal_attn and mask is None,
                         scale=self.scale).squeeze(0).movedim(
                             query.dim() - 2, 0)
-                    output[start_q:end_q, :, :] = sub_out
+                    self.output[start_q:end_q, :, :] = sub_out
                     start_q, start_kv = end_q, end_kv
                 else:
                     assert attn_metadata.attn_mask is not None
@@ -849,7 +851,7 @@ def forward(
                         scale_value=self.scale,
                         num_heads=self.num_heads,
                         num_kv_heads=self.num_kv_heads,
-                        out=output)
+                        out=self.output)
             elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled:
                 assert kv_cache is not None
                 assert attn_metadata.prefill_metadata is not None
@@ -875,7 +877,7 @@ def forward(
                     num_kv_heads=self.num_kv_heads,
                     num_heads=self.num_heads,
                     scale_value=self.scale,
-                    out=output)
+                    out=self.output)
             # Splitfuse
             else:
                 assert kv_cache is not None
@@ -897,7 +899,7 @@ def forward(
                     num_kv_heads=self.num_kv_heads,
                     num_heads=self.num_heads,
                     scale_value=self.scale,
-                    out=output)
+                    out=self.output)
         # Decode only
         else:
             assert kv_cache is not None
@@ -915,9 +917,9 @@ def forward(
                 scale_value=self.scale,
                 block_table=block_tables,
                 context_lens=self.seq_lens_tensor_cpu,
-                out=output)
+                out=self.output)

-        return output.view(num_tokens, self.hidden_size)
+        return self.output.view(num_tokens, self.hidden_size)


 class AscendMLAAttentionBackendImpl(MLAAttentionImpl):
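For completeness, the eventual root-cause fix would presumably restore the function-local buffer and rely on correct device synchronization instead. A sketch of what explicit synchronization could look like, assuming the torch-npu runtime mirrors the `torch.cuda` interface (`torch.npu.synchronize` is an assumption based on that parallel, not something this commit uses):

```python
import torch
# Assumption: importing torch_npu registers the `torch.npu` device runtime,
# mirroring the `torch.cuda` interface.
import torch_npu  # noqa: F401


def forward_with_sync(query: torch.Tensor, num_heads: int,
                      head_size: int) -> torch.Tensor:
    num_tokens = query.shape[0]
    output = torch.empty(num_tokens, num_heads, head_size,
                         dtype=query.dtype, device=query.device)
    # ... attention kernels would write into `output` via out= ...
    # Block until all queued NPU work has completed before the buffer's
    # lifetime is left to Python reference counting again.
    torch.npu.synchronize()
    return output.view(num_tokens, num_heads * head_size)
```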
