
Commit 98de2f9

Add output_scale to new attn backends
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
1 parent f2b0d01 · commit 98de2f9

File tree

3 files changed: +22 -0 lines changed

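The pattern is the same in all three backends: forward() gains an optional output_scale parameter, and any non-None value is rejected with NotImplementedError until fused output quantization is actually supported. Below is a minimal stand-alone sketch of that guard; the function and the backend_name parameter are illustrative only, the real code lives inside the backend impl classes shown in the diffs that follow.

from typing import Optional

import torch


def _reject_fused_output_quant(output_scale: Optional[torch.Tensor],
                               backend_name: str) -> None:
    # Sketch of the guard each backend now applies: fused output
    # quantization is not implemented yet, so a non-None output_scale is
    # rejected explicitly rather than silently ignored.
    if output_scale is not None:
        raise NotImplementedError(
            "fused output quantization is not yet supported"
            f" for {backend_name}")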

vllm/attention/backends/dual_chunk_flash_attn.py

Lines changed: 9 additions & 0 deletions
@@ -370,6 +370,8 @@ def forward( # type: ignore
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: DualChunkFlashAttentionMetadata,
+        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with DualChunkFlashAttention.
         Args:

@@ -383,6 +385,13 @@ def forward( # type: ignore
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        assert output is None, "Output tensor not supported for DualChunk"
+
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlashAttentionImpl")
+
         (
             query,
             query_succ,

vllm/v1/attention/backends/flashinfer.py

Lines changed: 6 additions & 0 deletions
@@ -547,6 +547,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: FlashInferMetadata,
         output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashInfer.

@@ -561,6 +562,11 @@ def forward(
         """
         assert output is not None, "Output tensor must be provided."

+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlashInferImpl")
+
         if attn_metadata is None:
             # Profiling run.
             return output

vllm/v1/attention/backends/flex_attention.py

Lines changed: 7 additions & 0 deletions
@@ -414,6 +414,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: FlexAttentionMetadata,
         output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FLexAttention.

@@ -427,6 +428,12 @@ def forward(
             shape = [num_tokens, num_heads * head_size]
         """
         assert output is not None, "Output tensor must be provided."
+
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for FlexAttentionImpl")
+
         enable_gqa = self.num_kv_heads != self.num_heads

         if attn_metadata is None:
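From a caller's perspective the new behavior is easy to observe. The snippet below uses a small stand-in class (not the vLLM impls themselves, which need a full layer/config setup to construct) that mirrors the guard added above; note the DualChunk backend differs slightly in that it asserts output is None, while the two v1 backends require a preallocated output tensor.

from typing import Optional

import torch


class _StubAttnImpl:
    # Stand-in that mirrors the guard these backends now apply; not vLLM code.
    def forward(self,
                output: Optional[torch.Tensor] = None,
                output_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
        assert output is not None, "Output tensor must be provided."
        if output_scale is not None:
            raise NotImplementedError(
                "fused output quantization is not yet supported"
                " for this backend")
        return output


impl = _StubAttnImpl()
out = torch.zeros(4, 8)
impl.forward(output=out)  # accepted: no output_scale passed
try:
    impl.forward(output=out, output_scale=torch.tensor(1.0))
except NotImplementedError as exc:
    print(f"rejected as expected: {exc}")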
