 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
+                                              AttentionMetadata, AttentionType)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import (
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable
 
+_PARTITION_SIZE_ROCM = 256
+
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -38,6 +38,9 @@ def _vllm_layout_trans_kernel(
         b_seq_lens_loc,
         block_table,
         block_table_stride_0,
+        k_scale,
+        v_scale,
+        output_dtype: tl.constexpr,
         E_DIM: tl.constexpr,
         BLOCK_SIZE: tl.constexpr,
     ):
@@ -59,16 +62,27 @@ def _vllm_layout_trans_kernel(
                           tl.arange(0, BLOCK_SIZE)[:, None]) < seq_len
 
             kv_idx = tl.load(block_table + batch_idx * block_table_stride_0 +
-                             block_idx)
+                             block_idx).to(tl.int64)
 
             kv_buffer_off = kv_idx * BLOCK_SIZE * E_DIM + tl.arange(
                 0, BLOCK_SIZE)[:, None] * E_DIM + tl.arange(0, E_DIM)[None, :]
             k_vals = tl.load(k_buffer_ptr + kv_buffer_off,
                              mask=block_mask,
                              other=0.0)
+            if k_vals.dtype.is_fp8():
+                k_vals = (k_vals.to(tl.float32) *
+                          tl.load(k_scale)).to(output_dtype)
+            else:
+                k_vals = k_vals.to(output_dtype)
+
             v_vals = tl.load(v_buffer_ptr + kv_buffer_off,
                              mask=block_mask,
                              other=0.0)
+            if v_vals.dtype.is_fp8():
+                v_vals = (v_vals.to(tl.float32) *
+                          tl.load(v_scale)).to(output_dtype)
+            else:
+                v_vals = v_vals.to(output_dtype)
 
             kv_values_off = batch_token_start * E_DIM + \
                 block_idx * BLOCK_SIZE * E_DIM + \
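The two new branches above perform per-tensor fp8 dequantization inside the layout-transform kernel: fp8 cache values are upcast to float32, multiplied by the layer's k/v scale, and cast to the attention compute dtype. For reference only, a minimal eager-PyTorch sketch of the same math (the helper name and shapes below are illustrative, not part of this patch):

import torch

def dequant_kv_tile(vals_fp8: torch.Tensor, scale: torch.Tensor,
                    out_dtype: torch.dtype) -> torch.Tensor:
    # Mirror of the kernel branch: upcast, apply the per-tensor scale,
    # then cast to the compute dtype (fp16/bf16).
    return (vals_fp8.to(torch.float32) * scale.to(torch.float32)).to(out_dtype)

# Example with a fake [BLOCK_SIZE, E_DIM] tile; needs a PyTorch build with
# float8 dtypes (float8_e4m3fnuz is the ROCm flavor used elsewhere in this file).
tile = torch.randn(16, 512).to(torch.float8_e4m3fnuz)
k_vals = dequant_kv_tile(tile, torch.tensor(0.025), torch.bfloat16)  # [16, 512] bf16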
@@ -78,21 +92,28 @@ def _vllm_layout_trans_kernel(
             tl.store(v_values_ptr + kv_values_off, v_vals, mask=block_mask)
 
     def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table,
-                          k_buffer, v_buffer, max_seq_len, total_tokens):
+                          k_buffer, v_buffer, max_seq_len, total_tokens,
+                          k_scale, v_scale, output_dtype):
         H_KV = v_buffer.shape[2]
         D = v_buffer.shape[3]
         BLOCK_SIZE = v_buffer.shape[1]
-        dtype = k_buffer.dtype
         k_values = torch.empty((total_tokens, H_KV, D),
-                               dtype=dtype,
+                               dtype=output_dtype,
                                device="cuda")
         v_values = torch.empty((total_tokens, H_KV, D),
-                               dtype=dtype,
+                               dtype=output_dtype,
                                device="cuda")
 
         grid = (block_table.shape[0],
                 (max_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE)
 
+        if output_dtype == torch.float16:
+            output_dtype = tl.float16
+        elif output_dtype == torch.bfloat16:
+            output_dtype = tl.bfloat16
+        else:
+            raise ValueError(f"Unsupported output dtype: {output_dtype}")
+
         _vllm_layout_trans_kernel[grid](k_buffer,
                                         v_buffer,
                                         k_values,
@@ -101,6 +122,9 @@ def vllm_layout_trans(b_query_lens_loc, b_seq_lens_loc, block_table,
                                         b_seq_lens_loc,
                                         block_table,
                                         block_table.stride(0),
+                                        k_scale,
+                                        v_scale,
+                                        output_dtype=output_dtype,
                                         E_DIM=H_KV * D,
                                         BLOCK_SIZE=BLOCK_SIZE)
 
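For readers unfamiliar with the paged KV layout, a rough eager-mode equivalent of what this launch computes is sketched below. It is illustrative only (not part of the PR, and much slower): gather each request's KV blocks into contiguous [total_tokens, H_KV, D] tensors and apply the same per-tensor dequantization when the cache is fp8.

import torch

def layout_trans_reference(b_seq_lens_loc, block_table, k_buffer, v_buffer,
                           k_scale, v_scale, output_dtype):
    # k_buffer / v_buffer: [num_blocks, BLOCK_SIZE, H_KV, D] paged caches.
    # b_seq_lens_loc: cumulative KV lengths per request, shape [num_reqs + 1].
    block_size = k_buffer.shape[1]
    fp8_dtypes = (torch.float8_e4m3fnuz, torch.float8_e4m3fn)
    ks, vs = [], []
    for i in range(b_seq_lens_loc.numel() - 1):
        seq_len = int(b_seq_lens_loc[i + 1] - b_seq_lens_loc[i])
        n_blocks = (seq_len + block_size - 1) // block_size
        blocks = block_table[i, :n_blocks].long()
        k = k_buffer[blocks].flatten(0, 1)[:seq_len]
        v = v_buffer[blocks].flatten(0, 1)[:seq_len]
        if k.dtype in fp8_dtypes:
            k = k.to(torch.float32) * k_scale
            v = v.to(torch.float32) * v_scale
        ks.append(k.to(output_dtype))
        vs.append(v.to(output_dtype))
    return torch.cat(ks), torch.cat(vs)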
@@ -120,9 +144,12 @@ def flash_attn_varlen_func_impl(
         window_size: Optional[list[int]],  # -1 means infinite context window
         alibi_slopes: Optional[list[float]],
         block_table: torch.Tensor,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
     ) -> torch.Tensor:
         k, v = vllm_layout_trans(cu_seqlens_q, cu_seqlens_k, block_table,
-                                 k_cache, v_cache, max_seqlen_k, total_tokens)
+                                 k_cache, v_cache, max_seqlen_k, total_tokens,
+                                 k_scale, v_scale, q.dtype)
         output = aiter.flash_attn_varlen_func(
             q=q,
             k=k,
@@ -154,6 +181,8 @@ def flash_attn_varlen_func_fake(
         window_size: Optional[list[int]],  # -1 means infinite context window
         alibi_slopes: Optional[list[float]],
         block_table: torch.Tensor,
+        k_scale: torch.Tensor,
+        v_scale: torch.Tensor,
     ) -> torch.Tensor:
         return torch.empty(q.shape[0],
                            q.shape[1],
@@ -184,7 +213,6 @@ def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
         self.block_size = kv_cache_spec.block_size
         self.kv_cache_spec = kv_cache_spec
         self.block_table = block_table
-
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
         self.aot_sliding_window: Optional[tuple[int, int]] = None
@@ -281,6 +309,18 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
         prefix_kv_lens = None
         suffix_kv_lens = None
 
+        nbyes_per_qo_elem = torch.finfo(self.runner.dtype).bits // 8
+        max_num_partitions = (max_seq_len + _PARTITION_SIZE_ROCM -
+                              1) // _PARTITION_SIZE_ROCM
+
+        workspace_buffer = torch.empty(
+            (num_reqs * self.num_heads_q * max_num_partitions * self.headdim) *
+            nbyes_per_qo_elem + 2 *
+            (num_reqs * self.num_heads_q * max_num_partitions) * 4,
+            dtype=torch.uint8,
+            device=self.runner.device,
+        )
+
         attn_metadata = AiterFlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
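Moving this allocation into build() sizes the split-KV scratch space once per batch instead of on every forward() call (see the removed block further down). The first term holds one partial output of headdim elements per (request, query head, partition); the second term appears to reserve two float32 reduction values (running max and exp-sum) per partition. A worked example with illustrative numbers, not taken from the PR:

# Illustrative sizing only; real values come from the runner and model config.
num_reqs, num_heads_q, headdim = 8, 32, 128
max_seq_len = 4096
partition = 256                                    # _PARTITION_SIZE_ROCM
nbytes_per_qo_elem = 2                             # fp16/bf16 partial outputs
max_num_partitions = (max_seq_len + partition - 1) // partition        # 16

partial_outputs = (num_reqs * num_heads_q * max_num_partitions *
                   headdim * nbytes_per_qo_elem)                       # 1_048_576
reductions = 2 * (num_reqs * num_heads_q * max_num_partitions) * 4     # 32_768
workspace_bytes = partial_outputs + reductions                         # 1_081_344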
@@ -292,6 +332,7 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             block_table=block_table_tensor,
             slot_mapping=slot_mapping,
             use_cascade=use_cascade,
+            workspace_buffer=workspace_buffer,
             common_prefix_len=common_prefix_len,
             cu_prefix_query_lens=cu_prefix_query_lens,
             prefix_kv_lens=prefix_kv_lens,
@@ -315,7 +356,7 @@ class AiterFlashAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_head_sizes() -> list[int]:
-        return [32, 64, 96, 128, 160, 192, 224, 256]
+        return [64, 128, 256]
 
     @staticmethod
     def get_name() -> str:
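The supported head-size list is narrowed, presumably to the sizes the AITER kernels actually cover. A caller-side guard could look like the sketch below; only get_supported_head_sizes() comes from this file, the surrounding call site is hypothetical.

head_size = 96  # e.g. taken from the model config
supported = AiterFlashAttentionBackend.get_supported_head_sizes()  # [64, 128, 256]
if head_size not in supported:
    raise ValueError(
        f"Head size {head_size} is not supported by AiterFlashAttentionBackend; "
        f"supported sizes: {supported}")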
@@ -364,6 +405,7 @@ class AiterFlashAttentionMetadata:
     total_tokens: int
     block_table: torch.Tensor
     slot_mapping: torch.Tensor
+    workspace_buffer: torch.Tensor
 
     # For cascade attention.
     use_cascade: bool
@@ -442,10 +484,6 @@ def __init__(
                 "are not implemented for "
                 "FlashAttentionImpl")
         self.use_irope = use_irope
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "AiterFlashAttention does not support fp8 kv-cache on this "
-                "device.")
 
     def forward(
         self,
@@ -516,12 +554,6 @@ def forward(
             if self.kv_cache_dtype.startswith("fp8"):
                 key_cache = key_cache.view(torch.float8_e4m3fnuz)
                 value_cache = value_cache.view(torch.float8_e4m3fnuz)
-                num_tokens, num_heads, head_size = query.shape
-                query, _ = ops.scaled_fp8_quant(
-                    query.reshape(
-                        (num_tokens, num_heads * head_size)).contiguous(),
-                    layer._q_scale)
-                query = query.reshape((num_tokens, num_heads, head_size))
 
         # Compute attention and update output up to `num_actual_tokens`.
         use_local_attn = \
@@ -559,28 +591,14 @@ def forward(
                 alibi_slopes=self.alibi_slopes,
                 window_size=self.sliding_window,
                 block_table=block_table,
-                cu_seqlens_k=(cu_seq_lens if not use_local_attn else
-                              local_metadata.local_cu_seq_lens),
+                cu_seqlens_k=cu_seq_lens,
+                k_scale=layer._k_scale,
+                v_scale=layer._v_scale,
             )
 
-            _, num_heads, head_size = query.shape
-            _PARTITION_SIZE_ROCM = 256
-            num_seqs = seqused_k.shape[0]
-            nbyes_per_qo_elem = torch.finfo(output.dtype).bits // 8
-            max_num_partitions = (max_seqlen_k + _PARTITION_SIZE_ROCM -
-                                  1) // _PARTITION_SIZE_ROCM
-
-            workspace_buffer = torch.empty(
-                (num_seqs * num_heads * max_num_partitions * head_size) *
-                nbyes_per_qo_elem + 2 *
-                (num_seqs * num_heads * max_num_partitions) * 4,
-                dtype=torch.uint8,
-                device=output.device,
-            )
-
-            aiter.paged_attention_v1(
+            torch.ops.aiter.paged_attention_v1(
                 output[:num_actual_tokens],
-                workspace_buffer,
+                attn_metadata.workspace_buffer,
                 query[:num_actual_tokens],
                 key_cache,
                 value_cache,