@@ -4,7 +4,6 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Type
 
-import numpy as np
 import torch
 from torch.nn.functional import scaled_dot_product_attention
 
@@ -22,9 +21,6 @@
 from vllm.attention.ops.paged_attn import PagedAttentionMetadata
 from vllm.logger import init_logger
 from vllm.utils import make_tensor_with_pad
-from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.worker.cpu_model_runner import CPUModelRunner
-from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder
 
 logger = init_logger(__name__)
@@ -78,59 +74,6 @@ def copy_blocks( |
         PagedAttention.copy_blocks(kv_caches, src_to_dists)
 
 
-class TorchSDPABackendV1:
-    accept_output_buffer: bool = False
-
-    @staticmethod
-    def get_name() -> str:
-        return "TORCH_SDPA"
-
-    @staticmethod
-    def get_impl_cls() -> Type["TorchSDPABackendImpl"]:
-        return TorchSDPABackendImpl
-
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return TorchSDPAMetadata
-
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-
-    @staticmethod
-    def get_builder_cls() -> Type["TorchSDPAMetadataBuilderV1"]:
-        return TorchSDPAMetadataBuilderV1
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return PagedAttention.get_kv_cache_shape(num_blocks, block_size,
-                                                 num_kv_heads, head_size)
-
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
-
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
-
-    @staticmethod
-    def use_cascade_attention(*args, **kwargs) -> bool:
-        return False
-
-
 @dataclass
 class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata):
     """Metadata for TorchSDPABackend.
@@ -343,105 +286,6 @@ def get_seq_len_block_table_args( |
         raise AttributeError(f"Invalid attention type {str(attn_type)}")
 
 
-class TorchSDPAMetadataBuilderV1:
-
-    def __init__(self, runner: CPUModelRunner) -> None:
-        self.runner = runner
-
-        # For reorder
-        self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs,
-                                                      dtype=np.int64)
-        self.reorder_decode_req_index_list = np.empty(self.runner.max_num_reqs,
-                                                      dtype=np.int64)
-        self.num_prompt_req: int = 0
-
-    def reorder_batch(self, input_batch: InputBatch,
-                      scheduler_output: SchedulerOutput) -> bool:
-        prompt_list_idx = 0
-        decode_list_idx = 0
-        for req_index in range(input_batch.num_reqs):
-            if input_batch.num_computed_tokens_cpu[
-                    req_index] < input_batch.num_prompt_tokens[req_index]:
-                # prompt stage
-                self.reorder_prompt_req_index_list[prompt_list_idx] = req_index
-                prompt_list_idx += 1
-            else:
-                # decode stage
-                self.reorder_decode_req_index_list[decode_list_idx] = req_index
-                decode_list_idx += 1
-        assert decode_list_idx + prompt_list_idx == input_batch.num_reqs
-
-        # Update prompt requests number
-        self.num_prompt_req = prompt_list_idx
-
-        reorder_req_num = 0
-        for req_index in range(decode_list_idx):
-            if self.reorder_decode_req_index_list[req_index] < prompt_list_idx:
-                reorder_req_num += 1
-            else:
-                break
-
-        if reorder_req_num == 0:
-            return False
-
-        reorder_prompt_list = (
-            self.reorder_prompt_req_index_list[:prompt_list_idx]
-            [-reorder_req_num:])
-        reorder_decode_list = (
-            self.reorder_decode_req_index_list[:decode_list_idx]
-            [:reorder_req_num])
-        assert reorder_decode_list.size == reorder_prompt_list.size
-
-        for idx in range(reorder_req_num):
-            prompt_req_index = reorder_prompt_list[idx].item()
-            decode_req_index = reorder_decode_list[idx].item()
-            input_batch.swap_states(prompt_req_index, decode_req_index)
-
-        return True
-
-    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
-              common_prefix_len: int):
-        runner = self.runner
-        seq_lens_np = runner.seq_lens_np[:num_reqs]
-        num_prompt_req = self.num_prompt_req
-        max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item(
-        ) if num_prompt_req > 0 else 0
-        max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item(
-        ) if num_prompt_req < num_reqs else 0
-        runner.seq_start_loc_np[0] = 0
-        np.cumsum(seq_lens_np, out=runner.seq_start_loc_np[1:num_reqs + 1])
-        num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item()
-        num_decode_tokens = runner.query_start_loc_np[num_reqs].item(
-        ) - num_prefill_tokens
-        slot_mapping = runner.slot_mapping_cpu[:num_actual_tokens].long()
-        block_table_tensor = runner.input_batch.block_table.get_device_tensor()
-        attn_metadata = TorchSDPAMetadata(
-            num_prefills=num_prompt_req,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            slot_mapping=slot_mapping,
-            seq_lens_tensor=runner.seq_lens_cpu[num_prompt_req:num_reqs],  # decode
-            max_decode_seq_len=max_decode_seq_len,  # decode
-            block_tables=block_table_tensor[num_prompt_req:num_reqs],  # decode
-            chunked_prefill=True,
-            max_query_len=max_query_len,
-            max_kv_len=max_prefill_seq_len,
-            prefill_query_start_loc=runner.query_start_loc_cpu[:num_prompt_req + 1],  # prefill
-            kv_start_loc=runner.seq_start_loc_cpu[:num_prompt_req + 1],  # prefill
-            prefill_block_tables=block_table_tensor[:num_prompt_req],  # prefill
-            query_start_loc=runner.query_start_loc_cpu[:num_reqs + 1],  # for logits index
-            multi_modal_placeholder_index_maps=None,
-            enable_kv_scales_calculation=False,
-        )
-
-        return attn_metadata
-
-
 class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
 
     def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: