diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index b7e4ba4d7416..d55eb43766b6 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -29,7 +29,7 @@ def get_kv_cache_shape( head_size: int, *args, ) -> Tuple[int, ...]: - return (2, num_blocks, block_size * num_kv_heads * head_size) + return 2, num_blocks, block_size * num_kv_heads * head_size @staticmethod def split_kv_cache( diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 9e834befd68a..ff110e050bb6 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -3,7 +3,7 @@ """A CPU worker class.""" import os from importlib import util -from typing import Dict, List, Optional, Set, Tuple, Type +from typing import List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -88,13 +88,13 @@ def _allocate_kv_cache( torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu")) return kv_cache - def swap_in(self, src_to_dst: Dict[int, int]) -> None: + def swap_in(self, src_to_dst: torch.Tensor) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def swap_out(self, src_to_dst: Dict[int, int]) -> None: + def swap_out(self, src_to_dst: torch.Tensor) -> None: raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + def copy(self, src_to_dsts: torch.Tensor) -> None: self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts) @staticmethod