# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import numpy as np
import torch

from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend
from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
                                              TransferResult, TransferSpec)

logger = init_logger(__name__)


def expand_block_ids(block_ids: np.ndarray,
                     block_size_factor: int,
                     output: np.ndarray,
                     skip_count: int = 0):
    """
    Expand a list of block IDs into the IDs of their sub-blocks,
    assuming each block is composed of block_size_factor sub-blocks.
    Writes the result into the output array.
    The first skip_count sub-blocks of the first block are skipped.
    Note that skip_count must be less than block_size_factor.

    For example, if block_ids = [0, 1, 3] and block_size_factor = 4,
    the output is [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15],
    since 0 maps to [0, 1, 2, 3],
    1 maps to [4, 5, 6, 7],
    and 3 maps to [12, 13, 14, 15].
    """
    assert skip_count < block_size_factor

    first_range = np.arange(skip_count, block_size_factor)
    full_range = np.arange(0, block_size_factor)

    output_idx = 0
    for i, block_id in enumerate(block_ids):
        base_block_id = block_id * block_size_factor
        indices = first_range if i == 0 else full_range
        output_end_idx = output_idx + len(indices)
        output[output_idx:output_end_idx] = base_block_id + indices
        output_idx = output_end_idx


class CpuGpuOffloadingHandler(OffloadingHandler):

    def __init__(self, gpu_block_size: int, cpu_block_size: int,
                 num_cpu_blocks: int, gpu_caches: dict[str, torch.Tensor],
                 attn_backends: dict[str, type[AttentionBackend]]):
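        # a CPU block groups block_size_factor consecutive GPU blocks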
        assert cpu_block_size % gpu_block_size == 0
        self.block_size_factor = cpu_block_size // gpu_block_size

        # cuda streams for gpu->cpu and cpu->gpu
        self.d2h_stream = torch.cuda.Stream()
        self.h2d_stream = torch.cuda.Stream()

        # job_id -> transfer cuda event
        self.transfer_events: dict[int, torch.cuda.Event] = {}
        # list of cuda events available for re-use
        self.events_pool: list[torch.cuda.Event] = []

        pin_memory = is_pin_memory_available()

        # allocate cpu tensors
        logger.info("Allocating %d CPU tensors...", len(gpu_caches))
        self.gpu_tensors: list[torch.Tensor] = []
        self.cpu_tensors: list[torch.Tensor] = []
        self.kv_dim_before_num_blocks: list[bool] = []
        for layer_name, gpu_tensor in gpu_caches.items():
            self.gpu_tensors.append(gpu_tensor)

            gpu_shape = gpu_tensor.shape
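            # probe the backend's KV cache layout with arbitrary sizes to
            # detect which dimension holds num_blocks (1234 acts as a marker)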
            test_shape = attn_backends[layer_name].get_kv_cache_shape(
                num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256)
            if test_shape[0] == 1234:
                # shape is (num_blocks, ...)
                num_blocks_idx = 0
                self.kv_dim_before_num_blocks.append(False)
            else:
                # shape should be (2, num_blocks, ...)
                assert test_shape[0] == 2
                assert test_shape[1] == 1234
                assert gpu_shape[0] == 2

                num_blocks_idx = 1
                self.kv_dim_before_num_blocks.append(True)

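            # the CPU cache mirrors the GPU cache layout, but its num_blocks
            # dimension is sized in GPU-sized blocks:
            # num_cpu_blocks * block_size_factor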
            cpu_shape = list(gpu_shape)
            cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor

            logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
            self.cpu_tensors.append(
                torch.zeros(cpu_shape,
                            dtype=gpu_tensor.dtype,
                            device="cpu",
                            pin_memory=pin_memory))

    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
        src_spec, dst_spec = spec
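        # pick the stream, caches and block size factors according to the
        # transfer direction (CPU->GPU load vs. GPU->CPU store)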
        if isinstance(src_spec, CPULoadStoreSpec):
            assert isinstance(dst_spec, GPULoadStoreSpec)
            stream = self.h2d_stream
            src_tensors = self.cpu_tensors
            dst_tensors = self.gpu_tensors
            src_block_size_factor = self.block_size_factor
            dst_block_size_factor = 1
        else:
            assert isinstance(src_spec, GPULoadStoreSpec)
            assert isinstance(dst_spec, CPULoadStoreSpec)
            stream = self.d2h_stream
            src_tensors = self.gpu_tensors
            dst_tensors = self.cpu_tensors
            src_block_size_factor = 1
            dst_block_size_factor = self.block_size_factor

        src_blocks = src_spec.block_ids
        dst_blocks = dst_spec.block_ids
        assert src_blocks.ndim == 1
        assert dst_blocks.ndim == 1

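        # the number of GPU-sized sub-blocks on each side must match;
        # when storing to CPU, the first destination CPU block may be written
        # only partially, so skip its leading sub-blocks (a no-op when loading
        # to GPU, where dst_block_size_factor == 1)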
        dst_sub_blocks_to_skip = (-src_blocks.size % dst_block_size_factor)
        src_sub_block_count = src_blocks.size * src_block_size_factor

        assert (
            src_sub_block_count == dst_blocks.size * dst_block_size_factor -
            dst_sub_blocks_to_skip)

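        # build the (num_sub_blocks, 2) block mapping expected by
        # ops.swap_blocks: column 0 holds source block numbers and
        # column 1 holds destination block numbers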
        src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64)
        expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0])
        expand_block_ids(dst_blocks,
                         dst_block_size_factor,
                         src_to_dst[:, 1],
                         skip_count=dst_sub_blocks_to_skip)
        src_to_dst_tensor = torch.from_numpy(src_to_dst)

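        # reuse a pooled CUDA event if available, enqueue the copies on the
        # direction-specific stream, and record the event so that completion
        # can be polled later without blocking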
        event = self.events_pool.pop() if self.events_pool \
            else torch.cuda.Event()
        with torch.cuda.stream(stream):
            for src_tensor, dst_tensor, kv_dim in zip(
                    src_tensors, dst_tensors, self.kv_dim_before_num_blocks):
                if kv_dim:
                    src_key_cache = src_tensor[0]
                    dst_key_cache = dst_tensor[0]
                    ops.swap_blocks(src_key_cache, dst_key_cache,
                                    src_to_dst_tensor)
                    src_value_cache = src_tensor[1]
                    dst_value_cache = dst_tensor[1]
                    ops.swap_blocks(src_value_cache, dst_value_cache,
                                    src_to_dst_tensor)
                else:
                    ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
        event.record(stream)

        self.transfer_events[job_id] = event

        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        results: list[TransferResult] = []
        for job_id, event in self.transfer_events.items():
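            # event.query() is a non-blocking check that all work recorded
            # before the event has completed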
            if event.query():
                results.append((job_id, True))
                self.events_pool.append(event)
        for job_id, _ in results:
            del self.transfer_events[job_id]
        return results
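

# Illustrative sketch (not part of the original module): a quick CPU-only
# check of expand_block_ids, including the skip_count behavior used when the
# first destination CPU block is written only partially.
if __name__ == "__main__":
    # 3 blocks with a block_size_factor of 4 expand to 12 sub-block IDs
    out = np.empty(12, dtype=np.int64)
    expand_block_ids(np.array([0, 1, 3]), 4, out)
    assert out.tolist() == [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]

    # with skip_count=1, the first sub-block of the first block is skipped
    out = np.empty(11, dtype=np.int64)
    expand_block_ids(np.array([0, 1, 3]), 4, out, skip_count=1)
    assert out.tolist() == [1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]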