diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 42506730e868..737b2eede9c6 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -66,10 +66,10 @@ function cpu_tests() { tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$NUMA_NODE" bash -c " - set -e - VLLM_USE_V1=0 pytest -s -v \ - tests/quantization/test_ipex_quant.py" + # docker exec cpu-test-"$NUMA_NODE" bash -c " + # set -e + # VLLM_USE_V1=0 pytest -s -v \ + # tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 827649bfcf54..cf3aaab8493b 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -26,7 +26,5 @@ docker run \ --name "${container_name}" \ "${image_name}" \ sh -c ' - VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m - VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager ' diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 28dba9a6f688..815f02a4bfd5 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 0437bb8293ce..7d7522c1fc00 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -36,7 +36,8 @@ def clear_cache(): DEVICE_MLA_BLOCK_SIZES = { "cuda": [16, 64], # CUDA supports both standard and extended block sizes "hip": [16, 1], # HIP requires special handling for block_size=1 - "cpu": [16] # CPU uses fixed block size from test cases + # "cpu": [16] # CPU uses fixed block size from test cases + "cpu": [] # FIXME(woosuk): Temporarily disable CPU tests } @@ -81,14 +82,14 @@ def test_env( m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") if device == "cpu": + if not use_v1: + pytest.skip("CPU backend only supports V1") + with patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, block_size, False) - if use_v1: - assert backend.get_name() == "TORCH_SDPA_VLLM_V1" - else: - assert backend.get_name() == "TORCH_SDPA" + assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "hip": with patch("vllm.attention.selector.current_platform", @@ -193,12 +194,14 @@ def test_fp32_fallback( m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") if device == "cpu": + if not use_v1: + pytest.skip("CPU backend only supports V1") + with 
patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float32, torch.float32, 16, False) - assert (backend.get_name() == "TORCH_SDPA_VLLM_V1" - if use_v1 else "TORCH_SDPA") + assert backend.get_name() == "TORCH_SDPA_VLLM_V1" elif device == "cuda": with patch("vllm.attention.selector.current_platform", diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py deleted file mode 100644 index 793cb87b7434..000000000000 --- a/vllm/attention/backends/cpu_mla.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch - -import vllm._custom_ops as ops -from vllm._ipex_ops import ipex_ops -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.mla.common import MLACommonImpl, MLACommonState -from vllm.attention.backends.torch_sdpa import TorchSDPAMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder - - -class CPUMLABackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "CPU_MLA" - - @staticmethod - def get_metadata_cls() -> Type["CPUMLAMetadata"]: - return CPUMLAMetadata - - @staticmethod - def get_builder_cls() -> Type["CPUMLAMetadataBuilder"]: - return CPUMLAMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["MLACommonState"]: - return MLACommonState - - @staticmethod - def get_impl_cls() -> Type["CPUMLAImpl"]: - return CPUMLAImpl - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, # assumed to be 1 for MLA - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - ops.copy_blocks_mla(kv_caches, src_to_dists) - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [576] - - -@dataclass -class CPUMLAMetadata(TorchSDPAMetadata): - # New for MLA - # Input positions for rotrary embeddings since for MLA the rotary - # position embeddings are applied inside the attention backend - input_positions: torch.Tensor = None - - # required by MLACommonImpl - is_profile_run: bool = False - - -class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]): - - def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: - self.chunked_prefill = input_builder.chunked_prefill - self.input_builder = input_builder - assert not self.chunked_prefill, \ - "chunked prefill is currently not supported" - - def prepare(self): - self.input_data = self.input_builder.input_data - - def build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size): - input_data = self.input_data - prefill_seq_lens = seq_lens[0:input_data.num_prefills] - prefill_query_lens = query_lens[0:input_data.num_prefills] - slot_mapping = torch.tensor(input_data.slot_mapping, - dtype=torch.long, - device="cpu") - - # metadata for prefill - if input_data.num_prefills > 0: - query_lens_tensor = torch.tensor(prefill_query_lens, - dtype=torch.int32, - 
device="cpu") - kv_lens_tensor = torch.tensor(prefill_seq_lens, - dtype=torch.int32, - device="cpu") - query_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - kv_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - torch.cumsum(query_lens_tensor, - dim=0, - dtype=torch.int32, - out=query_start_loc[1:]) - torch.cumsum(kv_lens_tensor, - dim=0, - dtype=torch.int32, - out=kv_start_loc[1:]) - max_query_len = max(prefill_query_lens) - max_kv_len = max(prefill_seq_lens) - - # for chunked-prefill - if self.chunked_prefill: - prefill_block_tables = make_tensor_with_pad( - self.input_data.prefill_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - prefill_block_tables = None - - else: - query_start_loc = None - kv_start_loc = None - max_query_len = None - max_kv_len = None - prefill_block_tables = None - - # metadata for decode - if input_data.num_decode_tokens != 0: - seq_lens_tensor = torch.tensor( - input_data.seq_lens[input_data.num_prefills:], - dtype=torch.int32, - device="cpu", - ) - block_tables = make_tensor_with_pad( - self.input_data.decode_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor( - input_data.seq_lens[:input_data.num_prefills], - dtype=torch.int32, - device="cpu", - ) - - # For multi-modal models - placeholder_index_maps = None - if len(input_data.multi_modal_inputs_list) != 0: - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - input_data.multi_modal_placeholder_maps.items() - } - - return CPUMLAMetadata( - chunked_prefill=self.chunked_prefill, - seq_lens=prefill_seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_kv_len=max_kv_len, - prefill_query_start_loc=query_start_loc, - kv_start_loc=kv_start_loc, - max_decode_seq_len=input_data.max_decode_seq_len, - num_prefills=input_data.num_prefills, - num_prefill_tokens=input_data.num_prefill_tokens, - num_decode_tokens=input_data.num_decode_tokens, - block_tables=block_tables, - prefill_block_tables=prefill_block_tables, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - input_positions=torch.tensor([self.input_data.input_positions])) - - -class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], - logits_soft_cap: Optional[float], - attn_type: str, - kv_sharing_target_layer_name: Optional[str], - # MLA Specific Arguments - **mla_args) -> None: - super().__init__(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, - kv_sharing_target_layer_name, **mla_args) - - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] - if any(unsupported_features): - raise NotImplementedError( - "CPUMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "CPUMLAImpl") - - # states is implemented. 
- if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "CPUMLAImpl with FP8 KV cache not yet supported") - - def _forward_prefill( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: CPUMLAMetadata, # type: ignore[override] - ) -> torch.Tensor: - - prefill_metadata = attn_metadata.prefill_metadata - assert prefill_metadata is not None - - kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\ - -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv_nope\ - .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) - - # For MLA the v head dim is smaller than qk head dim so we pad out - # v with 0s to match the qk head dim - v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]], - value=0) - - output = torch.empty_like(q) - ipex_ops.varlen_attention( - query=q, - key=k, - value=v_padded, - out=output, - seqlen_q=prefill_metadata.prefill_query_start_loc, - seqlen_k=prefill_metadata.prefill_query_start_loc, - max_seqlen_q=prefill_metadata.max_query_len, - max_seqlen_k=prefill_metadata.max_query_len, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None, - logits_soft_cap=0.0, - window_size_left=-1, - window_size_right=-1, - alibi_slopes=None, - ) - - # remove padding - output = output.view(-1, self.num_heads, - q.shape[-1])[..., :v.shape[-1]] - return output.reshape(-1, self.num_heads * v.shape[-1]) - - def _forward_decode( - self, - q_nope: torch.Tensor, - q_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - attn_metadata: CPUMLAMetadata, # type: ignore[override] - ) -> torch.Tensor: - assert kv_c_and_k_pe_cache.numel() > 0 - - decode_meta = attn_metadata.decode_metadata - assert decode_meta is not None - - q = torch.cat([q_nope, q_pe], dim=-1) - o = q.new_empty(q.shape[0], self.num_heads, self.kv_lora_rank) - - # Run MQA - ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale, - decode_meta.block_tables, - decode_meta.seq_lens_tensor) - return self._v_up_proj(o) diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py deleted file mode 100644 index 410ada3b0828..000000000000 --- a/vllm/attention/backends/ipex_attn.py +++ /dev/null @@ -1,403 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" Attention layer with torch scaled_dot_product_attention - and PagedAttention.""" -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch - -from vllm._ipex_ops import ipex_ops -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - -_PARTITION_SIZE = 512 - - -class IpexAttnBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "IPEX" - - @staticmethod - def get_impl_cls() -> Type["IpexAttnBackendImpl"]: - return IpexAttnBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["IpexAttnMetadata"]: - return IpexAttnMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return 
CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - from vllm._ipex_ops import ipex_ops as ops - ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - from vllm._ipex_ops import ipex_ops as ops - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -@dataclass -class IpexAttnMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for IpexAttnBackend. - """ - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. - is_prompt: bool - slot_mapping: torch.Tensor - seq_lens: Optional[List[int]] - seqlen_q: Optional[torch.Tensor] - max_seqlen: Optional[int] - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None - - @property - def prefill_metadata(self) -> Optional["IpexAttnMetadata"]: - # Currently chunked prefill is not supported - if self.num_decode_tokens == 0: - assert self.num_prefills > 0 - return self - - return None - - @property - def decode_metadata(self) -> Optional["IpexAttnMetadata"]: - # Currently chunked prefill is not supported - if self.num_prefills > 0: - assert self.num_decode_tokens == 0 - return None - - return self - - -class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if use_irope: - logger.warning_once( - "Using irope in Ipex is not supported yet, it will fall" - " back to global attention for long context.") - if blocksparse_params is not None: - raise ValueError( - "IPEX backend does not support block-sparse attention.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.need_mask = (self.sliding_window is not None) - if logits_soft_cap is None: - logits_soft_cap = -1 - self.logits_soft_cap = logits_soft_cap - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not 
supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - if is_quantized_kv_cache(kv_cache_dtype): - raise NotImplementedError( - "IPEX backend does not support FP8 KV cache. " - "Please use xFormers backend instead.") - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "IpexAttnBackendImpl") - - def split_kv_cache( - self, - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - x = 1 - num_blocks = kv_cache.shape[1] - - key_cache = kv_cache[0] - key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, - -1, x) - value_cache = kv_cache[1] - value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) - return key_cache, value_cache - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: IpexAttnMetadata, # type: ignore - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with IPEX varlen_attention and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for IpexAttentionImpl") - - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - num_tokens, hidden_size = query.shape - # Reshape the query, key, and value tensors. 
- query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if kv_cache.numel() > 0: - key_cache, value_cache = self.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - ipex_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping.flatten(), - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - - if attn_metadata.is_prompt: - assert attn_metadata.seq_lens is not None - if (kv_cache.numel() == 0 - or attn_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=1) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=1) - - if attn_metadata.attn_bias is None: - if self.sliding_window is not None: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, self.sliding_window, - query.dtype) # type: ignore - else: - att_masks = _make_sliding_window_bias( - attn_metadata.seq_lens, None, dtype=query.dtype) - attn_metadata.attn_bias = att_masks - - output = torch.empty( - (num_tokens, self.num_heads, self.head_size), - dtype=query.dtype, - device=query.device) - ipex_ops.varlen_attention( - query, - key, - value, - output, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - self.alibi_slopes, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None, - window_size_left=-1, - window_size_right=-1, - logits_soft_cap=self.logits_soft_cap, - ) - else: - # prefix-enabled attention - raise RuntimeError( - "IPEX backend doesn't support prefix decoding.") - - else: - # Decoding run. - max_seq_len = attn_metadata.max_decode_seq_len - output = torch.empty_like(query) - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ((max_seq_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory - # shortage. - use_v1 = (max_seq_len <= 8192 and - (max_num_partitions == 1 or num_seqs * num_heads > 512)) - if use_v1: - # Run PagedAttention V1. - ipex_ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - else: - # Run PagedAttention V2. 
- assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ipex_ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - self.num_kv_heads, - self.scale, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - block_size, - max_seq_len, - self.alibi_slopes, - self.kv_cache_dtype, - layer._k_scale_float, - layer._v_scale_float, - ) - - # Reshape the output tensor. - return output.view(-1, self.num_heads * self.head_size) - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[torch.Tensor]: - attn_biases = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - bias = bias[None, :] - bias[:, None] - - num_heads = alibi_slopes.shape[0] - bias = bias[None, :].repeat((num_heads, 1, 1)) - bias.mul_(alibi_slopes[:, None, None]) - inf_mask = torch.empty( - (1, seq_len, seq_len), - dtype=bias.dtype, - device=alibi_slopes.device).fill_(-torch.inf).triu_(diagonal=1) - attn_biases.append((bias + inf_mask).to(dtype)) - - return attn_biases - - -def _make_sliding_window_bias( - seq_lens: List[int], - window_size: Optional[int], - dtype: torch.dtype, -) -> List[torch.Tensor]: - attn_biases = [] - for seq_len in seq_lens: - tensor = torch.full( - (1, seq_len, seq_len), - dtype=dtype, - fill_value=1, - ) - shift = 0 - mask = torch.tril(tensor, diagonal=shift).to(dtype) # type: ignore - if window_size is not None: - mask = torch.triu(mask, diagonal=shift - window_size + 1) - mask = torch.log(mask) - attn_biases.append(mask.to(dtype)) - - return attn_biases diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py deleted file mode 100644 index c900666955a3..000000000000 --- a/vllm/attention/backends/pallas.py +++ /dev/null @@ -1,356 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import torch_xla.experimental.custom_kernel # Required to register custom ops. 
- -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType, - is_quantized_kv_cache) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class PallasAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "PALLAS" - - @staticmethod - def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: - return PallasAttentionBackendImpl - - @staticmethod - def get_metadata_cls() -> Type["PallasMetadata"]: - return PallasMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_kv_heads, num_blocks, block_size, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - raise RuntimeError("swap_blocks is not used for the TPU backend.") - - @torch.compile(backend="openxla") - @staticmethod - def copy_blocks( - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - src_to_dists: Tuple[torch.Tensor, torch.Tensor], - ) -> None: - src_indices, dst_indices = src_to_dists - for k_cache, v_cache in kv_caches: - torch.ops.xla.dynamo_set_buffer_donor_(k_cache, True) - k_cache[:, dst_indices] = k_cache[:, src_indices] - torch.ops.xla.dynamo_set_buffer_donor_(v_cache, True) - v_cache[:, dst_indices] = v_cache[:, src_indices] - - -@dataclass -class PallasMetadata(AttentionMetadata): - - # Currently, input sequences can only contain all prefills - # or all decoding. - block_tables: Optional[torch.Tensor] = None - context_lens: Optional[torch.Tensor] = None - effective_query_lens: Optional[torch.Tensor] = None - - @property - def prefill_metadata(self) -> Optional["PallasMetadata"]: - if self.num_prefills == 0: - return None - - assert self.num_decode_tokens == 0 - return self - - @property - def decode_metadata(self) -> Optional["PallasMetadata"]: - if self.num_decode_tokens == 0: - return None - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.block_tables is not None - assert self.context_lens is not None - return self - - -class PallasAttentionBackendImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if use_irope: - logger.warning_once( - "Using irope in Pallas is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.logits_soft_cap = logits_soft_cap - if head_size % 128 != 0: - raise NotImplementedError( - f"Head size must be a multiple of 128, found {head_size}.") - if alibi_slopes is not None: - raise NotImplementedError("Alibi slopes is not supported.") - if 
sliding_window is not None: - raise NotImplementedError("Sliding window is not supported.") - if is_quantized_kv_cache(kv_cache_dtype): - raise NotImplementedError("FP8 KV cache dtype is not supported.") - if blocksparse_params is not None: - raise NotImplementedError("Blocksparse is not supported.") - - if torch_xla.tpu.version() < 4: - raise NotImplementedError("TPU version must be 4 or higher.") - - self.megacore_mode = None - tpu_env = torch_xla.tpu.get_tpu_env() - tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None) - or tpu_env.get("TYPE", None) - or tpu_env.get("TPU_ACCELERATOR_TYPE", None)) - assert tpu_type is not None - tpu_type = tpu_type.lower() - - if (("lite" not in tpu_type) and ("v6" not in tpu_type)): - if self.num_kv_heads % 2 == 0: - self.megacore_mode = "kv_head" - else: - # NOTE(woosuk): If the batch size is not a multiple of 2, the - # megacore mode will be None. - self.megacore_mode = "batch" - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], - attn_metadata: PallasMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with Pallas attention. - - Args: - query: shape = [batch_size, seq_len, num_heads * head_size] - key: shape = [batch_size, seq_len, num_kv_heads * head_size] - value: shape = [batch_size, seq_len, num_kv_heads * head_size] - kv_cache[0] = [num_kv_heads, num_blocks, block_size, head_size] - kv_cache[1] = [num_kv_heads, num_blocks, block_size, head_size] - NOTE: kv_cache[0] and kv_cache[1] will be an empty tensor - with shape [0] for profiling run. - attn_metadata: Metadata for attention. - Returns: - shape = [batch_size, seq_len, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for PallasAttentionImpl") - - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 - batch_size, seq_len, hidden_size = query.shape - query = query.view(batch_size, seq_len, self.num_heads, self.head_size) - key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) - value = value.view(batch_size, seq_len, self.num_kv_heads, - self.head_size) - - if kv_cache[0].numel() > 0: - slot_mapping = attn_metadata.slot_mapping - key_cache, value_cache = kv_cache - write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping) - - query = query * self.scale - if attn_metadata.num_prefills > 0: - if attn_metadata.block_tables is None: - # Prefill without paged KV cache. - assert seq_len % 16 == 0, ( - "Pallas FlashAttention kernel requires seq_len to be a " - f"multiple of 16 but got {seq_len}") - - # Handle GQA/MQA. - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, - dim=-2) - key = key.view(batch_size, seq_len, self.num_heads, - self.head_size) - value = value.repeat_interleave(self.num_queries_per_kv, - dim=-2) - value = value.view(batch_size, seq_len, self.num_heads, - self.head_size) - # FlashAttention kernel requires the input shape to be - # [batch_size, num_heads, seq_len, d_model] - # while the input is [batch_size, seq_len, num_heads, d_model]. - # Permute the input to match the required format. 
- output = torch.ops.xla.flash_attention( - query.permute(0, 2, 1, 3), - key.permute(0, 2, 1, 3), - value.permute(0, 2, 1, 3), - True, - ) - output = output.permute(0, 2, 1, 3) - else: - # Prefill with paged KV cache. - # TODO(woosuk): Tune the below knobs. - num_kv_pages_per_compute_block = 16 - num_queries_per_compute_block = 16 - assert seq_len % num_queries_per_compute_block == 0 - output = torch.ops.xla.multi_queries_paged_attention( - query, - key_cache, - value_cache, - attn_metadata.context_lens, - attn_metadata.block_tables, - attn_metadata.effective_query_lens, - num_kv_pages_per_compute_block, - num_queries_per_compute_block, - use_kernel=True, - attn_logits_soft_cap=self.logits_soft_cap, - ) - else: - # Decoding run. - assert kv_cache[0].numel() > 0 - query = query.squeeze(dim=1) - pages_per_compute_block = 16 # TODO(woosuk): Tune this value. - - assert attn_metadata.block_tables is not None - assert attn_metadata.context_lens is not None - # NOTE(woosuk): The PagedAttention Pallas kernel stores the entire - # block table in SMEM. Therefore, if the block table is too large, - # the kernel compilation will fail. To avoid this, we split the - # batch dimension into smaller chunks and run the kernel multiple - # times. - MAX_SMEM_USAGE = 512 * 1024 - size_per_seq = 4 * attn_metadata.block_tables.shape[1] - max_num_seq = MAX_SMEM_USAGE // size_per_seq - - if batch_size <= max_num_seq: - output = paged_attention( - query, - key_cache, - value_cache, - attn_metadata.context_lens, - attn_metadata.block_tables, - pages_per_compute_block, - self.megacore_mode, - attn_logits_soft_cap=self.logits_soft_cap, - ) - else: - chunk_size = max_num_seq - # Make sure the chunk size is a multiple of 2. - chunk_size = chunk_size // 2 * 2 - num_chunks = (batch_size + chunk_size - 1) // chunk_size - - output = torch.empty_like(query) - for chunk_idx in range(num_chunks): - chunk_start = chunk_idx * chunk_size - chunk_end = chunk_start + chunk_size - # NOTE(woosuk): We skip this line because it causes Dynamo - # compilation error. Instead, we rely on the slice operation - # to handle the out-of-bound case. - # chunk_end = min(chunk_end, batch_size) - chunk_output = paged_attention( - query[chunk_start:chunk_end], - key_cache, - value_cache, - attn_metadata.context_lens[chunk_start:chunk_end], - attn_metadata.block_tables[chunk_start:chunk_end], - pages_per_compute_block, - self.megacore_mode, - attn_logits_soft_cap=self.logits_soft_cap, - ) - output[chunk_start:chunk_end] = chunk_output - - # Reshape the output tensor. 
- return output.reshape(batch_size, seq_len, hidden_size) - - -def write_to_kv_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, -) -> None: - torch.ops.xla.dynamo_set_buffer_donor_(key_cache, True) - torch.ops.xla.dynamo_set_buffer_donor_(value_cache, True) - - key = key.flatten(0, 2) - value = value.flatten(0, 2) - key_cache = key_cache.flatten(0, 2) - value_cache = value_cache.flatten(0, 2) - key_cache.index_copy_(0, slot_mapping, key) - value_cache.index_copy_(0, slot_mapping, value) - - -def paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - context_lens: torch.Tensor, - block_tables: torch.Tensor, - pages_per_compute_block: int, - megacore_mode: Optional[str], - *, - attn_logits_soft_cap: Optional[float], -) -> torch.Tensor: - batch_size = query.shape[0] - if megacore_mode == "batch" and batch_size % 2 != 0: - megacore_mode = None - else: - megacore_mode = megacore_mode - - return torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - megacore_mode=megacore_mode, - attn_logits_soft_cap=attn_logits_soft_cap, - ) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index af5fe81dc883..a490aa397991 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -3,78 +3,24 @@ """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional import torch from torch.nn.functional import scaled_dot_product_attention # yapf conflicts with isort for this block # yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, +from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, + AttentionMetadata, AttentionType, is_quantized_kv_cache) # yapf: enable -from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention, _use_ipex from vllm.attention.ops.paged_attn import PagedAttentionMetadata from vllm.logger import init_logger -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder logger = init_logger(__name__) -class TorchSDPABackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "TORCH_SDPA" - - @staticmethod - def get_impl_cls() -> Type["TorchSDPABackendImpl"]: - return TorchSDPABackendImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return TorchSDPAMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]: - return TorchSDPAMetadataBuilder - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - raise NotImplementedError("Swap is not supported in TorchSDPABackend.") - - @staticmethod - def copy_blocks( - kv_caches: 
List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - @dataclass class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): """Metadata for TorchSDPABackend. @@ -287,113 +233,6 @@ def get_seq_len_block_table_args( raise AttributeError(f"Invalid attention type {str(attn_type)}") -class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): - - def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: - self.chunked_prefill = input_builder.chunked_prefill - self.input_builder = input_builder - - def prepare(self): - self.input_data = self.input_builder.input_data - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: - input_data = self.input_data - prefill_seq_lens = seq_lens[0:input_data.num_prefills] - prefill_query_lens = query_lens[0:input_data.num_prefills] - slot_mapping = torch.tensor(input_data.slot_mapping, - dtype=torch.long, - device="cpu") - - # For chunked-prefill - if self.chunked_prefill and input_data.num_prefill_tokens != 0: - prefill_block_tables = make_tensor_with_pad( - self.input_data.prefill_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - query_lens_tensor = torch.tensor(prefill_query_lens, - dtype=torch.int32, - device="cpu") - kv_lens_tensor = torch.tensor(prefill_seq_lens, - dtype=torch.int32, - device="cpu") - query_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - kv_start_loc = torch.zeros(input_data.num_prefills + 1, - dtype=torch.int32, - device="cpu") - torch.cumsum(query_lens_tensor, - dim=0, - dtype=torch.int32, - out=query_start_loc[1:]) - torch.cumsum(kv_lens_tensor, - dim=0, - dtype=torch.int32, - out=kv_start_loc[1:]) - max_query_len = max(prefill_query_lens) - max_kv_len = max(prefill_seq_lens) - else: - prefill_block_tables = None - query_start_loc = None - kv_start_loc = None - max_query_len = None - max_kv_len = None - - # For paged attention - if input_data.num_decode_tokens != 0: - seq_lens_tensor = torch.tensor( - input_data.seq_lens[input_data.num_prefills:], - dtype=torch.int32, - device="cpu", - ) - block_tables = make_tensor_with_pad( - self.input_data.decode_block_tables, - pad=0, - dtype=torch.int32, - device="cpu", - ) - else: - block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor( - input_data.seq_lens[:input_data.num_prefills], - dtype=torch.int32, - device="cpu", - ) - - # For multi-modal models - placeholder_index_maps = None - if len(input_data.multi_modal_inputs_list) != 0: - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - input_data.multi_modal_placeholder_maps.items() - } - - attn_metadata = TorchSDPAMetadata( - chunked_prefill=self.chunked_prefill, - seq_lens=prefill_seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_kv_len=max_kv_len, - prefill_query_start_loc=query_start_loc, - kv_start_loc=kv_start_loc, - max_decode_seq_len=input_data.max_decode_seq_len, - num_prefills=input_data.num_prefills, - num_prefill_tokens=input_data.num_prefill_tokens, - num_decode_tokens=input_data.num_decode_tokens, - block_tables=block_tables, - prefill_block_tables=prefill_block_tables, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - ) - - return attn_metadata - - class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): def __init__( 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index dccd60f4463a..1050d3c59344 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -64,13 +64,11 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, if selected_backend and selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) if use_mla: - logger.info("Using CPU MLA backend.") - return "vllm.attention.backends.cpu_mla.CPUMLABackend" + raise NotImplementedError("MLA is not supported on CPU.") logger.info("Using Torch SDPA backend.") - if use_v1: - return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" - else: - return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" + if not use_v1: + raise ValueError("CPU backend only supports V1.") + return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: @@ -147,26 +145,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "mp" if parallel_config.worker_cls == "auto": - if vllm_config.speculative_config: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.cpu_worker.CPUWorker" - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.cpu_worker.CPUWorker" - else: - parallel_config.worker_cls = \ - "vllm.worker.cpu_worker.CPUWorker" + parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker" # Note: workaround for v1 gpu_model_runner from vllm.config import CompilationLevel vllm_config.compilation_config.cudagraph_capture_sizes = [] compilation_config = vllm_config.compilation_config - if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE): + if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE: # Note: vLLM V1 is using PIECEWISE level compilation, which will # take time to compile kernels just-in-time with the inductor diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 0387e348965d..a8c8cb46de2c 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -6,7 +6,6 @@ import torch from tpu_info import device -import vllm.envs as envs from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType @@ -50,12 +49,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, and selected_backend != _Backend.PALLAS_VLLM_V1): logger.info("Cannot use %s backend on TPU.", selected_backend) - if use_v1: - logger.info("Using Pallas V1 backend.") - return "vllm.v1.attention.backends.pallas.PallasAttentionBackend" - else: - logger.info("Using Pallas backend.") - return "vllm.attention.backends.pallas.PallasAttentionBackend" + if not use_v1: + raise ValueError("TPU backend only supports V1.") + logger.info("Using Pallas V1 backend.") + return "vllm.v1.attention.backends.pallas.PallasAttentionBackend" @classmethod def get_device_name(cls, device_id: int = 0) -> str: @@ -68,7 +65,7 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return not envs.VLLM_USE_V1 + return False @classmethod def get_punica_wrapper(cls) -> str: @@ -117,31 +114,19 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "Using 
bfloat16 instead.", vllm_config.model_config.dtype) vllm_config.model_config.dtype = torch.bfloat16 - if envs.VLLM_USE_V1: - from vllm.v1.attention.backends.pallas import ( - PallasAttentionBackend) - cache_config.block_size = PallasAttentionBackend.get_page_size( - vllm_config) # type: ignore[assignment] + from vllm.v1.attention.backends.pallas import PallasAttentionBackend + cache_config.block_size = PallasAttentionBackend.get_page_size( + vllm_config) # type: ignore[assignment] parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker" - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.tpu_worker.TPUWorker" - else: - parallel_config.worker_cls = \ - "vllm.worker.tpu_worker.TPUWorker" + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on vLLM V1. Please launch without " + "--num-scheduler-steps.") + parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker" assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") @@ -189,13 +174,9 @@ def validate_request( processed_inputs: ProcessorInputs, ) -> None: """Raises if this request is unsupported on this platform""" - if isinstance(params, SamplingParams): - if params.guided_decoding is not None and not envs.VLLM_USE_V1: - raise ValueError("Structured output is not supported on " - f"{cls.device_name} V0.") - if params.sampling_type == SamplingType.RANDOM_SEED: - raise ValueError( - "Torch XLA does not support per-request seed.") + if (isinstance(params, SamplingParams) + and params.sampling_type == SamplingType.RANDOM_SEED): + raise ValueError("Torch XLA does not support per-request seed.") try: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 61a0453dcbc8..5bd34033233a 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -39,12 +39,10 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) use_v1 = envs.VLLM_USE_V1 - if use_v1: - logger.info("Using Flash Attention backend on V1 engine.") - return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" - else: - logger.info("Using IPEX attention backend.") - return "vllm.attention.backends.ipex_attn.IpexAttnBackend" + if not use_v1: + raise ValueError("XPU backend only supports V1.") + logger.info("Using Flash Attention backend on V1 engine.") + return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" @classmethod def get_device_capability( @@ -77,10 +75,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config # in V1(or with ipex chunked prefill) block_size is 64 if cache_config and cache_config.block_size is None: - if envs.VLLM_USE_V1: - cache_config.block_size = 64 - else: - cache_config.block_size = 16 + cache_config.block_size = 64 # Instances created using VllmConfig() typically have model_config as # None by default. 
The modification involves adding a check to prevent @@ -106,11 +101,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # check and update parallel config parallel_config = vllm_config.parallel_config - if envs.VLLM_USE_V1: - parallel_config.worker_cls =\ - "vllm.v1.worker.xpu_worker.XPUWorker" - else: - parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker" if parallel_config.distributed_executor_backend is None: if parallel_config.world_size > 1: diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py deleted file mode 100644 index c99e2652a397..000000000000 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ /dev/null @@ -1,326 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast - -import torch - -from vllm.attention import AttentionMetadata -from vllm.forward_context import set_forward_context -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, - ModelInputForCPUBuilder, - ModelInputForCPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - - -@dataclasses.dataclass(frozen=True) -class EncoderDecoderModelInputForCPU(ModelInputForCPUWithSamplingMetadata): - """ - Used by the EncoderDecoderModelRunner. 
- """ - encoder_input_tokens: Optional[torch.Tensor] = None - encoder_input_positions: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "encoder_input_tokens": self.encoder_input_tokens, - "encoder_input_positions": self.encoder_input_positions, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "EncoderDecoderModelInputForCPU": - return cast( - EncoderDecoderModelInputForCPU, - super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) - - -class CPUEncoderDecoderModelRunner( - CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): - _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( - EncoderDecoderModelInputForCPU) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - def _list_to_int32_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.int32, device=self.device) - - def _list_to_long_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.long, device=self.device) - - def _empty_int32_tensor(self) -> torch.Tensor: - return self._list_to_int32_tensor([]) - - def _empty_long_tensor(self) -> torch.Tensor: - return self._list_to_long_tensor([]) - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, - Any]) -> EncoderDecoderModelInputForCPU: - return EncoderDecoderModelInputForCPU.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> EncoderDecoderModelInputForCPU: - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - ( - attn_metadata, - encoder_input_tokens_tensor, - encoder_input_positions_tensor, - ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, - model_input) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators) - return dataclasses.replace( - model_input, - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata, - encoder_input_tokens=encoder_input_tokens_tensor, - encoder_input_positions=encoder_input_positions_tensor, - virtual_engine=virtual_engine, - ) - - def _prepare_encoder_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - model_input: EncoderDecoderModelInputForCPU, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """Helper method to prepare the encoder- and cross-attn-related - model inputs based on a given sequence group. These additional inputs - are used to augment an already-computed `EncoderDecoderModelInput` - data structure which already has decoder-related model inputs - populated. 
- - Sets the following attn_metadata fields: - * `num_encoder_tokens` - * `encoder_seq_lens` - * `encoder_seq_lens_tensor` - * `max_encoder_seq_len` - * `cross_slot_mapping` - * `cross_block_tables` - - Constructs a new model inputs data structure, based on - (1) the existing fields in the `model_inputs` argument, - and (2) the following additional fields which are - computed (or in the case of `attn_metadata`, updated) - by this function: - * attn_metadata - * encoder_input_tokens - * encoder_input_positions - - Arguments: - - * seq_group_metadata_list: list of sequence groups for which to - compute inputs - * model_inputs: model inputs data structure with decoder-oriented - fields already computed. - - Return: - - * Updated model inputs data structure - """ - - if len(seq_group_metadata_list) == 0: - return (model_input.attn_metadata, None, None) - - # Since we are not supporting chunked prefill either the entire - # batch is prefill or it is decode - is_prompt = seq_group_metadata_list[0].is_prompt - - # Build encoder inputs - encoder_seq_lens: List[int] = [] - if is_prompt: - # Prefill phase. - cross_block_tables = self._empty_int32_tensor().view( - len(seq_group_metadata_list), -1) - - # Extract input tokens/positions, cross-attention slot-mapping, - # & seq len from each sequence group metadata - ( - encoder_input_tokens, - encoder_input_positions, - cross_slot_mapping, - ) = ( - [], - [], - [], - ) - for seq_group_metadata in seq_group_metadata_list: - # Build seq lens - seq_len = seq_group_metadata.encoder_seq_data.get_len() - token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() - encoder_seq_lens.append(seq_len) - - # Build slot mapping - for i in range(0, seq_len): - block_number = seq_group_metadata.cross_block_table[ - i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - cross_slot_mapping.append(slot) - - # Build encoder input tokens - encoder_input_tokens.extend(token_ids) - encoder_input_positions.extend(list(range(0, seq_len))) - - # Convert tokens/positions & cross-attention - # slot-mapping to encoder input tensors - encoder_input_tokens_tensor = self._list_to_long_tensor( - encoder_input_tokens) - encoder_input_positions_tensor = self._list_to_long_tensor( - encoder_input_positions) - cross_slot_mapping_tensor = self._list_to_long_tensor( - cross_slot_mapping) - - else: - # Decode phase. - encoder_input_tokens_tensor = self._empty_long_tensor() - encoder_input_positions_tensor = self._empty_long_tensor() - cross_slot_mapping_tensor = self._empty_long_tensor() - # Extract cross-attention block tables & - # seq len from each sequence group metadata. - # Cross-attention block tables are empty - # during vLLM memory profiling. 
- cross_block_tables = [] - for seq_group_metadata in seq_group_metadata_list: - for _ in range(len(seq_group_metadata.seq_data)): - encoder_seq_lens.append( - seq_group_metadata.encoder_seq_data.get_len()) - cross_block_table = seq_group_metadata.cross_block_table - cross_block_tables.append([] if ( - cross_block_table is None) else cross_block_table) - - max_len_of_block_table = max( - len(block_table) for block_table in cross_block_tables) - - cross_block_tables = make_tensor_with_pad( - cross_block_tables, - max_len=max_len_of_block_table, - pad=0, - dtype=torch.int32, - device=self.device, - ) - - # Compute encoder sequence lengths & encoder - # sequence starting offset tensors - max_encoder_seq_len = max(encoder_seq_lens, default=0) - encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + - 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - # Update attention metadata with encoder-oriented attributes - attn_metadata = model_input.attn_metadata - assert attn_metadata is not None - ( - attn_metadata.num_encoder_tokens, - attn_metadata.encoder_seq_lens, - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.cross_slot_mapping, - attn_metadata.cross_block_tables, - ) = ( - sum(encoder_seq_lens), - encoder_seq_lens, - encoder_seq_lens_tensor, - max_encoder_seq_len, - cross_slot_mapping_tensor, - cross_block_tables, - ) - - return (attn_metadata, encoder_input_tokens_tensor, - encoder_input_positions_tensor) - - @torch.no_grad() - def execute_model( - self, - model_input: EncoderDecoderModelInputForCPU, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - model_executable = self.model - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - "encoder_input_ids": - model_input.encoder_input_tokens, - "encoder_positions": - model_input.encoder_input_positions, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - "intermediate_tensors": - intermediate_tensors, - } - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable(**execute_model_kwargs) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - # Sample the next token. 
- output = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return [output] diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py deleted file mode 100644 index 68cdf65cafa7..000000000000 --- a/vllm/worker/cpu_model_runner.py +++ /dev/null @@ -1,671 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import weakref -from collections import defaultdict -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, - TypeVar, Union) - -import torch -from torch import nn - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.models import supports_lora, supports_multimodal -from vllm.multimodal import (BatchedTensorInputs, MultiModalKwargs, - MultiModalPlaceholderMap) -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU") -_PAD_SLOT_ID = -1 - - -@dataclass(frozen=True) -class ModelInputForCPU(ModelRunnerInputBase): - """ - Base class contains metadata needed for the base model forward pass on CPU - """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - token_type_ids: Optional[torch.Tensor] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "token_type_ids": self.token_type_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForCPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None - ) -> TModelInputForCPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -@dataclass(frozen=True) -class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): - """ - 
Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "token_type_ids": self.token_type_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForCPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): - - class ModelInputData: - - def __init__(self, use_mrope: bool): - self.use_mrope = use_mrope - self.input_tokens: List[int] = [] - self.input_positions: List[int] = [] - self.token_type_ids: Optional[List[int]] = [] - self.seq_lens: List[int] = [] - self.query_lens: List[int] = [] - self.prefill_block_tables: List[List[int]] = [] - self.decode_block_tables: List[List[int]] = [] - self.max_decode_seq_len: int = 0 - self.num_prefills: int = 0 - self.num_prefill_tokens: int = 0 - self.num_decode_tokens: int = 0 - self.slot_mapping: List[int] = [] - self.multi_modal_inputs_list: List[MultiModalKwargs] = [] - self.multi_modal_placeholder_maps: Dict[ - str, MultiModalPlaceholderMap] = defaultdict( - MultiModalPlaceholderMap) - self.input_mrope_positions: List[List[int]] = [[] - for _ in range(3)] - - def __init__(self, - runner: "CPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: - super().__init__() - self.runner = runner - self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled - or runner.cache_config.enable_prefix_caching) - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.device = self.runner.device - self.enable_lora = self.runner.lora_config is not None - if self.runner.attn_backend is not None: - # spec decode (e.g. 
Medusa) does not have atten backend - attn_backend = self.runner.attn_backend - self.att_metadata_builder = attn_backend.get_builder_cls()(self) - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] - self.input_data = ModelInputForCPUBuilder.ModelInputData( - self.runner.model_config.uses_mrope) - self.att_metadata_builder.prepare() - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - self.seq_group_metadata_list.append(seq_group_metadata) - - def set_seq_group_list( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): - self.seq_group_metadata_list = seq_group_metadata_list - - def build(self) -> ModelInputForCPU: - self._build_input_data() - - input_data = self.input_data - input_tokens = torch.tensor(input_data.input_tokens, - dtype=torch.long, - device="cpu") - input_positions = torch.tensor( - input_data.input_positions - if not any(input_data.input_mrope_positions) else - input_data.input_mrope_positions, - dtype=torch.long, - device="cpu") - token_type_ids = torch.tensor(input_data.token_type_ids, - dtype=torch.long, - device="cpu") \ - if input_data.token_type_ids else None - - # For multi-modal models - multi_modal_kwargs = None - if len(input_data.multi_modal_inputs_list) != 0: - multi_modal_kwargs = MultiModalKwargs.batch( - input_data.multi_modal_inputs_list) - - attn_metadata = self.att_metadata_builder.build( - input_data.seq_lens, input_data.query_lens, -1, -1) - - is_prompt = (self.seq_group_metadata_list[0].is_prompt - if self.seq_group_metadata_list else None) - # LoRA data. - lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(seq.lora_request - for seq in self.seq_group_metadata_list - if seq.lora_request is not None) - - lora_mapping = self._prepare_lora_input( - self.seq_group_metadata_list, is_prompt) - - return self.model_input_cls(input_tokens=input_tokens, - input_positions=input_positions, - token_type_ids=token_type_ids, - seq_lens=input_data.seq_lens, - query_lens=input_data.query_lens, - attn_metadata=attn_metadata, - multi_modal_kwargs=multi_modal_kwargs, - lora_mapping=lora_mapping, - lora_requests=lora_requests) - - def _build_input_data(self): - for seq_group_metadata in self.seq_group_metadata_list: - for seq_id, seq_data in seq_group_metadata.seq_data.items(): - if seq_group_metadata.is_prompt: - self._compute_prompt_input_tokens(self.input_data, - seq_group_metadata, - seq_data, seq_id) - if seq_group_metadata.multi_modal_data: - self._compute_multi_modal_input( - seq_group_metadata, seq_data) - else: - self._compute_decode_input_tokens(self.input_data, - seq_group_metadata, - seq_data, seq_id) - - def _compute_decode_input_tokens(self, data: ModelInputData, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData, seq_id: int): - """ - Compute decode input tokens, positions, block table and slot mapping. 
- """ - block_size = self.runner.block_size - - block_table = seq_group_metadata.block_tables[seq_id] - seq_len = seq_data.get_len() - context_len = seq_data.get_num_computed_tokens() - - tokens = seq_data.get_last_token_id() - token_positions = seq_len - 1 - block_number = block_table[token_positions // block_size] - block_offset = token_positions % block_size - slot = block_number * block_size + block_offset - - # For paged_attention kernel - if self.runner.sliding_window: - start_idx = max(0, seq_len - self.runner.sliding_window) - start_block = start_idx // block_size - start_idx = start_block * block_size - seq_len = seq_len - start_idx - block_table = block_table[start_block:] - - # For MRotaryEmbedding - if seq_data.mrope_position_delta is not None: - next_pos = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - for idx in range(3): - data.input_mrope_positions[idx].extend( # type: ignore - next_pos[idx]) - else: - data.input_positions.append(token_positions) # type: ignore - - # Update fields - data.input_tokens.append(tokens) - data.max_decode_seq_len = max(data.max_decode_seq_len, seq_len) - data.num_decode_tokens += 1 - data.slot_mapping.append(slot) - data.decode_block_tables.append(block_table) - data.query_lens.append(1) - data.seq_lens.append(seq_len) - - def _compute_prompt_input_tokens(self, data: ModelInputData, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData, seq_id: int): - """ - Compute prompt input tokens, positions, block table and slot mapping. - """ - token_chunk_size = seq_group_metadata.token_chunk_size - block_size = self.runner.block_size - - block_table = seq_group_metadata.block_tables[seq_id] - seq_len = seq_data.get_len() - context_len = seq_data.get_num_computed_tokens() - seq_len = min(seq_len, context_len + token_chunk_size) - - # For prefix caching - prefix_cache_block_num = len(seq_group_metadata.computed_block_nums) - if prefix_cache_block_num > 0: - prefix_cache_len = (prefix_cache_block_num * - self.runner.block_size) - if prefix_cache_len <= context_len: - # We already passed the cache hit region, - # so do normal computation. - pass - elif context_len < prefix_cache_len < seq_len: - # Partial hit. Compute the missing part. - context_len = prefix_cache_len - token_chunk_size = seq_len - context_len - elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last token to avoid - # erroneous behavior. FIXME: Ideally we should directly - # mark all tokens as computed in the scheduler and do not - # schedule this sequence, so this case should not happen. - context_len = seq_len - 1 - token_chunk_size = 1 - - tokens = seq_data.get_token_ids() - tokens = tokens[context_len:seq_len] - token_positions = range(context_len, seq_len) - token_types = seq_group_metadata.token_type_ids - - # For encoder-only models, the block_table is None, - # and there is no need to initialize the slot_mapping. 
- if block_table is not None: - slot_mapping = [_PAD_SLOT_ID] * len(token_positions) - for i, pos in enumerate(token_positions): - block_number = block_table[pos // block_size] - block_offset = pos % block_size - slot = block_number * block_size + block_offset - slot_mapping[i] = slot - data.slot_mapping.extend(slot_mapping) - - # The MROPE positions are prepared in _compute_multi_modal_input - data.input_positions.extend(token_positions) - - if data.token_type_ids is not None: - data.token_type_ids.extend(token_types if token_types else []) - - # Update fields - data.input_tokens.extend(tokens) - data.num_prefills += 1 - data.num_prefill_tokens += len(tokens) - data.query_lens.append(len(tokens)) - data.prefill_block_tables.append(block_table) - data.seq_lens.append(seq_len) - - def _compute_multi_modal_input(self, - seq_group_metadata: SequenceGroupMetadata, - seq_data: SequenceData): - computed_len = seq_data.get_num_computed_tokens() - seq_len = self.input_data.seq_lens[-1] - - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, range(computed_len, seq_len)) - - if not mm_kwargs: - return - - # special processing for mrope position deltas. - if self.runner.model_config.uses_mrope: - assert not self.chunked_prefill, \ - "MROPE on CPU does not support chunked-prefill." - - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", - None) - assert ( - image_grid_thw is not None or video_grid_thw is not None - or audio_feature_lengths is not None), ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw' or " - "'audio_feature_lengths'.") - - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) - hf_config = self.runner.model_config.hf_config - token_ids = seq_data.get_token_ids() - - mrope_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=computed_len, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - seq_data.mrope_position_delta = mrope_position_delta - - for i in range(3): - self.input_data.input_mrope_positions[ # type: ignore - i].extend(mrope_positions[i]) - - self.input_data.multi_modal_inputs_list.append(mm_kwargs) - for modality, placeholder_map in placeholder_maps.items(): - self.input_data.multi_modal_placeholder_maps[modality].extend( - placeholder_map) - - def _prepare_lora_input( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - is_prefill: bool) -> LoRAMapping: - index_mapping = [] - prompt_mapping = [] - for seq in seq_group_metadata_list: - lora_id = seq.lora_int_id - query_len = seq.token_chunk_size - - index_mapping += [lora_id] * query_len - prompt_mapping += [lora_id] * ( - query_len if seq.sampling_params - and seq.sampling_params.prompt_logprobs is not None else 1) - - return LoRAMapping(index_mapping=tuple(index_mapping), - prompt_mapping=tuple(prompt_mapping), - is_prefill=is_prefill) - - -class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): - """ - Helper class for shared methods between CPU model runners. 
- """ - _model_input_cls: Type[TModelInputForCPU] - _builder_cls: Type[ModelInputForCPUBuilder] - builder: ModelInputForCPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - *args, - **kwargs, - ): - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = False - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) if needs_attn_backend else None - - # Lazy initialization. - self.model: nn.Module # Set after init_Model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.sampler = get_sampler() - - if hasattr(self, "_builder_cls"): - # multi-step model runner does not have `_builder_cls` - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - self.model = get_model(vllm_config=self.vllm_config) - - if self.lora_config: - assert supports_lora( - self.model - ), f"{self.model.__class__.__name__} does not support LoRA yet." - - if supports_multimodal(self.model): - logger.warning("Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model.") - - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - max_position_embeddings=text_config.max_position_embeddings, - ) - self.model = self.lora_manager.create_lora_manager(self.model) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> TModelInputForCPU: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. 
- - """ - self.builder.prepare(finished_requests_ids) - self.builder.set_seq_group_list(seq_group_metadata_list) - - return self.builder.build() # type: ignore - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - -class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( - ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForCPUWithSamplingMetadata: - return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. 
- - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators) - - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine, - is_prompt=is_prompt) - - @torch.no_grad() - def execute_model( - self, - model_input: ModelInputForCPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - previous_hidden_states: Optional[torch.Tensor] = None, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - model_executable = self.model - - multimodal_kwargs = {} - if model_input.multi_modal_kwargs is not None: - multimodal_kwargs = MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs, - device=self.device, - ) - execute_model_kwargs = {} - if previous_hidden_states is not None: - execute_model_kwargs.update( - {"previous_hidden_states": previous_hidden_states}) - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **execute_model_kwargs, - **multimodal_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - # Sample the next token. 
- output = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - if model_input.is_prompt: - output.prefill_hidden_states = hidden_states - output.hidden_states = hidden_states - return [output] - - def generate_proposals(self, *args, **kwargs): - return self.model.generate_proposals(*args, **kwargs) diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py deleted file mode 100644 index 203fdf225a41..000000000000 --- a/vllm/worker/cpu_pooling_model_runner.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union - -import torch - -from vllm.forward_context import set_forward_context -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, - SequenceGroupMetadata) -from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, - ModelInputForCPUBuilder) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): - """ - Used by the CPUPoolingModelRunner. - """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class CPUPoolingModelRunner( - CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( - ModelInputForCPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForCPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "CPU worker does not support multi-step execution.") - - model_executable = self.model - cross_enc_kwargs = {} - if model_input.token_type_ids is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - **cross_enc_kwargs, - "intermediate_tensors": - intermediate_tensors, - } - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_states = model_executable(**execute_model_kwargs) - - # Only perform pooling in the driver worker. 
- if not self.is_driver_worker: - return [] - - return [ - self.model.pooler(hidden_states=hidden_states, - pooling_metadata=model_input.pooling_metadata) - ] - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForCPUWithPoolingMetadata: - return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. - assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - virtual_engine=virtual_engine, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - pooling_params = seq_group_metadata.pooling_params - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py deleted file mode 100644 index a8998127b60f..000000000000 --- a/vllm/worker/cpu_worker.py +++ /dev/null @@ -1,452 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A CPU worker class.""" -import os -from importlib import util -from typing import List, Optional, Set, Tuple, Type - -import torch -import torch.distributed - -import vllm.envs as envs -from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, VllmConfig) -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest -from vllm.utils import bind_kv_cache -from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner -from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase -from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class CPUCacheEngine: - """Manages the KV cache for CPU backend. - - This class is responsible for initializing and managing CPU KV - caches. It also provides methods for performing KV cache operations, such - as copying. 
- """ - - def __init__(self, cache_config: CacheConfig, model_config: ModelConfig, - parallel_config: ParallelConfig, - device_config: DeviceConfig) -> None: - assert device_config.device_type == "cpu" - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - - self.head_size = model_config.get_head_size() - self.num_layers = model_config.get_num_layers(parallel_config) - self.num_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks - # for CPU backend, because we want to reuse KV cache management - # in the scheduler. - self.num_cpu_blocks = cache_config.num_gpu_blocks - - self.dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config, - model_config) - - # Get attention backend. - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - cache_config.cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - - # Initialize the cache. - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks) - - def _allocate_kv_cache( - self, - num_blocks: int, - ) -> List[torch.Tensor]: - """Allocates KV cache on CPU.""" - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_heads, self.head_size) - kv_cache: List[torch.Tensor] = [] - for _ in range(self.num_layers): - kv_cache.append( - torch.empty(kv_cache_shape, dtype=self.dtype, device="cpu")) - return kv_cache - - def swap_in(self, src_to_dst: torch.Tensor) -> None: - raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - - def swap_out(self, src_to_dst: torch.Tensor) -> None: - raise NotImplementedError("Swap is not supported in CPUCacheEngine.") - - def copy(self, src_to_dsts: torch.Tensor) -> None: - self.attn_backend.copy_blocks(self.cpu_cache, src_to_dsts) - - @staticmethod - def get_kv_cache_dtype(cache_config: CacheConfig, - model_config: ModelConfig): - if cache_config.cache_dtype == "auto": - return model_config.dtype - elif cache_config.cache_dtype in ["fp8", "fp8_e5m2"]: - return torch.float8_e5m2 - else: - raise NotImplementedError(f"Unsupported KV cache type " - f"{cache_config.cache_dtype}.") - - @staticmethod - def get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_layers = model_config.get_num_layers(parallel_config) - - key_cache_block = cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block if not model_config.use_mla else 0 - total = num_layers * (key_cache_block + value_cache_block) - dtype = CPUCacheEngine.get_kv_cache_dtype(cache_config, model_config) - dtype_size = torch.tensor([], dtype=dtype).element_size() - return dtype_size * total - - -class CPUWorker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a CPU socket. - - Each worker is associated with a single CPU socket. The worker is - responsible for maintaining the KV cache and executing the model on the - CPU. In case of distributed inference, each worker is assigned a partition - of the model. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[CPUModelRunner]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - - self.local_rank = local_rank - self.rank = rank - vllm_config.parallel_config.rank = rank - - self.distributed_init_method = distributed_init_method - - self.is_driver_worker = is_driver_worker - if self.is_driver_worker: - assert self.rank == 0, "The driver worker must have rank 0." - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Setup OpenMP threads affinity. - omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND - self.local_omp_cpuid = "all" - if omp_cpuids == "auto": - self.local_omp_cpuid = self.get_cpus_id_binding_based_on_numa_nodes( - ) - else: - self.local_omp_cpuid = omp_cpuids.split("|")[rank] - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.model == - model_config.model) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ["medusa", "mlp_speculator", "eagle"]) \ - else {"return_hidden_states": True} - ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner - if self.model_config.runner_type == "pooling": - ModelRunnerClass = CPUPoolingModelRunner - elif self.model_config.is_encoder_decoder: - ModelRunnerClass = CPUEncoderDecoderModelRunner - self.model_runner: CPUModelRunnerBase = ModelRunnerClass( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CPUCacheEngine] - # Initialize cpu_cache as pooling models don't initialize kv_caches - self.cpu_cache: Optional[List[List[torch.Tensor]]] = None - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - - def init_device(self) -> None: - if self.local_omp_cpuid != "all": - ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) - if ret: - logger.info(ret) - - # Note: unique identifier for creating allreduce shared memory - os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split( - ":")[-1] - self.device = torch.device("cpu") - self.init_distributed_environment() - # Set random seed. 
- set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of blocks available for the KV cache. - - This determines how many KV blocks can fit into the configured CPU - KV cache space. - - Note that since vLLM assumes a block resides on GPU if it can be - modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. - This allows us to reuse the scheduler of vLLM without generalizing it - to different devices. - """ - # For CPU device, the block number will be calculated based on the - # cpu_kvcache_space. - cache_block_size = self.get_cache_block_size_bytes() - num_cpu_blocks = int(self.cache_config.cpu_kvcache_space_bytes // - cache_block_size) - num_cpu_blocks = max(num_cpu_blocks, 0) - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_gpu_blocks = num_cpu_blocks - num_cpu_blocks = 0 - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. Currently, swappable CPU memory is not - supported. - - Since this worker does not support GPUs, we use the num_gpu_blocks to - determine how many non-swappable CPU blocks to allocate. - """ - assert (num_cpu_blocks == 0 - ), f"{type(self)} does not support swappable cache" - - # Note: To reuse the cache management procedure, - # use cpu cache as 'gpu cache'. - num_cpu_blocks = num_gpu_blocks - - self._validate_num_cpu_blocks(num_cpu_blocks) - self.cache_config.num_gpu_blocks = num_cpu_blocks - self.cache_config.num_cpu_blocks = 0 - - # Initialize the cache. - self._init_cache_engine() - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: - """Raise errors if the num_cpu_blocks is invalid. - """ - if num_cpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `VLLM_CPU_KVCACHE_SPACE` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_cpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`VLLM_CPU_KVCACHE_SPACE` or decreasing `max_model_len` when " - "initializing the engine.") - - def _init_cache_engine(self) -> None: - self.cache_engine = [ - CPUCacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.cpu_cache = [ - self.cache_engine[ve].cpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - bind_kv_cache(self.compilation_config.static_forward_context, - self.cpu_cache) - self.model_runner.block_size = self.cache_engine[0].block_size - - assert all( - self.cpu_cache[ve] is not None - for ve in range(self.parallel_config.pipeline_parallel_size)) - - # Populate the cache to warmup the memory - for ve in range(self.parallel_config.pipeline_parallel_size): - for layer_cache in self.cpu_cache[ve]: - layer_cache.fill_(0) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.cpu_cache - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - def execute_worker( - self, - worker_input: WorkerInput, - ) -> None: - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[worker_input.virtual_engine].copy( - worker_input.blocks_to_copy) - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - assert execute_model_req is not None - virtual_engine: int = execute_model_req.virtual_engine - num_seq_groups: int = len(execute_model_req.seq_group_metadata_list) - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device="cpu", - dtype=torch.int64).view(-1, 2) - assert len(execute_model_req.blocks_to_swap_in) == 0 - assert len(execute_model_req.blocks_to_swap_out) == 0 - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - ) - - def init_distributed_environment(self) -> None: - """Initialize the distributed environment.""" - - parallel_config = self.parallel_config - rank = self.rank - distributed_init_method = self.distributed_init_method - init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - distributed_init_method=distributed_init_method, - backend="gloo", - ) - - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).cpu()) - - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - def get_cache_block_size_bytes(self) -> int: - """Return the size in bytes of a single KV cache block. - """ - return CPUCacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - def get_cpus_id_binding_based_on_numa_nodes(self) -> str: - """Return CPUs id binding based on NUMA nodes. 
- """ - rank_to_cpus = self.local_omp_cpuid - # Setup OpenMP thread affinity based on NUMA nodes automatically - world_size = self.vllm_config.parallel_config.world_size - libnuma_found = util.find_spec("numa") is not None - psutil_found = util.find_spec("psutil") is not None - if libnuma_found and psutil_found: - import psutil - from numa import info - cpu_count = psutil.cpu_count(logical=False) - cpus_allow_list = psutil.Process().cpu_affinity() - numa_size = info.get_num_configured_nodes() - cpu_count_per_numa = cpu_count // numa_size - num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, - cpu_count_per_numa // 2) - - # check allow node_to_cpus list - node_to_cpus = [] - for i in range(numa_size): - node_intersect = set( - info.node_to_cpus(i)).intersection(cpus_allow_list) - if bool(node_intersect): - node_to_cpus.append(list(node_intersect)) - - if world_size > len(node_to_cpus): - logger.error( - "Auto thread-binding failed due to " - "world size: %d is larger than " - "allowed NUMA nodes number: %d." - "Please try to bind threads manually.", world_size, - len(node_to_cpus)) - else: - end = cpu_count_per_numa - num_of_reserved_cpu - rank_to_cpus_list = node_to_cpus[self.rank][:end] - rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) - logger.info("auto thread-binding list: %s", rank_to_cpus) - else: - logger.warning( - "Auto thread-binding is not supported due to " - "the lack of package numa and psutil," - "fallback to no thread-binding. To get better performance," - "please try to manually bind threads.") - return rank_to_cpus diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py deleted file mode 100644 index ed9f00166615..000000000000 --- a/vllm/worker/multi_step_tpu_worker.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Dict, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict -from vllm.sequence import ExecuteModelRequest -from vllm.worker.tpu_model_runner import ModelInputForTPU -from vllm.worker.tpu_worker import TPUWorker -from vllm.worker.worker_base import WorkerInput - - -class MultiStepTPUWorker(TPUWorker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cached_model_input: Optional[ModelInputForTPU] = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[ModelInputForTPU, WorkerInput, Dict[str, torch.Tensor]]: - assert self.is_driver_worker - assert execute_model_req.virtual_engine == 0 - - is_first_multi_step = execute_model_req.is_first_multi_step - is_last_step = execute_model_req.is_last_step - if is_first_multi_step: - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - worker_input = dataclasses.replace( - worker_input, - num_steps=execute_model_req.num_lookahead_slots + 1) - model_input: ModelInputForTPU = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( - model_input, - async_callback=execute_model_req.async_callback) - else: - assert self.cached_model_input is not None - model_input = self.cached_model_input - worker_input = WorkerInput() - model_input = dataclasses.replace( - model_input, - 
is_first_multi_step=is_first_multi_step, - is_last_step=is_last_step) - - if self.do_metadata_broadcast: - if is_first_multi_step: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update( - model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - else: - broadcast_data = { - "is_first_multi_step": is_first_multi_step, - "is_last_step": is_last_step, - } - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[ModelInputForTPU, WorkerInput, Dict[str, - torch.Tensor]]]: - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - broadcast_tensor_dict({}, src=0) - return None - - model_input, worker_input, _ = self._get_driver_input_and_broadcast( - execute_model_req) - if model_input.is_first_multi_step: - self.cached_model_input = model_input - return model_input, worker_input, {} - else: - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - if len(broadcast_data) == 2: - assert self.cached_model_input is not None - self.cached_model_input = dataclasses.replace( - self.cached_model_input, - is_first_multi_step=broadcast_data["is_first_multi_step"], - is_last_step=broadcast_data["is_last_step"]) - empty_worker_input = WorkerInput() - return self.cached_model_input, empty_worker_input, {} - - worker_input = WorkerInput.from_broadcasted_tensor_dict( - broadcast_data) - model_input = ( - self.model_runner. - make_model_input_from_broadcasted_tensor_dict(broadcast_data)) - self.cached_model_input = model_input - return model_input, worker_input, {} diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py deleted file mode 100644 index 336bc0bcec36..000000000000 --- a/vllm/worker/tpu_model_runner.py +++ /dev/null @@ -1,909 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import time -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type, Union) -from unittest.mock import patch - -import numpy as np -import torch -import torch.nn as nn -import torch_xla.core.xla_model as xm -import torch_xla.runtime as xr - -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, - _add_attn_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -# Here we utilize the behavior that out-of-bound index is ignored. -# FIXME(woosuk): Find a more reliable way to prevent possible bugs. 
-_PAD_SLOT_ID = 1_000_000_000 -# FIXME(woosuk): Temporarily disabled top-p sampling since it's too slow. -_ENABLE_TOP_P = False -# FIXME(woosuk): A temporary hack to support `n > 1`. -# This can significantly affect the performance if too large. -_MAX_NUM_SAMPLES = 128 - - -class ExecutionMode(enum.Enum): - PREFILL = enum.auto() - DECODE = enum.auto() - PREFIX_PREFILL = enum.auto() - - def is_prefill(self) -> bool: - return self in (ExecutionMode.PREFILL, ExecutionMode.PREFIX_PREFILL) - - -@dataclass(frozen=True) -class ModelInputForTPU(ModelRunnerInputBase): - token_ids: torch.Tensor - position_ids: torch.Tensor - attn_metadata: AttentionMetadata - input_lens: torch.Tensor - t: torch.Tensor - p: torch.Tensor - num_samples: int - n: List[int] - seq_groups: List[List[int]] - is_first_multi_step: bool = True - is_last_step: bool = True - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - tensor_dict = { - "token_ids": self.token_ids, - "position_ids": self.position_ids, - "input_lens": self.input_lens, - "t": self.t, - "p": self.p, - "num_samples": self.num_samples, - "n": self.n, - "seq_groups": self.seq_groups, - "is_first_multi_step": self.is_first_multi_step, - "is_last_step": self.is_last_step, - "virtual_engine": self.virtual_engine, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["ModelInputForTPU"], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForTPU": - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): - - def __init__( - self, - vllm_config: VllmConfig, - is_driver_worker: bool = False, - ): - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - self.is_driver_worker = is_driver_worker - - self.block_size = self.cache_config.block_size - self.max_num_blocks_per_seq = (self.model_config.max_model_len // - self.block_size) - self.block_tables = np.zeros( - (self.scheduler_config.max_num_seqs, self.max_num_blocks_per_seq), - dtype=np.int32) - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.cache_config.cache_dtype, - self.block_size, - self.model_config.is_attention_free, - False, - ) - self.cached_step_outputs: List[torch.Tensor] = [] - - smem_size = 512 * 1024 - block_table_size = 4 * self.block_tables.size - if block_table_size >= smem_size: - logger.warning( - "The max_model_len (%d) is too large. This may degrade the " - "performance due to the insufficient smem size. Consider " - "setting --max-model-len to a smaller value, like %d.", - self.model_config.max_model_len, - self.model_config.max_model_len / - (block_table_size / smem_size)) - - def load_model(self) -> None: - self.device = self.device_config.device - - # NOTE(woosuk): While the executor assigns the TP ranks to the worker - # process, the ranks can be different from the ranks internally assigned - # by the xm runtime. Therefore, there is a mismatch in the rank - # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. - # This is not a problem in linear layers because all-reduce is - # rank-agnostic. However, it matters for all-gather as the ranks - # determine the order of concatenating the output tensors. 
- # As a workaround, we use the xm's rank assignment only when loading - # the embedding weights. - xm_tp_rank = xr.global_ordinal() - with patch( - "vllm.model_executor.layers.vocab_parallel_embedding." - "get_tensor_model_parallel_rank", - return_value=xm_tp_rank): - model = get_model(vllm_config=self.vllm_config) - model = model.eval() - xm.wait_device_ops() - model = ModelWrapper(model) - self.model = torch.compile(model, - backend="openxla", - fullgraph=True, - dynamic=False) - - def get_model(self) -> nn.Module: - return self.model.model - - def _dummy_run( - self, - batch_size: int, - seq_len: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - exec_mode: ExecutionMode, - ) -> None: - exec_mode = ExecutionMode(exec_mode) - if exec_mode.is_prefill(): - seq_len = (seq_len + 15) // 16 * 16 - token_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - position_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - slot_mapping = torch.zeros((batch_size, seq_len), - dtype=torch.int64, - device=self.device) - input_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - if exec_mode == ExecutionMode.PREFILL: - attn_metadata = self.attn_backend.make_metadata( - num_prefills=batch_size, - num_prefill_tokens=batch_size * seq_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=None, - context_lens=None, - effective_query_lens=None, - ) - else: - context_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - block_tables = torch.tensor(self.block_tables[:batch_size], - dtype=torch.int32, - device=self.device) - effective_query_lens = torch.ones_like(context_lens) - attn_metadata = self.attn_backend.make_metadata( - num_prefills=batch_size, - num_prefill_tokens=batch_size * seq_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - effective_query_lens=effective_query_lens, - ) - else: - assert seq_len == 1 - token_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - position_ids = torch.zeros((batch_size, seq_len), - dtype=torch.int32, - device=self.device) - slot_mapping = torch.zeros((batch_size, seq_len), - dtype=torch.int64, - device=self.device) - block_tables = torch.zeros( - (batch_size, self.max_num_blocks_per_seq), - dtype=torch.int32, - device=self.device) - context_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - input_lens = torch.ones((batch_size, ), - dtype=torch.int32, - device=self.device) - attn_metadata = self.attn_backend.make_metadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size * seq_len, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - ) - t = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) - p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) - num_samples = _MAX_NUM_SAMPLES if exec_mode.is_prefill() else 1 - - # NOTE(woosuk): There are two stages of compilation: torch.compile and - # XLA compilation. Using `mark_dynamic` can reduce the torch.compile - # overhead by reusing the FX graph for different shapes. 
- # However, the XLA graph will still require static shapes and needs to - # be re-compiled for every different shapes. This overhead is inevitable - # in the first run, but can be skipped afterwards as we cache the XLA - # graphs in the disk (VLLM_XLA_CACHE_PATH). - if exec_mode.is_prefill(): - # Prefll - torch._dynamo.mark_dynamic(token_ids, 1) - torch._dynamo.mark_dynamic(position_ids, 1) - torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 1) - else: - # Decode - torch._dynamo.mark_dynamic(token_ids, 0) - torch._dynamo.mark_dynamic(position_ids, 0) - torch._dynamo.mark_dynamic(input_lens, 0) - torch._dynamo.mark_dynamic(attn_metadata.slot_mapping, 0) - torch._dynamo.mark_dynamic(attn_metadata.context_lens, 0) - torch._dynamo.mark_dynamic(attn_metadata.block_tables, 0) - torch._dynamo.mark_dynamic(t, 0) - torch._dynamo.mark_dynamic(p, 0) - # Dummy run. - with set_forward_context(attn_metadata, self.vllm_config, 0): - self.model(token_ids, position_ids, input_lens, t, p, num_samples, - kv_caches) - - def warmup_model( - self, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> None: - # Prefill - logger.info("Compiling the model with different input shapes...") - start = time.time() - for batch_size in [1]: - seq_len = 16 - while seq_len <= self.model_config.max_model_len: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.PREFILL) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) - num_tokens = batch_size * seq_len - if num_tokens >= self.scheduler_config.max_num_batched_tokens: - break - seq_len = seq_len * 2 - - end = time.time() - logger.info("Compilation for prefill done in %.2f s.", end - start) - - # Prefix prefill - if self.cache_config.enable_prefix_caching: - logger.info("Compiling the model with different input shapes for " - "prefix prefill...") - start = time.time() - for batch_size in [1]: - seq_len = 16 - while seq_len <= self.model_config.max_model_len: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.PREFIX_PREFILL) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, - seq_len) - num_tokens = batch_size * seq_len - if (num_tokens - >= self.scheduler_config.max_num_batched_tokens): - break - seq_len = seq_len * 2 - end = time.time() - logger.info("Compilation for prefix prefill done in %.2f s.", - end - start) - - # Decode - start = time.time() - seq_len = 1 - batch_size = 8 # Must be in sync with _get_padded_batch_size() - while True: - self._dummy_run(batch_size, - seq_len, - kv_caches, - exec_mode=ExecutionMode.DECODE) - xm.wait_device_ops() - logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) - - if batch_size >= self.scheduler_config.max_num_seqs: - break - batch_size = batch_size + 16 if batch_size >= 16 else batch_size * 2 - - end = time.time() - logger.info("Compilation for decode done in %.2f s.", end - start) - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - prompt_lens: List[int] = [] - context_lens: List[int] = [] - slot_mapping: List[int] = [] - - for batch_idx, seq_group_metadata in enumerate( - seq_group_metadata_list): - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = 
seq_group_metadata.seq_data[seq_id] - # Could include output tokens when a request is preempted. - prompt_tokens = seq_data.get_token_ids() - seq_len = len(prompt_tokens) - - num_computed_blocks = len(seq_group_metadata.computed_block_nums) - num_computed_tokens = num_computed_blocks * self.block_size - if num_computed_tokens > 0: - prompt_tokens = prompt_tokens[num_computed_tokens:] - context_lens.append(seq_len) - else: - context_lens.append(0) - - prompt_len = len(prompt_tokens) - prompt_lens.append(prompt_len) - - input_tokens.extend(prompt_tokens) - input_positions.extend(range(num_computed_tokens, seq_len)) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - for i in range(num_computed_tokens, seq_len): - block_number = block_table[i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - if num_computed_tokens > 0: - self.block_tables[batch_idx, :len(block_table)] = block_table - - # Add paddings to EACH prompt to the smallest power of 2 that is - # greater than or equal to the prompt length. - # We pad the seq_len to reduce the compilation overhead. - # We execute each prompt individually (i.e., with batch_size 1) - # because the FlashAttention kernel does not support ragged inputs. - # TODO(woosuk): Use SplashAttention to support ragged inputs. - padded_prompt_len = _get_padded_prefill_len(prompt_len) - num_paddings = padded_prompt_len - prompt_len - input_tokens += [0] * num_paddings - input_positions += [0] * num_paddings - slot_mapping += [_PAD_SLOT_ID] * num_paddings - - assert len(prompt_lens) > 0 - num_prefills = len(prompt_lens) - input_tokens = torch.tensor(input_tokens, - dtype=torch.int32, - device="cpu") - input_positions = torch.tensor(input_positions, - dtype=torch.int32, - device="cpu") - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.int64, - device="cpu") - prompt_lens = torch.tensor(prompt_lens, - dtype=torch.int32, - device="cpu") - context_lens = torch.tensor(context_lens, - dtype=torch.int32, - device="cpu") - block_tables = torch.tensor(self.block_tables[:num_prefills], - dtype=torch.int32, - device="cpu") - attn_metadata = self.attn_backend.make_metadata( - num_prefills=num_prefills, - num_prefill_tokens=0, # NOTE: This is not used. 
- num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - effective_query_lens=prompt_lens, - ) - return input_tokens, input_positions, attn_metadata, prompt_lens - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] - - batch_idx = 0 - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - context_lens.append(seq_len) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - self.block_tables[batch_idx, :len(block_table)] = block_table - batch_idx += 1 - - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append([slot]) - - batch_size = _get_padded_batch_size(batch_idx) - num_paddings = batch_size - batch_idx - input_tokens = input_tokens + [[0]] * num_paddings - input_positions = input_positions + [[0]] * num_paddings - slot_mapping = slot_mapping + [[_PAD_SLOT_ID]] * num_paddings - context_lens = context_lens + [0] * num_paddings - - input_tokens = torch.tensor(input_tokens, - dtype=torch.int32, - device="cpu") - input_positions = torch.tensor(input_positions, - dtype=torch.int32, - device="cpu") - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.int64, - device="cpu") - context_lens = torch.tensor(context_lens, - dtype=torch.int32, - device="cpu") - block_tables = torch.tensor(self.block_tables[:batch_size], - dtype=torch.int32, - device="cpu") - input_lens = torch.tensor([1] * batch_size, - dtype=torch.int32, - device="cpu") - attn_metadata = self.attn_backend.make_metadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - block_tables=block_tables, - context_lens=context_lens, - ) - return input_tokens, input_positions, attn_metadata, input_lens - - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - padded_batch_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor, List[int]]: - assert len(seq_group_metadata_list) > 0 - t = [] - p = [] - n = [] - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - t.append(sampling_params.temperature) - if sampling_params.top_p != 1 and not _ENABLE_TOP_P: - raise NotImplementedError( - "Top-p sampling is currently disabled for the TPU backend " - "due to performance issues.") - p.append(sampling_params.top_p) - if sampling_params.top_k > 0: - raise NotImplementedError( - "Top-k sampling is currently disabled for the TPU backend " - "due to performance issues.") - if sampling_params.n > _MAX_NUM_SAMPLES: - raise NotImplementedError( - f"Best of > {_MAX_NUM_SAMPLES} is not 
supported by the TPU " - "backend.") - n.append(sampling_params.n) - if sampling_params.logprobs is not None: - raise NotImplementedError( - "logprobs is not currently supported by the TPU backend.") - if sampling_params.prompt_logprobs is not None: - raise NotImplementedError( - "prompt_logprobs is not currently supported by the TPU " - "backend.") - - # Repeat the sampling params if the seq group has multiple seqs. - num_seqs = len(seq_group_metadata.seq_data) - t += [t[-1]] * (num_seqs - 1) - p += [p[-1]] * (num_seqs - 1) - n += [n[-1]] * (num_seqs - 1) - - num_paddings = padded_batch_size - len(t) - t += [1.0] * num_paddings - p += [1.0] * num_paddings - - t = torch.tensor(t, dtype=torch.float32, device="cpu") - p = torch.tensor(p, dtype=torch.float32, device="cpu") - return t, p, n - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForTPU: - del finished_requests_ids # Unused. - assert virtual_engine == 0 - assert len(seq_group_metadata_list) > 0 - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - if is_prompt: - inputs = self._prepare_prompt(seq_group_metadata_list) - else: - inputs = self._prepare_decode(seq_group_metadata_list) - input_tokens, input_positions, attn_metadata, input_lens = inputs - padded_batch_size = input_tokens.shape[0] - t, p, n = self._prepare_sample(seq_group_metadata_list, - padded_batch_size) - num_samples = _MAX_NUM_SAMPLES if is_prompt else 1 - - seq_groups = [ - list(metadata.seq_data.keys()) - for metadata in seq_group_metadata_list - ] - return ModelInputForTPU(input_tokens, input_positions, attn_metadata, - input_lens, t, p, num_samples, n, seq_groups) - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForTPU: - model_input = ModelInputForTPU.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=self.attn_backend) - return model_input - - @torch.no_grad() - def execute_model( - self, - model_input: ModelInputForTPU, - kv_caches: Optional[List[Any]], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> List[SamplerOutput]: - assert intermediate_tensors is None - if not model_input.is_first_multi_step: - if not model_input.is_last_step: - return [] - - use_async_out_proc = model_input.async_callback is not None - sampler_outputs = [] - num_outputs = len(self.cached_step_outputs) - for i in range(num_outputs): - next_token_ids = self.cached_step_outputs.pop(0) - next_token_ids = next_token_ids.cpu().tolist() - sampler_output = _make_decode_output(next_token_ids, - model_input.seq_groups) - sampler_outputs.append(sampler_output) - - if i < num_outputs - 1 and use_async_out_proc: - assert model_input.async_callback is not None - ctx = model_input.async_callback.keywords[ # type: ignore - "ctx"] - ctx.append_output( - outputs=[sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=i == 0) - model_input.async_callback() - if use_async_out_proc: - return [sampler_outputs[-1]] - else: - return sampler_outputs - - is_prompt = model_input.attn_metadata.num_prefills > 0 - if is_prompt: - assert num_steps == 1 - # NOTE(woosuk): Since the FlashAttention kernel does not support - # ragged inputs, we split the prompts into 
different batches and - # process them separately. This is a temporary hack that should be - # optimized by using SplashAttention. - orig_slot_mapping = model_input.attn_metadata.slot_mapping - orig_block_tables = model_input.attn_metadata.block_tables - orig_context_lens = model_input.attn_metadata.context_lens - orig_effective_query_lens = \ - model_input.attn_metadata.effective_query_lens - batch_size = model_input.input_lens.shape[0] - start_idx = 0 - next_token_ids = [] - for i in range(batch_size): - # Get the actual prefill_len. - prefill_len = model_input.input_lens[i:i + 1].item() - prefill_len = _get_padded_prefill_len(prefill_len) - end_idx = start_idx + prefill_len - - token_ids = model_input.token_ids[None, start_idx:end_idx].to( - self.device) - position_ids = model_input.position_ids[None, - start_idx:end_idx].to( - self.device) - attn_metadata = model_input.attn_metadata - attn_metadata.num_prefills = 1 - attn_metadata.slot_mapping = orig_slot_mapping[ - None, start_idx:end_idx].to(self.device) - if orig_context_lens[i].item() > 0: - attn_metadata.context_lens = orig_context_lens[i:i + 1].to( - self.device) - attn_metadata.block_tables = orig_block_tables[ - i].unsqueeze(0).to(self.device) - attn_metadata.effective_query_lens = \ - orig_effective_query_lens[i:i + 1].to(self.device) - else: - attn_metadata.context_lens = None - attn_metadata.block_tables = None - attn_metadata.effective_query_lens = None - input_lens = model_input.input_lens[i:i + 1].to(self.device) - t = model_input.t[i:i + 1].to(self.device) - p = model_input.p[i:i + 1].to(self.device) - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - model_input.virtual_engine): - output_token_ids = self.model(token_ids, position_ids, - input_lens, t, p, - model_input.num_samples, - kv_caches) - next_token_ids.append(output_token_ids[0]) - start_idx = end_idx - - if model_input.async_callback is not None: - model_input.async_callback() - # Retrieve the outputs to CPU. - next_token_ids = [ - output_token_ids.cpu().tolist() - for output_token_ids in next_token_ids - ] - - # NOTE(woosuk): Minimal code to construct the sampler outputs. - # The TPU backend does not reuse the sampler, since the TPU backend - # does not support advanced sampling parameters such as logprobs. 
- zero_logprob = Logprob(0.0) - sampler_outputs = [] - for i, seq_group in enumerate(model_input.seq_groups): - seq_ids = seq_group - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - seq_outputs = [] - for j in range(model_input.n[i]): - next_token_id = next_token_ids[i][j] - seq_outputs.append( - SequenceOutput(seq_id, next_token_id, - {next_token_id: zero_logprob})) - sampler_outputs.append( - CompletionSequenceGroupOutput(seq_outputs, None)) - return [SamplerOutput(sampler_outputs)] - else: - token_ids = model_input.token_ids.to(self.device) - position_ids = model_input.position_ids.to(self.device) - attn_metadata = model_input.attn_metadata - attn_metadata.slot_mapping = attn_metadata.slot_mapping.to( - self.device) - attn_metadata.block_tables = attn_metadata.block_tables.to( - self.device) - attn_metadata.context_lens = attn_metadata.context_lens.to( - self.device) - t = model_input.t.to(self.device) - p = model_input.p.to(self.device) - input_lens = model_input.input_lens.to(self.device) - for i in range(num_steps): - slot_mapping = attn_metadata.slot_mapping - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - model_input.virtual_engine): - output_token_ids = self.model(token_ids, position_ids, - input_lens, t, p, - model_input.num_samples, - kv_caches) - self.cached_step_outputs.append(output_token_ids) - - if i < num_steps - 1: - # Prepare the inputs for the next step. - token_ids = output_token_ids.unsqueeze(dim=1).int() - position_ids = position_ids + 1 - attn_metadata.context_lens = attn_metadata.context_lens + 1 - - block_tables = attn_metadata.block_tables - block_number = block_tables.gather( - 1, - position_ids.long() // self.block_size) - block_offset = position_ids % self.block_size - - is_padding = slot_mapping == _PAD_SLOT_ID - slot_mapping = block_number * self.block_size + block_offset - slot_mapping = slot_mapping.long() - slot_mapping = torch.where(is_padding, _PAD_SLOT_ID, - slot_mapping) - attn_metadata.slot_mapping = slot_mapping - - if model_input.async_callback is not None: - model_input.async_callback() - - if num_steps > 1: - return [] - # Retrieve the outputs to CPU. - next_token_ids = self.cached_step_outputs.pop(0) - next_token_ids = next_token_ids.cpu().tolist() - sampler_output = _make_decode_output(next_token_ids, - model_input.seq_groups) - return [sampler_output] - - -class ModelWrapper(nn.Module): - - def __init__(self, model: nn.Module): - super().__init__() - self.model = model - - def forward( - self, - token_ids: torch.Tensor, - position_ids: torch.Tensor, - input_lens: torch.Tensor, - t: torch.Tensor, - p: torch.Tensor, - num_samples: int, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> torch.Tensor: - """Executes the forward pass of the model and samples the next token. - - Args: - token_ids: The input token IDs of shape [batch_size, seq_len]. - position_ids: The input position IDs of shape [batch_size, seq_len]. - input_lens: The actual input lengths of shape [batch_size]. - t: The sampling temperature of shape [batch_size]. - p: The top-p probability of shape [batch_size]. - num_samples: Number of samples to draw from each logits vector. - kv_caches: The key and value caches. They can be None during the - memory profiling at initialization. - """ - batch_size, seq_len = token_ids.shape - # Calculate the positions to sample from. 
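# A minimal, self-contained sketch of the index arithmetic implemented just
# below, assuming a hypothetical padded prefill batch of two sequences with
# real lengths 5 and 9 and a padded seq_len of 16. Once the hidden states are
# flattened to [batch_size * seq_len, hidden_size], the last real token of
# each sequence lives at start_of_sequence + real_length - 1.
import torch

seq_len = 16                        # padded length per sequence (hypothetical)
input_lens = torch.tensor([5, 9])   # actual prompt lengths (hypothetical)
batch_size = input_lens.numel()
start_indices = torch.arange(batch_size, dtype=torch.int32) * seq_len
logits_indices = start_indices + input_lens - 1
assert logits_indices.tolist() == [4, 24]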
- start_indices = torch.arange( - batch_size, dtype=torch.int32, device=input_lens.device) * seq_len - logits_indices = start_indices + input_lens - 1 - attn_metadata = get_forward_context().attn_metadata - - # FIXME(woosuk): This is a temporary hack to avoid using the existing - # sampler and sampling metadata. - sampling_metadata = SamplingMetadata( - seq_groups=[], - selected_token_indices=logits_indices, - categorized_sample_indices={}, - num_prompts=attn_metadata.num_prefills, - ) - - # Skip this in memory profiling at initialization. - if kv_caches[0][0].numel() > 0: - # index_copy_(slot_mapping) only works when the inserted dimension - # is 0. However, the KV cache in the Pallas backend has the shape - # [num_kv_heads, num_blocks, block_size, head_size]. To make it - # work, we need to flatten the first three dimensions and modify - # the slot_mapping accordingly. - num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape - slot_mapping = attn_metadata.slot_mapping - slot_mapping = slot_mapping.flatten() - head_indices = torch.arange(0, - num_kv_heads, - device=slot_mapping.device, - dtype=slot_mapping.dtype) - head_indices *= block_size * num_blocks - slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( - -1, num_kv_heads) - slot_mapping = slot_mapping + head_indices.view(1, -1) - slot_mapping = slot_mapping.flatten() - attn_metadata.slot_mapping = slot_mapping - - hidden_states = self.model(token_ids, position_ids) - hidden_states = hidden_states.flatten(0, 1) - logits = self.model.compute_logits(hidden_states, sampling_metadata) - - # Argmax sampling. - argmax_token_ids = torch.argmax(logits, dim=-1, keepdim=True) - argmax_token_ids = argmax_token_ids.repeat(1, num_samples) - - # Zero temperature means greedy decoding. Avoid division by zero. - nonzero_t = torch.where(t != 0, t, 1.0) - logits = logits / nonzero_t.unsqueeze(dim=1) - if _ENABLE_TOP_P: - logits = _apply_top_p(logits, p.unsqueeze(dim=1)) - - # Random sampling. - probs = torch.softmax(logits, dim=-1, dtype=torch.float32) - sampled_token_ids = torch.multinomial(probs, - num_samples, - replacement=True) - if num_samples == 1: - argmax_token_ids = argmax_token_ids.squeeze(dim=-1) - sampled_token_ids = sampled_token_ids.squeeze(dim=-1) - next_token_ids = torch.where(t != 0, sampled_token_ids, - argmax_token_ids) - return next_token_ids - - -def _get_padded_prefill_len(x: int) -> int: - # NOTE(woosuk): The pallas FlashAttention kernel requires the sequence - # length to be a multiple of 16. We pad the prompt length to the nearest - # multiple of 16. This is also good for performance. - if x <= 16: - return 16 - return 1 << (x - 1).bit_length() - - -def _get_padded_batch_size(batch_size: int) -> int: - # The GMM Pallas kernel requires num_tokens * topk to be a multiple of 16. - # To meet this requirement in the simplest way, we set the minimal batch - # size to 8. 
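# For concreteness, a self-contained restatement of the two padding rules
# described here (assumed-equivalent stand-ins, not the original helpers):
# prompt lengths are padded to the next power of two with a floor of 16,
# and decode batch sizes are padded to a floor of 8 and then up to a
# multiple of 16.
def padded_prefill_len(x: int) -> int:
    return 16 if x <= 16 else 1 << (x - 1).bit_length()

def padded_batch_size(n: int) -> int:
    return 8 if n <= 8 else ((n + 15) // 16) * 16

assert [padded_prefill_len(x) for x in (1, 16, 17, 100)] == [16, 16, 32, 128]
assert [padded_batch_size(n) for n in (1, 8, 9, 17)] == [8, 8, 16, 32]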
- if batch_size <= 8: - return 8 - else: - return ((batch_size + 15) // 16) * 16 - - -def _apply_top_p(logits: torch.Tensor, p: torch.Tensor) -> torch.Tensor: - logits_sorted = torch.sort(logits, dim=-1, descending=True).values - sorted_cum_probs = torch.cumsum(logits_sorted.softmax(dim=-1), dim=-1) - cutoff_index = torch.sum(sorted_cum_probs < p, dim=-1, keepdim=True) - cutoff_logit = torch.gather(logits_sorted, -1, cutoff_index) - logits = logits.masked_fill_(logits < cutoff_logit, -float("inf")) - return logits - - -def _make_decode_output( - next_token_ids: List[int], - seq_groups: List[List[int]], -) -> SamplerOutput: - zero_logprob = Logprob(0.0) - sampler_outputs = [] - batch_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group - seq_outputs = [] - for seq_id in seq_ids: - next_token_id = next_token_ids[batch_idx] - seq_outputs.append( - SequenceOutput(seq_id, next_token_id, - {next_token_id: zero_logprob})) - batch_idx += 1 - sampler_outputs.append(CompletionSequenceGroupOutput( - seq_outputs, None)) - return SamplerOutput(sampler_outputs) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py deleted file mode 100644 index ad5ed19e2f89..000000000000 --- a/vllm/worker/tpu_worker.py +++ /dev/null @@ -1,337 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from typing import List, Optional, Tuple, Union - -import torch -import torch_xla.core.xla_model as xm -import torch_xla.debug.profiler as xp -import torch_xla.runtime as xr - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size -from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoRANotSupportedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - - assert self.device_config.device_type == "tpu" - if self.cache_config.cache_dtype == "auto": - self.cache_dtype = self.model_config.dtype - else: - self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ - self.cache_config.cache_dtype] - - self.model_runner: TPUModelRunner = TPUModelRunner( - vllm_config=vllm_config, is_driver_worker=is_driver_worker) - - if self.model_config.seed is None: - self.model_config.seed = 0 - - if vllm_config.lora_config is not None: - raise NotImplementedError( - "The V0 TPU backend doesn't support LoRA serving") - - def init_device(self) -> None: - os.environ["PJRT_DEVICE"] = "TPU" - torch.set_grad_enabled(False) - torch.set_default_dtype(self.model_config.dtype) - - # NOTE(woosuk): This is just to initialize the TP group and broadcast - # the input objects on CPU. 
The all-reduce and all-gather ops on TPU - # are invoked by `xm.all_reduce` and `xm.all_gather` which use their - # own context. - init_distributed_environment( - world_size=self.parallel_config.world_size, - rank=self.rank, - local_rank=self.local_rank, - distributed_init_method=self.distributed_init_method, - backend="gloo", - ) - ensure_model_parallel_initialized( - self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size) - - # Device initialization should happen after initializing the distributed - # runtime. - self.device = xm.xla_device() - self.device_config.device = self.device - - # Set random seed. - set_random_seed(self.model_config.seed) - xm.set_rng_state(self.model_config.seed, self.device) - - # Increase the cache size limit, which is the maximum number of - # dynamo graphs that can be compiled. - # NOTE(woosuk): Usually, we compile 10-15 graphs for prefill and - # 30-40 graphs for decode. 128 is an arbitrary safe number. - torch._dynamo.config.cache_size_limit = 128 - # Use persistent cache to avoid XLA recompilation. - # NOTE(woosuk): Set per-rank cache path since different ranks - # can have slightly different XLA graphs. - world_size = self.parallel_config.world_size - rank = xr.global_ordinal() - # The PyTorch/XLA compilation cache uses the Torch IR to generate keys. - # Consequently, changes in optimization flags, which affect compilation - # results, don't change the cache key. This can result in the wrong - # compilation being used. To prevent this, disabling the XLA compilation - # cache during development is recommended.We can disable it by - # `export VLLM_XLA_CACHE_PATH=` - if envs.VLLM_XLA_CACHE_PATH: - per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{rank}") - xr.initialize_cache(per_rank_path, readonly=False) - - self.profiler = None - if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: - # For TPU, we can only have 1 active profiler session for 1 profiler - # server. So we only profile on rank0. - self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. Traces will be saved to: %s", - self.profile_dir) - self.profiler = xp.start_server(9012) - - def start_profile(self): - if self.rank < 1: - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - xp.start_trace(self.profile_dir) - - def stop_profile(self): - if self.rank < 1: - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - xp.stop_trace() - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - num_layers = self.model_config.get_num_layers(self.parallel_config) - head_size = self.model_config.get_head_size() - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - kv_caches = [(torch.tensor([], dtype=torch.float32, - device=self.device), - torch.tensor([], dtype=torch.float32, - device=self.device)) - for _ in range(num_layers)] - bind_kv_cache(self.compilation_config.static_forward_context, - [kv_caches]) - self.model_runner._dummy_run( - batch_size=1, - seq_len=self.scheduler_config.max_num_batched_tokens, - kv_caches=kv_caches, - exec_mode=ExecutionMode.PREFILL, - ) - # Synchronize before measuring the memory usage. 
- xm.wait_device_ops() - - # Get the maximum amount of memory used by the model weights and - # intermediate activations. - m = xm.get_memory_info(self.device) - total_memory_size = m["bytes_limit"] - profiled = m["peak_bytes_used"] # Weights + intermediate activations. - - # Calculate the TPU KV cache size based on profiling. - usable_memory_size = int(total_memory_size * - self.cache_config.gpu_memory_utilization) - tpu_kv_cache_bytes = max(usable_memory_size - profiled, 0) - dtype_bytes = get_dtype_size(self.cache_dtype) - block_size_bytes = (dtype_bytes * self.cache_config.block_size * - num_layers * 2 * head_size * num_kv_heads) - num_tpu_blocks = tpu_kv_cache_bytes // block_size_bytes - num_tpu_blocks = (num_tpu_blocks // 8) * 8 # Round down to 8. - - # Calculate the CPU KV cache size based on the config. - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - block_size_bytes) - num_cpu_blocks = (num_cpu_blocks // 8) * 8 # Round down to 8. - return num_tpu_blocks, num_cpu_blocks - - def initialize_cache( - self, - num_gpu_blocks: int, - num_cpu_blocks: int, - ) -> None: - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - self.block_size = self.cache_config.block_size - - dtype = self.cache_dtype - num_layers = self.model_config.get_num_layers(self.parallel_config) - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - - self.cpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - self.tpu_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] - tpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( - num_gpu_blocks, self.block_size, num_kv_heads, head_size) - cpu_cache_shape = self.model_runner.attn_backend.get_kv_cache_shape( - num_cpu_blocks, self.block_size, num_kv_heads, head_size) - for _ in range(num_layers): - tpu_k_cache = torch.zeros(tpu_cache_shape, - dtype=dtype, - device=self.device) - tpu_v_cache = torch.zeros_like(tpu_k_cache) - self.tpu_cache.append((tpu_k_cache, tpu_v_cache)) - cpu_k_cache = torch.zeros(cpu_cache_shape, - dtype=dtype, - device="cpu") - cpu_v_cache = torch.zeros_like(cpu_k_cache) - self.cpu_cache.append((cpu_k_cache, cpu_v_cache)) - bind_kv_cache(self.compilation_config.static_forward_context, - [self.tpu_cache]) - self._warmup_model() - - def _warmup_model(self) -> None: - # FIXME(woosuk): Here we are abusing `enforce_eager` which is defined - # for CUDA graphs. We should refactor this part. - if not self.model_config.enforce_eager: - # Warm up the model with all possible input shapes so that - # compilation never happens during the actual execution. - # This may take ~30 mins for the first run and ~20 mins for the - # subsequent runs. - # If `enforce_eager` is True, the ahead-of-time compilation is - # skipped and the compilation happens during the actual execution, - # which is bad for performance but useful for development. 
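# A back-of-the-envelope sketch of the KV-cache sizing arithmetic in
# determine_num_available_blocks above, using hypothetical model numbers
# (32 layers, 8 KV heads, head size 128, block size 16, bf16 cache):
dtype_bytes = 2                     # bf16
block_size = 16
num_layers, num_kv_heads, head_size = 32, 8, 128
block_size_bytes = (dtype_bytes * block_size * num_layers * 2 *
                    head_size * num_kv_heads)
assert block_size_bytes == 2 * 1024 * 1024   # 2 MiB per block

tpu_kv_cache_bytes = 8 * 1024**3    # assume 8 GiB is left for the KV cache
num_tpu_blocks = (tpu_kv_cache_bytes // block_size_bytes) // 8 * 8
assert num_tpu_blocks == 4096       # rounded down to a multiple of 8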
- self.model_runner.warmup_model(self.tpu_cache) - - def get_cache_block_size_bytes(self) -> int: - head_size = self.model_config.get_head_size() - num_heads = self.model_config.get_num_kv_heads(self.parallel_config) - num_layers = self.model_config.get_num_layers(self.parallel_config) - - key_cache_block = self.cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_layers * (key_cache_block + value_cache_block) - dtype_size = get_dtype_size(self.cache_dtype) - return dtype_size * total - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - # NOTE(woosuk): This assumes virtual_engine == 0, i.e., no pipeline - # parallelism. - return [self.tpu_cache] - - def prepare_worker_input( - self, - execute_model_req: ExecuteModelRequest, - ) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - blocks_to_swap_in = _make_src_to_dst( - execute_model_req.blocks_to_swap_in, "cpu", self.device) - blocks_to_swap_out = _make_src_to_dst( - execute_model_req.blocks_to_swap_out, self.device, "cpu") - blocks_to_copy = _make_src_to_dst(execute_model_req.blocks_to_copy, - self.device, self.device) - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - ) - - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - assert virtual_engine == 0 - attn_backend = self.model_runner.attn_backend - num_layers = self.model_config.get_num_layers(self.parallel_config) - - # Issue cache operations. - if worker_input.blocks_to_swap_in is not None: - src_indices, dst_indices = worker_input.blocks_to_swap_in - if src_indices.numel() > 0: - # Swap from CPU to TPU. - for i in range(num_layers): - tpu_k_cache, tpu_v_cache = self.tpu_cache[i] - cpu_k_cache, cpu_v_cache = self.cpu_cache[i] - k = cpu_k_cache[:, src_indices].to(self.device) - v = cpu_v_cache[:, src_indices].to(self.device) - _insert_kv(k, v, dst_indices, tpu_k_cache, tpu_v_cache) - - if worker_input.blocks_to_swap_out is not None: - src_indices, dst_indices = worker_input.blocks_to_swap_out - if src_indices.numel() > 0: - # Swap from TPU to CPU. 
- for i in range(num_layers): - tpu_k_cache, tpu_v_cache = self.tpu_cache[i] - cpu_k_cache, cpu_v_cache = self.cpu_cache[i] - cpu_k_cache[:, dst_indices] = tpu_k_cache[:, src_indices] - cpu_v_cache[:, dst_indices] = tpu_v_cache[:, src_indices] - - if worker_input.blocks_to_copy is not None: - src_indices, dst_indices = worker_input.blocks_to_copy - if src_indices.numel() > 0: - attn_backend.copy_blocks(self.tpu_cache, - (src_indices, dst_indices)) - - -def _make_src_to_dst( - mapping: List[Tuple[int, int]], - src_device: Union[torch.device, str], - dst_device: Union[torch.device, str], -) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: - if not mapping: - return None - - src_indices = [i for i, _ in mapping] - dst_indices = [i for _, i in mapping] - src_indices = torch.tensor(src_indices, - device=src_device, - dtype=torch.int64) - dst_indices = torch.tensor(dst_indices, - device=dst_device, - dtype=torch.int64) - return src_indices, dst_indices - - -@torch.compile(backend="openxla") -def _insert_kv( - k: torch.Tensor, - v: torch.Tensor, - indices: torch.Tensor, - tpu_k_cache: torch.Tensor, - tpu_v_cache: torch.Tensor, -) -> None: - torch.ops.xla.dynamo_set_buffer_donor_(tpu_k_cache, True) - torch.ops.xla.dynamo_set_buffer_donor_(tpu_v_cache, True) - tpu_k_cache[:, indices] = k - tpu_v_cache[:, indices] = v diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py deleted file mode 100644 index b2d3ce8526d5..000000000000 --- a/vllm/worker/xpu_model_runner.py +++ /dev/null @@ -1,606 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import time -import weakref -from collections import defaultdict -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type, TypeVar) - -import torch -import torch.nn as nn - -from vllm.attention import get_attn_backend -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadataCache -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.model_loader import get_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad -from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata -from vllm.worker.model_runner_base import ( - ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -_PAD_SLOT_ID = -1 - -TModelInputForXPU = TypeVar('TModelInputForXPU', bound="ModelInputForXPU") - - -@dataclass(frozen=True) -class ModelInputForXPU(ModelRunnerInputBase): - """ - Used by the NeuronModelRunner. 
- """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - virtual_engine: Optional[int] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - async_callback: Optional[Callable] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForXPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForXPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -@dataclass(frozen=True) -class ModelInputForXPUWithSamplingMetadata(ModelInputForXPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForXPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForXPUBuilder(ModelRunnerInputBuilderBase[ModelInputForXPU]): - - def __init__(self, - runner: "XPUModelRunner", - finished_requests_ids: Optional[List[str]] = None) -> None: - super().__init__() - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.device = self.runner.device - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - self.seq_group_metadata_list.append(seq_group_metadata) - - def build(self) -> ModelInputForXPU: - is_prompt = self.seq_group_metadata_list[0].is_prompt - # Prepare input tensors. 
- if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) = self._prepare_prompt( - self.seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode( - self.seq_group_metadata_list) - seq_lens = None - multi_modal_kwargs = None - - return self.model_input_cls( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - multi_modal_kwargs=multi_modal_kwargs, - seq_lens=seq_lens, - query_lens=seq_lens, - ) - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - computed_len = seq_data.get_num_computed_tokens() - seq_len = len(prompt_tokens) - - seq_lens.append(seq_len) # Prompt token num - input_tokens.extend(prompt_tokens) # Token ids - - # Token position ids - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - positions_range = range(computed_len, seq_len) - input_positions.extend(list(positions_range)) - - if seq_group_metadata.multi_modal_data: - # NOTE: mm_kwargs only includes the subset of multi-modal items - # that intersect with the current prefill positions. - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap \ - .from_seq_group(seq_group_metadata, positions_range) - - multi_modal_kwargs_list.append(mm_kwargs) - - for modality, placeholder_map in placeholder_maps.items(): - multi_modal_placeholder_maps[modality].extend( - placeholder_map) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. - slot_mapping.extend([_PAD_SLOT_ID] * seq_len) - continue - - # Compute the slot mapping. - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, seq_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
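# A runnable restatement of the example in the comment above. The block
# table [0, 1, 0] is an assumption chosen so that the resulting slots match
# the numbers quoted there; with a sliding window, an early physical block
# can be reused once the window has slid past its tokens.
PAD_SLOT = -1
block_size, sliding_window, prompt_len = 4, 8, 10
block_table = [0, 1, 0]             # hypothetical physical block numbers
start_idx = max(0, prompt_len - sliding_window)
slot_mapping = []
for i in range(prompt_len):
    if i < start_idx:
        slot_mapping.append(PAD_SLOT)
        continue
    block_number = block_table[i // block_size]
    slot_mapping.append(block_number * block_size + i % block_size)
assert slot_mapping == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]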
- start_idx = 0 - if self.sliding_window is not None: - start_idx = max(0, seq_len - self.sliding_window) - - for i in range(computed_len, seq_len): - if i < start_idx: - slot_mapping.append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - num_prompt_tokens = len(input_tokens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) # type: ignore - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) # type: ignore - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) # type: ignore - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - multi_modal_placeholder_maps.items() - } - - max_seqlen = max(seq_lens) - tmp = [0] - tmp.extend(seq_lens) - seqlen = torch.tensor(tmp) - seqlen_q = torch.cumsum(seqlen, dim=0).to(device=self.device) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - seq_lens=seq_lens, - seqlen_q=seqlen_q, - max_seqlen=max_seqlen, - seq_lens_tensor=torch.tensor([]), - max_decode_seq_len=0, - num_prefills=len(seq_lens), - num_prefill_tokens=num_prompt_tokens, - num_decode_tokens=0, - block_tables=torch.tensor([], device=self.device, dtype=torch.int), - ) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append(generation_token) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append(position) - - seq_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - seq_lens.append(seq_len) - - block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - max_decode_seq_len = max(seq_lens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - - block_tables = make_tensor_with_pad( - 
block_tables, - pad=0, - dtype=torch.int, - device=self.device, - ) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=False, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - seq_lens=seq_lens, - seqlen_q=torch.tensor([]), - max_seqlen=0, - seq_lens_tensor=seq_lens_tensor, - max_decode_seq_len=max_decode_seq_len, - num_prefill_tokens=0, - num_decode_tokens=len(input_tokens), - num_prefills=0, - block_tables=block_tables, - ) - return ( - input_tokens, - input_positions, - attn_metadata, - ) - - -class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): - _model_input_cls: Type[ModelInputForXPUWithSamplingMetadata] = ( - ModelInputForXPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForXPUBuilder] = ModelInputForXPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - model_config = self.model_config - cache_config = self.cache_config - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization. - self.model: nn.Module # Set after init_Model - self.sampler = get_sampler() - - self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - with DeviceMemoryProfiler() as m: - self.model = get_model(vllm_config=self.vllm_config) - - self.model_memory_usage = m.consumed_memory - logger.info("Loading model weights took %.4f GiB", - self.model_memory_usage / GiB_bytes) - - def get_model(self) -> nn.Module: - return self.model - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, which - # needs to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. 
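# A worked instance of the clamping performed just below, with hypothetical
# limits: 256 schedulable sequences, 8192 batched tokens, and a multi-modal
# item that can occupy up to 3000 tokens of the batch.
max_num_seqs, max_num_batched_tokens, max_mm_tokens = 256, 8192, 3000
clamped = max(1, min(max_num_seqs, max_num_batched_tokens // max_mm_tokens))
assert clamped == 2   # only two 3000-token images fit into an 8192-token budget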
- max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data.multi_modal_placeholders) - seqs.append(seq) - - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - self.execute_model(model_input, None, intermediate_tensors) - torch.xpu.synchronize() - return - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForXPUWithSamplingMetadata: - return ( - ModelInputForXPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForXPUWithSamplingMetadata: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - """ - builder = self.builder - builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) - - return builder.build() # type: ignore - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForXPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. 
- - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - pin_memory=False, - generators=generators, - cache=self.sampling_metadata_cache) - - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForXPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "XPUModelRunner does not support multi-step execution.") - - model_executable = self.model - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start_time = time.time() - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - # Compute the logits in the last pipeline stage. - if not get_pp_group().is_last_rank: - return hidden_or_intermediate_states - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end_time = time.time() - - # Compute the logits. - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_time = (model_forward_end_time - - model_forward_start_time) - # If there are multiple workers, we are still tracking the latency - # from the start time of the driver worker to the end time of the - # driver worker. The model forward time will then end up covering - # the communication time as well. 
- output.model_forward_time = model_forward_time - - return [output] diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py deleted file mode 100644 index fe321c059f52..000000000000 --- a/vllm/worker/xpu_worker.py +++ /dev/null @@ -1,186 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A XPU worker class.""" -import gc -import os -from typing import List, Optional, Tuple - -import intel_extension_for_pytorch # noqa: F401 -import oneccl_bindings_for_pytorch # noqa: F401 -import torch -import torch.distributed - -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.distributed.parallel_state import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor import set_random_seed -from vllm.platforms import current_platform -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase -from vllm.worker.xpu_model_runner import XPUModelRunner - -logger = init_logger(__name__) - - -class XPUWorker(LoRANotSupportedWorkerBase, Worker): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single XPU device. The worker is - responsible for maintaining the KV cache and executing the model on the - XPU. In case of distributed inference, each worker is assigned a partition - of the model. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - ) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - device_config = self.device_config - parallel_config = self.parallel_config - assert device_config.device_type == "xpu" - assert current_platform.is_xpu() - - self.parallel_config.rank = rank - - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if parallel_config and is_driver_worker: - assert rank % parallel_config.tensor_parallel_size == 0, \ - "Driver worker should be rank 0 of tensor parallel group." - - self.model_runner = XPUModelRunner( # type: ignore - vllm_config=vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - ) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - self.gpu_cache: Optional[List[List[torch.Tensor]]] - - def init_device(self) -> None: - if self.device_config.device.type == "xpu" and current_platform.is_xpu( - ): - self.device = torch.device(f"xpu:{self.local_rank}") - torch.xpu.set_device(self.device) - torch.xpu.empty_cache() - self.init_gpu_memory = torch.xpu.get_device_properties( - self.local_rank).total_memory - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - self.init_worker_distributed_environment() - # Initialize the model. - set_random_seed(self.model_config.seed) - - # keep this method for `empty_cache` and `synchronize` api - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. 
- Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.xpu.empty_cache() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - torch.xpu.synchronize() - used_memory = torch.xpu.memory_allocated() - total_gpu_memory = torch.xpu.get_device_properties( - self.local_rank).total_memory - free_gpu_memory = total_gpu_memory - used_memory - - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - cache_block_size = self.get_cache_block_size_bytes() - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - gc.collect() - torch.xpu.empty_cache() - return num_gpu_blocks, num_cpu_blocks - - def _warm_up_model(self) -> None: - # IPEX don't support capture graph yet - pass - - def init_worker_distributed_environment(self) -> None: - """Initialize the distributed environment.""" - - parallel_config = self.parallel_config - rank = self.rank - distributed_init_method = self.distributed_init_method - - if torch.distributed.is_initialized(): - torch_world_size = torch.distributed.get_world_size() - if torch_world_size != parallel_config.world_size: - raise RuntimeError( - "torch.distributed is already initialized but the torch " - "world size does not match parallel_config.world_size " - f"({torch_world_size} vs. {parallel_config.world_size}).") - elif not distributed_init_method: - raise ValueError( - "distributed_init_method must be set if torch.distributed " - "is not already initialized") - else: - # use sockets as default Level zero IPC exchange backend. By - # default oneccl will use `drmfd` as mechanism which need extra - # dependency (libdrm and drm headers) on your system. 
- ENV_CCL_ATL_TRANSPORT = os.getenv("CCL_ATL_TRANSPORT", "ofi") - ENV_LOCAL_WORLD_SIZE = os.getenv("LOCAL_WORLD_SIZE", - str(parallel_config.world_size)) - os.environ["CCL_ATL_TRANSPORT"] = ENV_CCL_ATL_TRANSPORT - os.environ["LOCAL_WORLD_SIZE"] = ENV_LOCAL_WORLD_SIZE - os.environ["LOCAL_RANK"] = str(self.local_rank) - init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - distributed_init_method=distributed_init_method, - local_rank=self.local_rank, - backend="ccl") - - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - # global all_reduce needed for overall oneccl warm up - torch.distributed.all_reduce(torch.zeros(1).xpu()) - - if parallel_config.pipeline_parallel_size > 1: - # Add pp group init to avoid - # p2p communication as the first call - get_pp_group().all_reduce(torch.zeros(1).xpu())
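
For reference, the block-count arithmetic that the removed determine_num_available_blocks performed after its profiling pass can be stated as a small pure function. The sketch below is illustrative only, not vLLM API: the helper name and the example figures are made up, while the formula and the clamping mirror the deleted code.

def estimate_num_cache_blocks(total_gpu_memory: int,
                              init_gpu_memory: int,
                              free_gpu_memory: int,
                              gpu_memory_utilization: float,
                              cache_block_size: int,
                              swap_space_bytes: int) -> tuple[int, int]:
    """Sketch of the removed KV-cache block-count computation (not vLLM API)."""
    # Peak memory consumed by the dummy forward pass, measured as the drop in
    # free device memory relative to the value recorded at init_device() time.
    peak_memory = init_gpu_memory - free_gpu_memory
    assert peak_memory > 0, (
        "Memory was not cleaned up before profiling: free memory did not "
        "shrink during the profile run.")

    # Whatever fits inside the utilization budget after subtracting the
    # profiled peak is carved into fixed-size KV-cache blocks; CPU blocks
    # come out of the configured swap space.
    num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization -
                          peak_memory) // cache_block_size)
    num_cpu_blocks = int(swap_space_bytes // cache_block_size)
    return max(num_gpu_blocks, 0), max(num_cpu_blocks, 0)


# Illustrative numbers: a 16 GiB device with a 0.9 utilization budget, a
# 2 GiB profiled peak, 2 MiB cache blocks and 4 GiB of swap space yields
# roughly 6.3k GPU blocks and 2048 CPU blocks.
GiB, MiB = 1 << 30, 1 << 20
print(estimate_num_cache_blocks(16 * GiB, 15 * GiB, 13 * GiB, 0.9,
                                2 * MiB, 4 * GiB))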
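
Similarly, the environment handling in the removed init_worker_distributed_environment boils down to three process-level defaults applied before the "ccl" backend is initialized. The helper below is a hedged restatement, not vLLM API; the function name is invented, while the variable names and the "ofi" default mirror the deleted code.

import os


def apply_ccl_env_defaults(local_rank: int, world_size: int) -> None:
    # Prefer the sockets/OFI transport so oneCCL's Level Zero IPC exchange
    # does not fall back to `drmfd`, which requires libdrm and the DRM
    # headers on the host.
    os.environ.setdefault("CCL_ATL_TRANSPORT", "ofi")
    # Fall back to the parallel world size when the launcher did not export
    # LOCAL_WORLD_SIZE, and always pin LOCAL_RANK to this worker's rank.
    os.environ.setdefault("LOCAL_WORLD_SIZE", str(world_size))
    os.environ["LOCAL_RANK"] = str(local_rank)

In the deleted code this happened just before init_distributed_environment(..., backend="ccl"), so calling a helper like this with the worker's local_rank and parallel_config.world_size would reproduce the same defaults.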