diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index dae6ead04e9c..7d9b32cd4b67 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -207,7 +207,7 @@ def get_physical_block_id(self, absolute_id: int) -> int:
         Args:
             absolute_id (int): The absolute block id for the block
-                in whole allocator.
+                in whole allocator.

         Returns:
             int: The zero-offset block id on certain device.
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 2913a01bf34a..a21d69323abb 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
     Args:
         num_blocks (int): The total number of blocks to manage.
         block_size (int): The size of each block in tokens.
-        block_ids(Optional[Iterable[int]], optional): An optional iterable of
+        block_ids (Optional[Iterable[int]], optional): An optional iterable of
             block IDs. If not provided, block IDs will be assigned sequentially
             from 0 to num_blocks - 1.
     """
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 63894e7f5dc8..c89f3f663264 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -657,7 +657,7 @@ def _schedule_running(
                 `budget.num_batched_tokens` has not enough capacity to schedule
                 all tokens.
             partial_prefill_metadata: information about the partial prefills
-                that are currently running
+                that are currently running

         Returns:
             SchedulerRunningOutputs.
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 973979fdf7df..ced8234a7b43 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -491,7 +491,8 @@ def forward(
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size * num_kv_heads * head_size]
             NOTE: kv_cache will be an empty tensor with shape [0]
             for profiling run.
             attn_metadata: Metadata for attention.
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 6e7096de924c..dd2b956d4fa3 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -438,7 +438,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 1115fc606b05..70d3471a4725 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -637,11 +637,9 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache: shape
-            # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
-            # HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
-
-
+            kv_cache: KV cache tensor with different possible shapes:
+                - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
+                - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 458562ebc8d2..a596f6b2b32a 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -689,7 +689,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index fd97db0abb84..26f9abf13d0e 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -235,7 +235,8 @@ def forward(
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+            kv_cache: shape =
+                [num_blocks, block_size, num_kv_heads * 2, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -329,7 +330,7 @@ def write_to_kv_cache(
         Args:
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+            kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size]
             num_slices_per_kv_cache_update_block: int
         """
         _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 403ad8e88a95..173a0a255e49 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -429,7 +429,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index c93223a34083..b96d957a150b 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -362,7 +362,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index b12036c59979..a37a7f6811ef 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -285,7 +285,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index e0eb7d8be974..7f888c113574 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -330,7 +330,8 @@ def forward(
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index c9d18033a198..bd2ec036834b 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -255,9 +255,9 @@ def compute_encoder_budget(
     Returns:
         - Compute budget for encoder execution, measured in number of tokens
-          from the input sequence.
+          from the input sequence.
         - Space budget for encoder cache size, measured in number of tokens
-          from the input sequence.
+          from the input sequence.
     """
     if mm_registry.supports_multimodal_inputs(model_config):
         max_tokens_by_modality = mm_registry \
@@ -303,9 +303,9 @@ def compute_mm_encoder_budget(
     Returns:
         - Compute budget for encoder execution, measured in number of tokens
-          from the input sequence.
+          from the input sequence.
         - Space budget for encoder cache size, measured in number of tokens
-          from the input sequence.
+          from the input sequence.
     """

     if not max_tokens_by_modality:
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index f082ad00f2e3..9421341f990c 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -119,7 +119,8 @@ def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:
         Args:
             request: The request.
-            num_tokens: The total number of tokens that need to be cached
+            num_computed_tokens: The total number of tokens
+                that need to be cached
                 (including tokens that are already cached).
""" for manager in self.single_type_managers: diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b427a9c497fe..87a11fe58a04 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -54,14 +54,15 @@ def get_block_ids( def get_block_ids( self, allow_none: bool = False, - ): + ) -> Optional[tuple[list[int], ...]]: """ Converts the KVCacheBlocks instance to block_ids. - + Returns: - tuple[list[int], ...]: A tuple of lists where - * the outer tuple corresponds to KV cache groups - * each inner list contains the block_ids of the blocks in that group + tuple[list[int], ...]: A tuple of lists where: + - the outer tuple corresponds to KV cache groups + - each inner list contains the block_ids of the blocks in that + group """ if allow_none and all(len(group) == 0 for group in self.blocks): return None diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index c05ad1966d61..8394ae788ab0 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -8,6 +8,7 @@ from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput @@ -64,7 +65,7 @@ def max_concurrent_batches(self) -> int: def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: """Execute the model on the Ray workers. diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index 61ba5d66cb31..a43cf9ce255e 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -36,7 +36,7 @@ def setup_multiprocess_prometheus(): "and vLLM will properly handle cleanup.") -def get_prometheus_registry(): +def get_prometheus_registry() -> CollectorRegistry: """Get the appropriate prometheus registry based on multiprocessing configuration. diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 12b4db24bff8..c9c649705dfd 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -80,7 +80,7 @@ def update_state( to each forward pass. Args: - batch_update is non-None iff there have been - changes to the batch makeup. + batch_update: Non-None iff there have been changes + to the batch makeup. """ raise NotImplementedError diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index b2354c53302a..2d9ce3101b6c 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -68,7 +68,7 @@ def forward( different requests are flattened into a single tensor because this is the shape of the output logits. NOTE: `target_logits` can be updated in place to save memory. - bonus_token_ids_tensor (torch.Tensor): + bonus_token_ids (torch.Tensor): A tensor containing bonus tokens. Shape is [batch_size, 1]. Bonus tokens are added to the end of the sequence if all proposed tokens are accepted. 
                 We generate the bonus tokens
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index 04545d587e4a..e84136e3a6d0 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -89,7 +89,7 @@ def gather_logprobs(
         Gather logprobs for topk and sampled/prompt token.

         Args:
-            logits: (num tokens) x (vocab) tensor
+            logprobs: (num tokens) x (vocab) tensor
             num_logprobs: minimum number of logprobs to retain per token
             token_ids: prompt tokens (if prompt logprobs)
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index d500783aa4b3..9a53aa7a1ad1 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -110,7 +110,7 @@ def compile_grammar(self, request_type: StructuredOutputOptions,
         Args:
             request_type (StructuredOutputOptions): The type of structured
-                output request.
+                output request.
             grammar_spec (str): The grammar specification to compile.

         Returns:
@@ -124,7 +124,7 @@ def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
         Args:
             max_num_seqs (int): The maximum number of sequences for which
-                to allocate the bitmask.
+                to allocate the bitmask.
         """

     @abstractmethod
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 284af6bfedce..f4c2f45df595 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -525,9 +525,6 @@ def condense(self) -> None:
         Any consecutive empty indices at the very end of the list are not
         filled.

-        Args:
-            empty_req_indices: empty indices which may be filled.
-
         Returns:
             swaps: list of (from,to) swap tuples for moved requests
             empty_req_indices: indices not filled by condensation
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d93460d618e7..ed2a54eebc98 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2953,7 +2953,7 @@ def _reshape_kv_cache_tensors(
         Args:
             kv_cache_config: The KV cache config
             kv_cache_raw_tensors: The KV cache buffer of each layer, with
-                correct size but uninitialized shape.
+                correct size but uninitialized shape.
         Returns:
             Dict[str, torch.Tensor]: A map between layer names to their
                 corresponding memory buffer for KV cache.
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index d36423660427..70ffde39ca33 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -552,7 +552,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return kv_cache_spec

     def _get_slot_mapping_metadata(self, num_reqs,
-                                   num_scheduled_tokens_per_req):
+                                   num_scheduled_tokens_per_req) -> np.ndarray:
         """
         Computes metadata for mapping slots to blocks in the key-value (KV)
         cache for a batch of requests.
@@ -565,15 +565,15 @@ def _get_slot_mapping_metadata(self, num_reqs,
         Args:
             num_reqs (int): Number of requests in the current batch.
             num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens
-                to be scheduled for each request.
+                to be scheduled for each request.

         Returns:
             np.ndarray: A 2D array of shape (total_block_len, 3), where each row
-                contains:
+                contains:
                 - kv_cache_start_index (int): The starting index in the KV cache
-                  for the corresponding slice.
+                  for the corresponding slice.
                 - new_kv_start_index (int): The starting index in the new KV
-                  cache for the corresponding slice.
+                  cache for the corresponding slice.
                 - slice_len (int): The length of the slice.
""" slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs] diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index f40753468766..a519336e4161 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -172,10 +172,10 @@ def scatter_mm_placeholders( Args: embeds: The multimodal embeddings. - Shape: `(num_embeds, embed_dim)` + Shape: `(num_embeds, embed_dim)` is_embed: A boolean mask indicating which positions in the placeholder - tokens need to be filled with multimodal embeddings. - Shape: `(num_placeholders, num_embeds)` + tokens need to be filled with multimodal embeddings. + Shape: `(num_placeholders, num_embeds)` """ if is_embed is None: return embeds @@ -278,7 +278,7 @@ def bind_kv_cache( Args: kv_caches: The allocated kv_caches with layer names as keys. forward_context: The global forward context containing all Attention - layers with layer names as keys. + layers with layer names as keys. runner_kv_caches: The kv_cache declared by ModelRunner. """ # Bind kv_caches to ModelRunner diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 9c93754f93f8..038ce4b54f96 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -36,8 +36,8 @@ def __init__( local_rank: Local device index rank: Global rank in distributed setup distributed_init_method: Distributed initialization method - is_driver_worker: Whether this worker handles driver - responsibilities + is_driver_worker: Whether this worker handles driver + responsibilities """ # Configuration storage super().__init__(vllm_config=vllm_config)