From 2c388c2476bd9dd6091851f8eb65bbb00684c3ad Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 20:30:12 +0900 Subject: [PATCH 01/27] docs: docstring in naive_block & prefix_caching_block WARNING - griffe: vllm/core/block/naive_block.py:210: Failed to get 'name: description' pair from 'in whole allocator.' WARNING - griffe: vllm/core/block/prefix_caching_block.py:64: Parameter 'block_ids(Optional[Iterable[int]],' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/core/block/naive_block.py | 2 +- vllm/core/block/prefix_caching_block.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index dae6ead04e9c..7d9b32cd4b67 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -207,7 +207,7 @@ def get_physical_block_id(self, absolute_id: int) -> int: Args: absolute_id (int): The absolute block id for the block - in whole allocator. + in whole allocator. Returns: int: The zero-offset block id on certain device. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 2913a01bf34a..a21d69323abb 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator): Args: num_blocks (int): The total number of blocks to manage. block_size (int): The size of each block in tokens. - block_ids(Optional[Iterable[int]], optional): An optional iterable of + block_ids (Optional[Iterable[int]], optional): An optional iterable of block IDs. If not provided, block IDs will be assigned sequentially from 0 to num_blocks - 1. """ From ff3d8089e71843127999bb355e303dbab18883a4 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 20:45:00 +0900 Subject: [PATCH 02/27] docs: docstring in scheduler WARNING - griffe: vllm/core/scheduler.py:660: Failed to get 'name: description' pair from 'that are currently running' Signed-off-by: Zerohertz --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 63894e7f5dc8..c89f3f663264 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -657,7 +657,7 @@ def _schedule_running( `budget.num_batched_tokens` has not enough capacity to schedule all tokens. partial_prefill_metadata: information about the partial prefills - that are currently running + that are currently running Returns: SchedulerRunningOutputs. 
From 489ca3eab4a076122e144a41430fe7becdb61120 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 20:53:16 +0900 Subject: [PATCH 03/27] docs: docstring in flash_attn & flashinfer WARNING - griffe: vllm/v1/attention/backends/flash_attn.py:441: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' WARNING - griffe: vllm/v1/attention/backends/flashinfer.py:641: Parameter '#' does not appear in the function signature WARNING - griffe: vllm/v1/attention/backends/flashinfer.py:642: Parameter '#' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/v1/attention/backends/flash_attn.py | 3 ++- vllm/v1/attention/backends/flashinfer.py | 8 +++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 6e7096de924c..a9f3d8ee2a63 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -438,7 +438,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: KV cache tensor with shape + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1115fc606b05..70d3471a4725 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -637,11 +637,9 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: shape - - # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] - # HND: [num_blocks, 2, num_kv_heads, block_size, head_size] - - + kv_cache: KV cache tensor with different possible shapes: + - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size] + - HND: [num_blocks, 2, num_kv_heads, block_size, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From 6c43f106c158c748ab2f911327d68285ae6ad663 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 20:58:51 +0900 Subject: [PATCH 04/27] docs: docstring in v1 worker utils WARNING - griffe: vllm/v1/worker/utils.py:280: Failed to get 'name: description' pair from 'layers with layer names as keys.' WARNING - griffe: vllm/v1/worker/utils.py:174: Confusing indentation for continuation line 8 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/worker/utils.py:176: Confusing indentation for continuation line 10 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/worker/utils.py:177: Confusing indentation for continuation line 11 in docstring, should be 4 * 2 = 8 spaces, not 6 Signed-off-by: Zerohertz --- vllm/v1/worker/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index f40753468766..a519336e4161 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -172,10 +172,10 @@ def scatter_mm_placeholders( Args: embeds: The multimodal embeddings. 
- Shape: `(num_embeds, embed_dim)` + Shape: `(num_embeds, embed_dim)` is_embed: A boolean mask indicating which positions in the placeholder - tokens need to be filled with multimodal embeddings. - Shape: `(num_placeholders, num_embeds)` + tokens need to be filled with multimodal embeddings. + Shape: `(num_placeholders, num_embeds)` """ if is_embed is None: return embeds @@ -278,7 +278,7 @@ def bind_kv_cache( Args: kv_caches: The allocated kv_caches with layer names as keys. forward_context: The global forward context containing all Attention - layers with layer names as keys. + layers with layer names as keys. runner_kv_caches: The kv_cache declared by ModelRunner. """ # Bind kv_caches to ModelRunner From fd0977084553ea8212b237368ba39f2e98dc95f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Wed, 27 Aug 2025 21:39:01 +0900 Subject: [PATCH 05/27] docs: docstring format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Hyogeun Oh (오효근) --- vllm/v1/attention/backends/flash_attn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a9f3d8ee2a63..04c4b6c35a86 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -438,8 +438,7 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: KV cache tensor with shape - [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From 1b73b1faded86322a7e4e21ef93c03d74eb40e59 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:12:22 +0900 Subject: [PATCH 06/27] docs: docstring and type hint in kv_cache_manager WARNING - griffe: vllm/v1/core/kv_cache_manager.py:61: No type or annotation for returned value 1 WARNING - griffe: vllm/v1/core/kv_cache_manager.py:62: No type or annotation for returned value 2 WARNING - griffe: vllm/v1/core/kv_cache_manager.py:63: No type or annotation for returned value 3 Signed-off-by: Zerohertz --- vllm/v1/core/kv_cache_manager.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b427a9c497fe..87a11fe58a04 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -54,14 +54,15 @@ def get_block_ids( def get_block_ids( self, allow_none: bool = False, - ): + ) -> Optional[tuple[list[int], ...]]: """ Converts the KVCacheBlocks instance to block_ids. 
- + Returns: - tuple[list[int], ...]: A tuple of lists where - * the outer tuple corresponds to KV cache groups - * each inner list contains the block_ids of the blocks in that group + tuple[list[int], ...]: A tuple of lists where: + - the outer tuple corresponds to KV cache groups + - each inner list contains the block_ids of the blocks in that + group """ if allow_none and all(len(group) == 0 for group in self.blocks): return None From 067fa002aee31089de082f85654f7b3af110b048 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:21:54 +0900 Subject: [PATCH 07/27] docs: docstring and type hint in tpu_model_runner WARNING - griffe: vllm/v1/worker/tpu_model_runner.py:567: Failed to get 'name: description' pair from 'to be scheduled for each request.' WARNING - griffe: vllm/v1/worker/tpu_model_runner.py:570: No type or annotation for returned value 1 WARNING - griffe: vllm/v1/worker/tpu_model_runner.py:571: No type or annotation for returned value 'contains' Signed-off-by: Zerohertz --- vllm/v1/worker/tpu_model_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d36423660427..70ffde39ca33 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -552,7 +552,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return kv_cache_spec def _get_slot_mapping_metadata(self, num_reqs, - num_scheduled_tokens_per_req): + num_scheduled_tokens_per_req) -> np.ndarray: """ Computes metadata for mapping slots to blocks in the key-value (KV) cache for a batch of requests. @@ -565,15 +565,15 @@ def _get_slot_mapping_metadata(self, num_reqs, Args: num_reqs (int): Number of requests in the current batch. num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens - to be scheduled for each request. + to be scheduled for each request. Returns: np.ndarray: A 2D array of shape (total_block_len, 3), where each row - contains: + contains: - kv_cache_start_index (int): The starting index in the KV cache - for the corresponding slice. + for the corresponding slice. - new_kv_start_index (int): The starting index in the new KV - cache for the corresponding slice. + cache for the corresponding slice. - slice_len (int): The length of the slice. 
""" slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs] From 00cbb5d08bc1655ea1c08fa3a97c6715272c9595 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:27:52 +0900 Subject: [PATCH 08/27] docs: indent in encoder_cache_manager WARNING - griffe: vllm/v1/core/encoder_cache_manager.py:258: Confusing indentation for continuation line 6 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/core/encoder_cache_manager.py:260: Confusing indentation for continuation line 8 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/core/encoder_cache_manager.py:306: Confusing indentation for continuation line 11 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/core/encoder_cache_manager.py:308: Confusing indentation for continuation line 13 in docstring, should be 4 * 2 = 8 spaces, not 6 Signed-off-by: Zerohertz --- vllm/v1/core/encoder_cache_manager.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index c9d18033a198..bd2ec036834b 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -255,9 +255,9 @@ def compute_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if mm_registry.supports_multimodal_inputs(model_config): max_tokens_by_modality = mm_registry \ @@ -303,9 +303,9 @@ def compute_mm_encoder_budget( Returns: - Compute budget for encoder execution, measured in number of tokens - from the input sequence. + from the input sequence. - Space budget for encoder cache size, measured in number of tokens - from the input sequence. + from the input sequence. """ if not max_tokens_by_modality: From c3bbc276e07446dc2ab1174c5c7910431de17aad Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:36:27 +0900 Subject: [PATCH 09/27] docs: docstring in cpu_attn WARNING - griffe: vllm/v1/attention/backends/cpu_attn.py:494: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/cpu_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index 973979fdf7df..ced8234a7b43 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -491,7 +491,8 @@ def forward( query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + kv_cache: shape = + [2, num_blocks, block_size * num_kv_heads * head_size] NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. 
From b76bc2deed80c866d54971d317fcbf1f25e54c0b Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:37:36 +0900 Subject: [PATCH 10/27] docs: docstring in flex_attention WARNING - griffe: vllm/v1/attention/backends/flex_attention.py:692: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/flex_attention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 458562ebc8d2..a596f6b2b32a 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -689,7 +689,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From d0f6245729ee6f640865d9de9d347270bc0b3c4d Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:39:25 +0900 Subject: [PATCH 11/27] docs: docstring in pallas WARNING - griffe: vllm/v1/attention/backends/pallas.py:238: Failed to get 'name: description' pair from 'kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]' WARNING - griffe: vllm/v1/attention/backends/pallas.py:332: Failed to get 'name: description' pair from 'kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/pallas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index fd97db0abb84..26f9abf13d0e 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -235,7 +235,8 @@ def forward( query: shape = [num_tokens, num_heads * head_size] key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = + [num_blocks, block_size, num_kv_heads * 2, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] @@ -329,7 +330,7 @@ def write_to_kv_cache( Args: key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] + kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size] num_slices_per_kv_cache_update_block: int """ _, page_size, num_combined_kv_heads, head_size = kv_cache.shape From 1dad7592acb2142cb509941249cdddc6632ede4a Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:49:15 +0900 Subject: [PATCH 12/27] docs: docstring in rocm_aiter_fa WARNING - griffe: vllm/v1/attention/backends/rocm_aiter_fa.py:432: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/rocm_aiter_fa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 403ad8e88a95..173a0a255e49 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -429,7 +429,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From 5a3dd50ad676f7c2421d7c500d83f4165b044471 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:50:31 +0900 Subject: [PATCH 13/27] docs: docstring in tree_attn WARNING - griffe: vllm/v1/attention/backends/tree_attn.py:365: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/tree_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index c93223a34083..b96d957a150b 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -362,7 +362,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] From 8c0b40c45da76e5c45a91b56698375fdc87ba7ce Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:52:10 +0900 Subject: [PATCH 14/27] docs: docstring in triton_attn WARNING - griffe: vllm/v1/attention/backends/triton_attn.py:288: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/triton_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index b12036c59979..a37a7f6811ef 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -285,7 +285,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From a0a56bd839047059f0bc0823d5851743abd9d0ca Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:53:02 +0900 Subject: [PATCH 15/27] docs: docstring in xformers WARNING - griffe: vllm/v1/attention/backends/xformers.py:333: Failed to get 'name: description' pair from 'kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]' Signed-off-by: Zerohertz --- vllm/v1/attention/backends/xformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index e0eb7d8be974..7f888c113574 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -330,7 +330,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] From 70639c72cf87eef65acfeceb005b3f0edbce86be Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 21:54:10 +0900 Subject: [PATCH 16/27] docs: docstring in kv_cache_coordinator WARNING - griffe: vllm/v1/core/kv_cache_coordinator.py:121: No type or annotation for parameter 'num_tokens' WARNING - griffe: vllm/v1/core/kv_cache_coordinator.py:121: Parameter 'num_tokens' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/v1/core/kv_cache_coordinator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index f082ad00f2e3..9421341f990c 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -119,7 +119,8 @@ def cache_blocks(self, request: Request, num_computed_tokens: int) -> None: Args: request: The request. - num_tokens: The total number of tokens that need to be cached + num_computed_tokens: The total number of tokens + that need to be cached (including tokens that are already cached). 
""" for manager in self.single_type_managers: From d89966fef22515e467fb488730d1d411817fecff Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:00:16 +0900 Subject: [PATCH 17/27] docs: type hint of ray_distributed_executor WARNING - griffe: vllm/v1/executor/ray_distributed_executor.py:72: No type or annotation for parameter 'scheduler_output' Signed-off-by: Zerohertz --- vllm/v1/executor/ray_distributed_executor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index c05ad1966d61..8394ae788ab0 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -8,6 +8,7 @@ from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput @@ -64,7 +65,7 @@ def max_concurrent_batches(self) -> int: def execute_model( self, - scheduler_output, + scheduler_output: SchedulerOutput, ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: """Execute the model on the Ray workers. From a2e17ae846b1ee938174fd5a949e51f11bbba3c2 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:00:59 +0900 Subject: [PATCH 18/27] docs: type hint of prometheus WARNING - griffe: vllm/v1/metrics/prometheus.py:44: No type or annotation for returned value 'Registry' Signed-off-by: Zerohertz --- vllm/v1/metrics/prometheus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index 61ba5d66cb31..a43cf9ce255e 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -36,7 +36,7 @@ def setup_multiprocess_prometheus(): "and vLLM will properly handle cleanup.") -def get_prometheus_registry(): +def get_prometheus_registry() -> CollectorRegistry: """Get the appropriate prometheus registry based on multiprocessing configuration. From 5b026241161cfc62889f227267181ef260df6241 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:03:10 +0900 Subject: [PATCH 19/27] docs: docstring in logits_processor interface WARNING - griffe: vllm/v1/sample/logits_processor/interface.py:83: Failed to get 'name: description' pair from 'batch_update is non-None iff there have been' WARNING - griffe: vllm/v1/sample/logits_processor/interface.py:84: Failed to get 'name: description' pair from 'changes to the batch makeup.' Signed-off-by: Zerohertz --- vllm/v1/sample/logits_processor/interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 12b4db24bff8..4d3aaf43411e 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -80,7 +80,7 @@ def update_state( to each forward pass. Args: - batch_update is non-None iff there have been - changes to the batch makeup. + batch_update: Non-None iff there have been changes + to the batch makeup. 
""" raise NotImplementedError From 596bb46e34cc6a7a985d1d0e0c3f96e147fbdeb7 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:06:38 +0900 Subject: [PATCH 20/27] docs: docstring in rejection_sampler WARNING - griffe: vllm/v1/sample/rejection_sampler.py:70: Parameter 'bonus_token_ids_tensor' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/v1/sample/rejection_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index b2354c53302a..2d9ce3101b6c 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -68,7 +68,7 @@ def forward( different requests are flattened into a single tensor because this is the shape of the output logits. NOTE: `target_logits` can be updated in place to save memory. - bonus_token_ids_tensor (torch.Tensor): + bonus_token_ids (torch.Tensor): A tensor containing bonus tokens. Shape is [batch_size, 1]. Bonus tokens are added to the end of the sequence if all proposed tokens are accepted. We generate the bonus tokens From ca1a7292bb79951ace6dd906ab9a54c58989e083 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:07:23 +0900 Subject: [PATCH 21/27] docs: docstring in tpu sampler WARNING - griffe: vllm/v1/sample/tpu/sampler.py:91: No type or annotation for parameter 'logits' WARNING - griffe: vllm/v1/sample/tpu/sampler.py:91: Parameter 'logits' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/v1/sample/tpu/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 04545d587e4a..e84136e3a6d0 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -89,7 +89,7 @@ def gather_logprobs( Gather logprobs for topk and sampled/prompt token. Args: - logits: (num tokens) x (vocab) tensor + logprobs: (num tokens) x (vocab) tensor num_logprobs: minimum number of logprobs to retain per token token_ids: prompt tokens (if prompt logprobs) From fb487bde0475b9046e839d9e340f1b86fa92a9ee Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:08:15 +0900 Subject: [PATCH 22/27] docs: indent in structured_output backend_types WARNING - griffe: vllm/v1/structured_output/backend_types.py:126: Confusing indentation for continuation line 5 in docstring, should be 4 * 2 = 8 spaces, not 6 WARNING - griffe: vllm/v1/structured_output/backend_types.py:112: Confusing indentation for continuation line 5 in docstring, should be 4 * 2 = 8 spaces, not 6 Signed-off-by: Zerohertz --- vllm/v1/structured_output/backend_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index d500783aa4b3..9a53aa7a1ad1 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -110,7 +110,7 @@ def compile_grammar(self, request_type: StructuredOutputOptions, Args: request_type (StructuredOutputOptions): The type of structured - output request. + output request. grammar_spec (str): The grammar specification to compile. Returns: @@ -124,7 +124,7 @@ def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor: Args: max_num_seqs (int): The maximum number of sequences for which - to allocate the bitmask. + to allocate the bitmask. 
""" @abstractmethod From 36b703474ff1bde3df746de95f10c81396a5a8ad Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:09:46 +0900 Subject: [PATCH 23/27] docs: docstring in worker gpu_input_batch WARNING - griffe: vllm/v1/worker/gpu_input_batch.py:529: No type or annotation for parameter 'empty_req_indices' WARNING - griffe: vllm/v1/worker/gpu_input_batch.py:529: Parameter 'empty_req_indices' does not appear in the function signature Signed-off-by: Zerohertz --- vllm/v1/worker/gpu_input_batch.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 284af6bfedce..f4c2f45df595 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -525,9 +525,6 @@ def condense(self) -> None: Any consecutive empty indices at the very end of the list are not filled. - Args: - empty_req_indices: empty indices which may be filled. - Returns: swaps: list of (from,to) swap tuples for moved requests empty_req_indices: indices not filled by condensation From 717c637b2df27a2241627186168e801f7315d464 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:11:01 +0900 Subject: [PATCH 24/27] docs: indent in worker gpu_model_runner WARNING - griffe: vllm/v1/worker/gpu_model_runner.py:2955: Failed to get 'name: description' pair from 'correct size but uninitialized shape.' Signed-off-by: Zerohertz --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d93460d618e7..ed2a54eebc98 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2953,7 +2953,7 @@ def _reshape_kv_cache_tensors( Args: kv_cache_config: The KV cache config kv_cache_raw_tensors: The KV cache buffer of each layer, with - correct size but uninitialized shape. + correct size but uninitialized shape. Returns: Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. 
From 85034bb06f1b3bce5d81d1cf0ec4cbbf2084e658 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:14:10 +0900 Subject: [PATCH 25/27] docs: indent in worker_base WARNING - griffe: vllm/v1/worker/worker_base.py:39: Failed to get 'name: description' pair from 'responsibilities' Signed-off-by: Zerohertz --- vllm/v1/worker/worker_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 9c93754f93f8..038ce4b54f96 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -36,8 +36,8 @@ def __init__( local_rank: Local device index rank: Global rank in distributed setup distributed_init_method: Distributed initialization method - is_driver_worker: Whether this worker handles driver - responsibilities + is_driver_worker: Whether this worker handles driver + responsibilities """ # Configuration storage super().__init__(vllm_config=vllm_config) From 8baf0cedec4a4fbd4316cb1246975f8866a42152 Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:16:07 +0900 Subject: [PATCH 26/27] docs: indent in logits_processor interface Signed-off-by: Zerohertz --- vllm/v1/sample/logits_processor/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py index 4d3aaf43411e..c9c649705dfd 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -81,6 +81,6 @@ def update_state( Args: batch_update: Non-None iff there have been changes - to the batch makeup. + to the batch makeup. """ raise NotImplementedError From 427f4d885f7c2b7a34b39f05aa99b7bbcd13366f Mon Sep 17 00:00:00 2001 From: Zerohertz Date: Wed, 27 Aug 2025 22:28:33 +0900 Subject: [PATCH 27/27] style: ruff Signed-off-by: Zerohertz --- vllm/v1/attention/backends/flash_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 04c4b6c35a86..dd2b956d4fa3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -438,7 +438,8 @@ def forward( query: shape = [num_tokens, num_heads, head_size] key: shape = [num_tokens, num_kv_heads, head_size] value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: shape = [2, num_blocks, block_size, num_kv_heads, head_size] + kv_cache: shape = + [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size]