Merged

28 commits
2c388c2
docs: docstring in naive_block
Zerohertz Aug 27, 2025
ff3d808
docs: docstring in scheduler
Zerohertz Aug 27, 2025
489ca3e
docs: docstring in flash_attn & flashinfer
Zerohertz Aug 27, 2025
6c43f10
docs: docstring in v1 worker utils
Zerohertz Aug 27, 2025
fd09770
docs: docstring format
Zerohertz Aug 27, 2025
1b73b1f
docs: docstring and type hin in kv_cache_manager
Zerohertz Aug 27, 2025
067fa00
docs: docstring and type hint in tpu_model_runner
Zerohertz Aug 27, 2025
00cbb5d
docs: indent in encoder_cache_manager
Zerohertz Aug 27, 2025
c3bbc27
docs: docstring in cpu_attn
Zerohertz Aug 27, 2025
b76bc2d
docs: docstring in flex_attention
Zerohertz Aug 27, 2025
d0f6245
docs: docstring in pallas
Zerohertz Aug 27, 2025
1dad759
docs: docstring in rocm_aiter_fa
Zerohertz Aug 27, 2025
5a3dd50
docs: docstring in tree_attn
Zerohertz Aug 27, 2025
8c0b40c
docs: docstring in triton_attn
Zerohertz Aug 27, 2025
a0a56bd
docs: docstring in xformers
Zerohertz Aug 27, 2025
70639c7
docs: docstring in kv_cache_coordinator
Zerohertz Aug 27, 2025
d89966f
docs: type hint of ray_distributed_executor
Zerohertz Aug 27, 2025
a2e17ae
docs: type hint of prometheus
Zerohertz Aug 27, 2025
5b02624
docs: docstring in logits_processor interface
Zerohertz Aug 27, 2025
596bb46
docs: docstring in rejection_sampler
Zerohertz Aug 27, 2025
ca1a729
docs: docstring in tpu sampler
Zerohertz Aug 27, 2025
fb487bd
docs: indent in structured_output backend_types
Zerohertz Aug 27, 2025
36b7034
docs: docstring in worker gpu_input_batch
Zerohertz Aug 27, 2025
717c637
docs: indent in worker gpu_model_runner
Zerohertz Aug 27, 2025
85034bb
docs: indent in worker_base
Zerohertz Aug 27, 2025
8baf0ce
docs: indent in logits_processor interface
Zerohertz Aug 27, 2025
427f4d8
style: ruff
Zerohertz Aug 27, 2025
57fc53d
Merge branch 'main' into docs/mkdocs-warnings
Zerohertz Aug 27, 2025
2 changes: 1 addition & 1 deletion vllm/core/block/naive_block.py
@@ -207,7 +207,7 @@ def get_physical_block_id(self, absolute_id: int) -> int:

Args:
absolute_id (int): The absolute block id for the block
in whole allocator.
in whole allocator.

Returns:
int: The zero-offset block id on certain device.
2 changes: 1 addition & 1 deletion vllm/core/block/prefix_caching_block.py
@@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
Args:
num_blocks (int): The total number of blocks to manage.
block_size (int): The size of each block in tokens.
- block_ids(Optional[Iterable[int]], optional): An optional iterable of
+ block_ids (Optional[Iterable[int]], optional): An optional iterable of
block IDs. If not provided, block IDs will be assigned sequentially
from 0 to num_blocks - 1.
"""
2 changes: 1 addition & 1 deletion vllm/core/scheduler.py
@@ -657,7 +657,7 @@ def _schedule_running(
`budget.num_batched_tokens` has not enough capacity to schedule
all tokens.
partial_prefill_metadata: information about the partial prefills
that are currently running
that are currently running

Returns:
SchedulerRunningOutputs.
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/cpu_attn.py
@@ -491,7 +491,8 @@ def forward(
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
- kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size * num_kv_heads * head_size]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/flash_attn.py
@@ -438,7 +438,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
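For context on the shape documented above, here is a minimal sketch of indexing a cache laid out as [2, num_blocks, block_size, num_kv_heads, head_size]; the sizes and the write_slot helper are illustrative assumptions, not vLLM APIs.

import torch

num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 8
# Leading axis: index 0 holds keys, index 1 holds values.
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)

def write_slot(cache: torch.Tensor, slot: int,
               key: torch.Tensor, value: torch.Tensor) -> None:
    # A flat slot id maps to (block index, offset inside the block).
    block_idx, block_off = divmod(slot, cache.shape[2])
    cache[0, block_idx, block_off] = key    # [num_kv_heads, head_size]
    cache[1, block_idx, block_off] = value  # [num_kv_heads, head_size]

write_slot(kv_cache, 19,
           torch.randn(num_kv_heads, head_size),
           torch.randn(num_kv_heads, head_size))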
8 changes: 3 additions & 5 deletions vllm/v1/attention/backends/flashinfer.py
@@ -637,11 +637,9 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache: shape -
- # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
- # HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
+ kv_cache: KV cache tensor with different possible shapes:
+     - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
+     - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
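The two layouts named above differ only in the order of the token and head axes. A minimal sketch of that relationship, with made-up sizes:

import torch

num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 8
# NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
nhd = torch.randn(num_blocks, 2, block_size, num_kv_heads, head_size)
# HND swaps the token and head axes:
# [num_blocks, 2, num_kv_heads, block_size, head_size]
hnd = nhd.permute(0, 1, 3, 2, 4).contiguous()
assert hnd.shape == (num_blocks, 2, num_kv_heads, block_size, head_size)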
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/flex_attention.py
@@ -689,7 +689,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
5 changes: 3 additions & 2 deletions vllm/v1/attention/backends/pallas.py
@@ -235,7 +235,8 @@ def forward(
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
- kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+ kv_cache: shape =
+     [num_blocks, block_size, num_kv_heads * 2, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
@@ -329,7 +330,7 @@ def write_to_kv_cache(
Args:
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+ kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size]
num_slices_per_kv_cache_update_block: int
"""
_, page_size, num_combined_kv_heads, head_size = kv_cache.shape
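A small sketch of the combined-KV-head bookkeeping shown above; the sizes are made up, and how K and V are ordered along the combined axis is left to the backend and not assumed here.

import torch

num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 8
kv_cache = torch.zeros(num_blocks, block_size, num_kv_heads * 2, head_size)

# Mirrors the unpacking above: the third axis folds K and V heads together,
# so the true number of KV heads is num_combined_kv_heads // 2.
_, page_size, num_combined_kv_heads, _ = kv_cache.shape
assert page_size == block_size
assert num_combined_kv_heads // 2 == num_kv_heads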
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -429,7 +429,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/tree_attn.py
@@ -362,7 +362,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/triton_attn.py
@@ -285,7 +285,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/xformers.py
@@ -330,7 +330,8 @@ def forward(
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+ kv_cache: shape =
+     [2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
8 changes: 4 additions & 4 deletions vllm/v1/core/encoder_cache_manager.py
@@ -255,9 +255,9 @@ def compute_encoder_budget(

Returns:
- Compute budget for encoder execution, measured in number of tokens
from the input sequence.
from the input sequence.
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
from the input sequence.
"""
if mm_registry.supports_multimodal_inputs(model_config):
max_tokens_by_modality = mm_registry \
@@ -303,9 +303,9 @@ def compute_mm_encoder_budget(

Returns:
- Compute budget for encoder execution, measured in number of tokens
from the input sequence.
from the input sequence.
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
from the input sequence.
"""

if not max_tokens_by_modality:
3 changes: 2 additions & 1 deletion vllm/v1/core/kv_cache_coordinator.py
@@ -119,7 +119,8 @@ def cache_blocks(self, request: Request, num_computed_tokens: int) -> None:

Args:
request: The request.
- num_tokens: The total number of tokens that need to be cached
+ num_computed_tokens: The total number of tokens
+     that need to be cached
(including tokens that are already cached).
"""
for manager in self.single_type_managers:
11 changes: 6 additions & 5 deletions vllm/v1/core/kv_cache_manager.py
@@ -54,14 +54,15 @@ def get_block_ids(
def get_block_ids(
self,
allow_none: bool = False,
- ):
+ ) -> Optional[tuple[list[int], ...]]:
"""
Converts the KVCacheBlocks instance to block_ids.

Returns:
- tuple[list[int], ...]: A tuple of lists where
-     * the outer tuple corresponds to KV cache groups
-     * each inner list contains the block_ids of the blocks in that group
+ tuple[list[int], ...]: A tuple of lists where:
+     - the outer tuple corresponds to KV cache groups
+     - each inner list contains the block_ids of the blocks in that
+       group
"""
if allow_none and all(len(group) == 0 for group in self.blocks):
return None
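To make the documented nesting concrete, a hypothetical return value (the block ids are invented):

# Two KV cache groups: the first owning blocks 0 and 3, the second block 7.
block_ids: tuple[list[int], ...] = ([0, 3], [7])
for group_idx, ids in enumerate(block_ids):
    print(f"group {group_idx}: block ids {ids}")
# With allow_none=True and every group empty, None is returned instead.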
3 changes: 2 additions & 1 deletion vllm/v1/executor/ray_distributed_executor.py
@@ -8,6 +8,7 @@
from vllm.executor.ray_distributed_executor import ( # noqa
RayDistributedExecutor as RayDistributedExecutorV0)
from vllm.logger import init_logger
+ from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import ModelRunnerOutput
@@ -64,7 +65,7 @@ def max_concurrent_batches(self) -> int:

def execute_model(
self,
- scheduler_output,
+ scheduler_output: SchedulerOutput,
) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
"""Execute the model on the Ray workers.

2 changes: 1 addition & 1 deletion vllm/v1/metrics/prometheus.py
@@ -36,7 +36,7 @@ def setup_multiprocess_prometheus():
"and vLLM will properly handle cleanup.")


- def get_prometheus_registry():
+ def get_prometheus_registry() -> CollectorRegistry:
"""Get the appropriate prometheus registry based on multiprocessing
configuration.
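The new annotation refers to prometheus_client's CollectorRegistry. For context, a multiprocess-aware registry is commonly assembled with that library as sketched below; this is generic prometheus_client usage, not necessarily vLLM's exact logic.

import os
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess

def make_registry() -> CollectorRegistry:
    # With PROMETHEUS_MULTIPROC_DIR set, metrics from all worker processes
    # are aggregated into a fresh registry; otherwise the default global
    # registry is returned.
    if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        return registry
    return REGISTRY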

4 changes: 2 additions & 2 deletions vllm/v1/sample/logits_processor/interface.py
@@ -80,7 +80,7 @@ def update_state(
to each forward pass.

Args:
- batch_update is non-None iff there have been
- changes to the batch makeup.
+ batch_update: Non-None iff there have been changes
+     to the batch makeup.
"""
raise NotImplementedError
2 changes: 1 addition & 1 deletion vllm/v1/sample/rejection_sampler.py
@@ -68,7 +68,7 @@ def forward(
different requests are flattened into a single tensor because
this is the shape of the output logits.
NOTE: `target_logits` can be updated in place to save memory.
- bonus_token_ids_tensor (torch.Tensor):
+ bonus_token_ids (torch.Tensor):
A tensor containing bonus tokens. Shape is [batch_size, 1].
Bonus tokens are added to the end of the sequence if all
proposed tokens are accepted. We generate the bonus tokens
2 changes: 1 addition & 1 deletion vllm/v1/sample/tpu/sampler.py
@@ -89,7 +89,7 @@ def gather_logprobs(
Gather logprobs for topk and sampled/prompt token.

Args:
- logits: (num tokens) x (vocab) tensor
+ logprobs: (num tokens) x (vocab) tensor
num_logprobs: minimum number of logprobs to
retain per token
token_ids: prompt tokens (if prompt logprobs)
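A generic torch sketch of the gather described above: top-k logprobs plus the logprob of one chosen token per row. Tensor sizes are invented and this is not the TPU sampler's actual implementation.

import torch

logprobs = torch.log_softmax(torch.randn(4, 32), dim=-1)  # (num tokens) x (vocab)
num_logprobs = 3
topk_vals, topk_ids = logprobs.topk(num_logprobs, dim=-1)
# Logprob of one chosen (sampled or prompt) token per row.
token_ids = torch.randint(0, 32, (4, 1))
chosen = logprobs.gather(-1, token_ids)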
4 changes: 2 additions & 2 deletions vllm/v1/structured_output/backend_types.py
@@ -110,7 +110,7 @@ def compile_grammar(self, request_type: StructuredOutputOptions,

Args:
request_type (StructuredOutputOptions): The type of structured
output request.
output request.
grammar_spec (str): The grammar specification to compile.

Returns:
@@ -124,7 +124,7 @@ def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:

Args:
max_num_seqs (int): The maximum number of sequences for which
to allocate the bitmask.
to allocate the bitmask.
"""

@abstractmethod
3 changes: 0 additions & 3 deletions vllm/v1/worker/gpu_input_batch.py
@@ -525,9 +525,6 @@ def condense(self) -> None:
Any consecutive empty indices at the very end of the list are not
filled.

- Args:
-     empty_req_indices: empty indices which may be filled.
-
Returns:
swaps: list of (from,to) swap tuples for moved requests
empty_req_indices: indices not filled by condensation
2 changes: 1 addition & 1 deletion vllm/v1/worker/gpu_model_runner.py
@@ -2953,7 +2953,7 @@ def _reshape_kv_cache_tensors(
Args:
kv_cache_config: The KV cache config
kv_cache_raw_tensors: The KV cache buffer of each layer, with
correct size but uninitialized shape.
correct size but uninitialized shape.
Returns:
Dict[str, torch.Tensor]: A map between layer names to their
corresponding memory buffer for KV cache.
10 changes: 5 additions & 5 deletions vllm/v1/worker/tpu_model_runner.py
@@ -552,7 +552,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
return kv_cache_spec

def _get_slot_mapping_metadata(self, num_reqs,
- num_scheduled_tokens_per_req):
+ num_scheduled_tokens_per_req) -> np.ndarray:
"""
Computes metadata for mapping slots to blocks in the key-value (KV)
cache for a batch of requests.
@@ -565,15 +565,15 @@ def _get_slot_mapping_metadata(
Args:
num_reqs (int): Number of requests in the current batch.
num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens
to be scheduled for each request.
to be scheduled for each request.

Returns:
np.ndarray: A 2D array of shape (total_block_len, 3), where each row
contains:
contains:
- kv_cache_start_index (int): The starting index in the KV cache
for the corresponding slice.
for the corresponding slice.
- new_kv_start_index (int): The starting index in the new KV
cache for the corresponding slice.
cache for the corresponding slice.
- slice_len (int): The length of the slice.
"""
slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs]
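To illustrate the documented row format, a hypothetical metadata array (values invented):

import numpy as np

# One row per slice, columns in the documented order:
# (kv_cache_start_index, new_kv_start_index, slice_len).
slot_mapping_metadata = np.array([
    [128, 0, 16],  # 16 new KV entries land in the cache starting at index 128
    [256, 16, 8],  # the next 8 entries start at cache index 256
])
total_new_tokens = int(slot_mapping_metadata[:, 2].sum())  # 24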
8 changes: 4 additions & 4 deletions vllm/v1/worker/utils.py
@@ -172,10 +172,10 @@ def scatter_mm_placeholders(

Args:
embeds: The multimodal embeddings.
Shape: `(num_embeds, embed_dim)`
Shape: `(num_embeds, embed_dim)`
is_embed: A boolean mask indicating which positions in the placeholder
tokens need to be filled with multimodal embeddings.
Shape: `(num_placeholders, num_embeds)`
tokens need to be filled with multimodal embeddings.
Shape: `(num_placeholders, num_embeds)`
"""
if is_embed is None:
return embeds
@@ -278,7 +278,7 @@ def bind_kv_cache(
Args:
kv_caches: The allocated kv_caches with layer names as keys.
forward_context: The global forward context containing all Attention
layers with layer names as keys.
layers with layer names as keys.
runner_kv_caches: The kv_cache declared by ModelRunner.
"""
# Bind kv_caches to ModelRunner
4 changes: 2 additions & 2 deletions vllm/v1/worker/worker_base.py
@@ -36,8 +36,8 @@ def __init__(
local_rank: Local device index
rank: Global rank in distributed setup
distributed_init_method: Distributed initialization method
is_driver_worker: Whether this worker handles driver
responsibilities
is_driver_worker: Whether this worker handles driver
responsibilities
"""
# Configuration storage
super().__init__(vllm_config=vllm_config)