
Commit 0ecdd98

Add comments on accessing kv_cache and attn_metadata (#13887)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

1 parent: 7b700ec


vllm/attention/layer.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -47,6 +47,10 @@ def __init__(
         attn_type: str = AttentionType.DECODER,
         **extra_impl_args,
     ) -> None:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+        """
         super().__init__()
         if per_layer_sliding_window is not None:
             # per-layer sliding window
@@ -155,6 +159,15 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+
+        Attention metadata (`attn_metadata`) is set using a context manager in
+        the model runner's `execute_model` method. It is accessed via forward
+        context using
+        `vllm.forward_context.get_forward_context().attn_metadata`.
+        """
         if self.calculate_kv_scales:
             attn_metadata = get_forward_context().attn_metadata
             if attn_metadata.enable_kv_scales_calculation:
```
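
For illustration only (not part of the commit), here is a minimal sketch of the two access paths the new docstrings describe. The helper name is hypothetical; it assumes `attn_layer` is an instance of the `Attention` class patched above, and that the model runner's `execute_model` method has entered its forward-context manager (outside of it, `attn_metadata` may not be populated).

```python
from vllm.forward_context import get_forward_context


def inspect_attention_state(attn_layer):
    """Hypothetical helper: read back the two values the new docstrings
    describe, given an `Attention` layer instance."""
    # The KV cache is stored on the Attention module itself.
    kv_cache = attn_layer.kv_cache

    # Attention metadata is not passed in as an argument; it is set by a
    # context manager in the model runner's `execute_model` method and is
    # read back through the forward context, exactly as the diff's own
    # `calculate_kv_scales` branch does.
    attn_metadata = get_forward_context().attn_metadata

    return kv_cache, attn_metadata
```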
