
Commit 0ecdd98

Add comments on accessing kv_cache and attn_metadata (#13887)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

1 parent: 7b700ec


vllm/attention/layer.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -47,6 +47,10 @@ def __init__(
         attn_type: str = AttentionType.DECODER,
         **extra_impl_args,
     ) -> None:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+        """
         super().__init__()
         if per_layer_sliding_window is not None:
             # per-layer sliding window
@@ -155,6 +159,15 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+
+        Attention metadata (`attn_metadata`) is set using a context manager in
+        the model runner's `execute_model` method. It is accessed via forward
+        context using
+        `vllm.forward_context.get_forward_context().attn_metadata`.
+        """
         if self.calculate_kv_scales:
             attn_metadata = get_forward_context().attn_metadata
             if attn_metadata.enable_kv_scales_calculation:
```
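
For illustration only (not part of the commit), here is a minimal sketch of the two access paths the new docstrings describe. The helper name is hypothetical; it assumes `attn_layer` is an instance of the `Attention` class patched above, and that the model runner's `execute_model` method has entered its forward-context manager (outside of it, `attn_metadata` may not be populated).

```python
from vllm.forward_context import get_forward_context


def inspect_attention_state(attn_layer):
    """Hypothetical helper: read back the two values the new docstrings
    describe, given an `Attention` layer instance."""
    # The KV cache is stored on the Attention module itself.
    kv_cache = attn_layer.kv_cache

    # Attention metadata is not passed in as an argument; it is set by a
    # context manager in the model runner's `execute_model` method and is
    # read back through the forward context, exactly as the diff's own
    # `calculate_kv_scales` branch does.
    attn_metadata = get_forward_context().attn_metadata

    return kv_cache, attn_metadata
```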
