File tree Expand file tree Collapse file tree 1 file changed +13
-0
lines changed Expand file tree Collapse file tree 1 file changed +13
-0
lines changed Original file line number Diff line number Diff line change @@ -47,6 +47,10 @@ def __init__(
4747 attn_type : str = AttentionType .DECODER ,
4848 ** extra_impl_args ,
4949 ) -> None :
50+ """
51+ The KV cache is stored inside this class and is accessed via
52+ `self.kv_cache`.
53+ """
5054 super ().__init__ ()
5155 if per_layer_sliding_window is not None :
5256 # per-layer sliding window
@@ -155,6 +159,15 @@ def forward(
155159 key : torch .Tensor ,
156160 value : torch .Tensor ,
157161 ) -> torch .Tensor :
162+ """
163+ The KV cache is stored inside this class and is accessed via
164+ `self.kv_cache`.
165+
166+ Attention metadata (`attn_metadata`) is set using a context manager in
167+ the model runner's `execute_model` method. It is accessed via the
168+ forward context using
169+ `vllm.forward_context.get_forward_context().attn_metadata`.
170+ """
158171 if self .calculate_kv_scales :
159172 attn_metadata = get_forward_context ().attn_metadata
160173 if attn_metadata .enable_kv_scales_calculation :
You can’t perform that action at this time.
0 commit comments