@@ -72,9 +72,6 @@ def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
7272 vllm_config .parallel_config )
7373 self .headdim = model_config .get_head_size ()
7474
75- self .attention_chunk_size = getattr (vllm_config .scheduler_config ,
76- 'attention_chunk_size' , None )
77-
7875 def build_for_cudagraph_capture (
7976 self , common_attn_metadata : CommonAttentionMetadata
8077 ) -> TritonAttentionMetadata :
@@ -208,7 +205,6 @@ def __init__(
208205 logits_soft_cap : Optional [float ] = None ,
209206 attn_type : AttentionType = AttentionType .DECODER ,
210207 kv_sharing_target_layer_name : Optional [int ] = None ,
211- use_irope : bool = False ,
212208 ) -> None :
213209 self .num_heads = num_heads
214210 self .head_size = head_size
@@ -228,8 +224,6 @@ def __init__(
228224 self .logits_soft_cap = logits_soft_cap
229225 self .kv_sharing_target_layer_name = kv_sharing_target_layer_name
230226
231- self .use_irope = use_irope
232-
233227 self .num_queries_per_kv = self .num_heads // self .num_kv_heads
234228
235229 TritonAttentionBackend .validate_head_size (head_size )
0 commit comments