@@ -4769,12 +4769,23 @@ def __post_init__(self):
47694769 # Hybrid KV cache manager is not compatible with KV events.
47704770 self .scheduler_config .disable_hybrid_kv_cache_manager = True
47714771 if self .model_config is not None and \
4772- self .model_config .attention_chunk_size is not None and \
4773- self .speculative_config is not None and \
4774- self .speculative_config .use_eagle ():
4775- # Hybrid KV cache manager is not yet supported with chunked
4776- # local attention + eagle.
4777- self .scheduler_config .disable_hybrid_kv_cache_manager = True
4772+ self .model_config .attention_chunk_size is not None :
4773+ if self .speculative_config is not None and \
4774+ self .speculative_config .use_eagle ():
4775+ # Hybrid KV cache manager is not yet supported with chunked
4776+ # local attention + eagle.
4777+ self .scheduler_config .disable_hybrid_kv_cache_manager = True
4778+ elif \
4779+ not envs .VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE :
4780+ logger .warning (
4781+ "There is a latency regression when using chunked local"
4782+ " attention with the hybrid KV cache manager. Disabling"
4783+ " it, by default. To enable it, set the environment "
4784+ "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
4785+ )
4786+ # Disable the hybrid KV cache manager by default: it has a latency
4787+ # regression with chunked local attention unless explicitly allowed.
4788+ self .scheduler_config .disable_hybrid_kv_cache_manager = True
47784789
47794790 def update_sizes_for_sequence_parallelism (self ,
47804791 possible_sizes : list ) -> list :
0 commit comments