@@ -63,6 +63,8 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
6363 self .spec_decoding_logging = SpecDecodingLogging ()
6464 kv_tranfer_config = self .vllm_config .kv_transfer_config
6565 self .kv_connector_logging = KVConnectorLogging (kv_tranfer_config )
66+ self .connector_prefix_caching_metrics = PrefixCachingMetrics (
67+ ) if kv_tranfer_config else None
6668 self .last_prompt_throughput : float = 0.0
6769 self .last_generation_throughput : float = 0.0
6870
@@ -97,6 +99,11 @@ def record(self,
9799 self .prefix_caching_metrics .observe (
98100 scheduler_stats .prefix_cache_stats )
99101
102+ if (scheduler_stats .connector_prefix_cache_stats is not None
103+ and self .connector_prefix_caching_metrics is not None ):
104+ self .connector_prefix_caching_metrics .observe (
105+ scheduler_stats .connector_prefix_cache_stats )
106+
100107 if scheduler_stats .spec_decoding_stats is not None :
101108 self .spec_decoding_logging .observe (
102109 scheduler_stats .spec_decoding_stats )
@@ -124,21 +131,30 @@ def log(self):
124131 self .last_prompt_throughput = prompt_throughput
125132
126133 # Format and print output.
127- log_fn (
128- "Engine %03d: "
129- "Avg prompt throughput: %.1f tokens/s, "
130- "Avg generation throughput: %.1f tokens/s, "
131- "Running: %d reqs, Waiting: %d reqs, "
132- "GPU KV cache usage: %.1f%%, "
133- "Prefix cache hit rate: %.1f%%" ,
134+ log_msg = ("Engine %03d: "
135+ "Avg prompt throughput: %.1f tokens/s, "
136+ "Avg generation throughput: %.1f tokens/s, "
137+ "Running: %d reqs, Waiting: %d reqs, "
138+ "GPU KV cache usage: %.1f%%, "
139+ "Prefix cache hit rate: %.1f%%" )
140+
141+ log_args = [
134142 self .engine_index ,
135143 prompt_throughput ,
136144 generation_throughput ,
137145 scheduler_stats .num_running_reqs ,
138146 scheduler_stats .num_waiting_reqs ,
139147 scheduler_stats .kv_cache_usage * 100 ,
140148 self .prefix_caching_metrics .hit_rate * 100 ,
141- )
149+ ]
150+
151+ if self .connector_prefix_caching_metrics is not None :
152+ log_msg += ", KV connector prefix cache hit rate: %.1f%%"
153+ log_args .append (self .connector_prefix_caching_metrics .hit_rate *
154+ 100 )
155+
156+ log_fn (log_msg , * log_args )
157+
142158 self .spec_decoding_logging .log (log_fn = log_fn )
143159 self .kv_connector_logging .log (log_fn = log_fn )
144160
@@ -271,6 +287,25 @@ def __init__(self,
271287 self .counter_prefix_cache_hits = make_per_engine (
272288 counter_prefix_cache_hits , engine_indexes , model_name )
273289
290+ #
291+ # KV connector cache
292+ #
293+ counter_connector_prefix_cache_queries = self ._counter_cls (
294+ name = "vllm:connector_prefix_cache_queries" ,
295+ documentation = ("KV connector prefix cache queries, "
296+ "in terms of number of queried tokens." ),
297+ labelnames = labelnames )
298+ self .counter_connector_prefix_cache_queries = make_per_engine (
299+ counter_connector_prefix_cache_queries , engine_indexes , model_name )
300+
301+ counter_connector_prefix_cache_hits = self ._counter_cls (
302+ name = "vllm:connector_prefix_cache_hits" ,
303+ documentation = ("KV connector prefix cache hits, "
304+ "in terms of number of cached tokens." ),
305+ labelnames = labelnames )
306+ self .counter_connector_prefix_cache_hits = make_per_engine (
307+ counter_connector_prefix_cache_hits , engine_indexes , model_name )
308+
274309 #
275310 # Counters
276311 #
@@ -550,6 +585,12 @@ def record(self,
550585 self .counter_prefix_cache_hits [engine_idx ].inc (
551586 scheduler_stats .prefix_cache_stats .hits )
552587
588+ if scheduler_stats .connector_prefix_cache_stats is not None :
589+ self .counter_connector_prefix_cache_queries [engine_idx ].inc (
590+ scheduler_stats .connector_prefix_cache_stats .queries )
591+ self .counter_connector_prefix_cache_hits [engine_idx ].inc (
592+ scheduler_stats .connector_prefix_cache_stats .hits )
593+
553594 if scheduler_stats .spec_decoding_stats is not None :
554595 self .spec_decoding_prom .observe (
555596 scheduler_stats .spec_decoding_stats , engine_idx )
0 commit comments