Skip to content

Commit ae2b26e

Browse files
committed
[Metrics][KVConnector] Add connector prefix cache hit rate stats
Signed-off-by: tovam <tovam@pliops.com>
1 parent 432e1cb commit ae2b26e

File tree

4 files changed

+95
-17
lines changed

4 files changed

+95
-17
lines changed

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343

4444
from vllm.logger import init_logger
4545
from vllm.v1.core.sched.output import SchedulerOutput
46+
from vllm.v1.metrics.stats import PrefixCacheStats
4647
from vllm.v1.outputs import KVConnectorOutput
4748

4849
if TYPE_CHECKING:
@@ -89,6 +90,8 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
8990
self._connector_metadata: Optional[KVConnectorMetadata] = None
9091
self._vllm_config = vllm_config
9192
self._role = role
93+
# FIXME: make prefix cache stats conditional on log_stats
94+
self.prefix_cache_stats = PrefixCacheStats()
9295

9396
@property
9497
def role(self) -> KVConnectorRole:
@@ -413,3 +416,27 @@ def build_kv_connector_stats(
413416
which can implement custom aggregation logic on the data dict.
414417
"""
415418
return None
419+
420+
def update_prefix_cache_stats(self, request_num_tokens: int,
                              num_external_tokens: int) -> None:
    """
    Record one request's contribution to the connector prefix cache stats.

    Args:
        request_num_tokens (int): the number of tokens in the request
            (accumulated as queried tokens).
        num_external_tokens (int): the number of tokens that will be
            loaded from the external KV cache (accumulated as hits).
    """
    stats = self.prefix_cache_stats
    stats.requests += 1
    stats.queries += request_num_tokens
    stats.hits += num_external_tokens
433+
434+
def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
    """Return the accumulated prefix cache stats and start a fresh window.

    Returns:
        The prefix caching stats gathered since the previous call
        (or since construction).
    """
    collected, self.prefix_cache_stats = (self.prefix_cache_stats,
                                          PrefixCacheStats())
    return collected

vllm/v1/core/sched/scheduler.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,9 @@ def schedule(self) -> SchedulerOutput:
495495
new_computed_blocks + new_blocks,
496496
num_external_computed_tokens,
497497
)
498+
if self.log_stats:
499+
self.connector.update_prefix_cache_stats(
500+
request.num_tokens, num_external_computed_tokens)
498501

499502
# Request was already popped from self.waiting
500503
# unless it was re-added above due to new_blocks being None.
@@ -1197,16 +1200,20 @@ def make_stats(
11971200
if not self.log_stats:
11981201
return None
11991202
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
1203+
connector_prefix_cache_stats = self.connector.make_prefix_cache_stats(
1204+
) if self.connector is not None else None
12001205
assert prefix_cache_stats is not None
1201-
return SchedulerStats(num_running_reqs=len(self.running),
1202-
num_waiting_reqs=len(self.waiting),
1203-
kv_cache_usage=self.kv_cache_manager.usage,
1204-
prefix_cache_stats=prefix_cache_stats,
1205-
spec_decoding_stats=spec_decoding_stats,
1206-
num_corrupted_reqs=sum(req.is_output_corrupted
1207-
for req in self.running),
1208-
kv_connector_stats=kv_connector_stats.data
1209-
if kv_connector_stats else None)
1206+
return SchedulerStats(
1207+
num_running_reqs=len(self.running),
1208+
num_waiting_reqs=len(self.waiting),
1209+
kv_cache_usage=self.kv_cache_manager.usage,
1210+
prefix_cache_stats=prefix_cache_stats,
1211+
connector_prefix_cache_stats=connector_prefix_cache_stats,
1212+
spec_decoding_stats=spec_decoding_stats,
1213+
num_corrupted_reqs=sum(req.is_output_corrupted
1214+
for req in self.running),
1215+
kv_connector_stats=kv_connector_stats.data
1216+
if kv_connector_stats else None)
12101217

12111218
def make_spec_decoding_stats(
12121219
self,

vllm/v1/metrics/loggers.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
6363
self.spec_decoding_logging = SpecDecodingLogging()
6464
kv_tranfer_config = self.vllm_config.kv_transfer_config
6565
self.kv_connector_logging = KVConnectorLogging(kv_tranfer_config)
66+
self.connector_prefix_caching_metrics = PrefixCachingMetrics(
67+
) if kv_tranfer_config else None
6668
self.last_prompt_throughput: float = 0.0
6769
self.last_generation_throughput: float = 0.0
6870

@@ -97,6 +99,11 @@ def record(self,
9799
self.prefix_caching_metrics.observe(
98100
scheduler_stats.prefix_cache_stats)
99101

102+
if (scheduler_stats.connector_prefix_cache_stats is not None
103+
and self.connector_prefix_caching_metrics is not None):
104+
self.connector_prefix_caching_metrics.observe(
105+
scheduler_stats.connector_prefix_cache_stats)
106+
100107
if scheduler_stats.spec_decoding_stats is not None:
101108
self.spec_decoding_logging.observe(
102109
scheduler_stats.spec_decoding_stats)
@@ -124,21 +131,30 @@ def log(self):
124131
self.last_prompt_throughput = prompt_throughput
125132

126133
# Format and print output.
127-
log_fn(
128-
"Engine %03d: "
129-
"Avg prompt throughput: %.1f tokens/s, "
130-
"Avg generation throughput: %.1f tokens/s, "
131-
"Running: %d reqs, Waiting: %d reqs, "
132-
"GPU KV cache usage: %.1f%%, "
133-
"Prefix cache hit rate: %.1f%%",
134+
log_msg = ("Engine %03d: "
135+
"Avg prompt throughput: %.1f tokens/s, "
136+
"Avg generation throughput: %.1f tokens/s, "
137+
"Running: %d reqs, Waiting: %d reqs, "
138+
"GPU KV cache usage: %.1f%%, "
139+
"Prefix cache hit rate: %.1f%%")
140+
141+
log_args = [
134142
self.engine_index,
135143
prompt_throughput,
136144
generation_throughput,
137145
scheduler_stats.num_running_reqs,
138146
scheduler_stats.num_waiting_reqs,
139147
scheduler_stats.kv_cache_usage * 100,
140148
self.prefix_caching_metrics.hit_rate * 100,
141-
)
149+
]
150+
151+
if self.connector_prefix_caching_metrics is not None:
152+
log_msg += ", KV connector prefix cache hit rate: %.1f%%"
153+
log_args.append(self.connector_prefix_caching_metrics.hit_rate *
154+
100)
155+
156+
log_fn(log_msg, *log_args)
157+
142158
self.spec_decoding_logging.log(log_fn=log_fn)
143159
self.kv_connector_logging.log(log_fn=log_fn)
144160

@@ -271,6 +287,25 @@ def __init__(self,
271287
self.counter_prefix_cache_hits = make_per_engine(
272288
counter_prefix_cache_hits, engine_indexes, model_name)
273289

290+
#
291+
# KV connector cache
292+
#
293+
counter_connector_prefix_cache_queries = self._counter_cls(
294+
name="vllm:connector_prefix_cache_queries",
295+
documentation=("KV connector prefix cache queries, "
296+
"in terms of number of queried tokens."),
297+
labelnames=labelnames)
298+
self.counter_connector_prefix_cache_queries = make_per_engine(
299+
counter_connector_prefix_cache_queries, engine_indexes, model_name)
300+
301+
counter_connector_prefix_cache_hits = self._counter_cls(
302+
name="vllm:connector_prefix_cache_hits",
303+
documentation=("KV connector prefix cache hits, "
304+
"in terms of number of cached tokens."),
305+
labelnames=labelnames)
306+
self.counter_connector_prefix_cache_hits = make_per_engine(
307+
counter_connector_prefix_cache_hits, engine_indexes, model_name)
308+
274309
#
275310
# Counters
276311
#
@@ -550,6 +585,12 @@ def record(self,
550585
self.counter_prefix_cache_hits[engine_idx].inc(
551586
scheduler_stats.prefix_cache_stats.hits)
552587

588+
if scheduler_stats.connector_prefix_cache_stats is not None:
589+
self.counter_connector_prefix_cache_queries[engine_idx].inc(
590+
scheduler_stats.connector_prefix_cache_stats.queries)
591+
self.counter_connector_prefix_cache_hits[engine_idx].inc(
592+
scheduler_stats.connector_prefix_cache_stats.hits)
593+
553594
if scheduler_stats.spec_decoding_stats is not None:
554595
self.spec_decoding_prom.observe(
555596
scheduler_stats.spec_decoding_stats, engine_idx)

vllm/v1/metrics/stats.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class SchedulerStats:
4848
prefix_cache_stats: PrefixCacheStats = field(
4949
default_factory=PrefixCacheStats)
5050

51+
connector_prefix_cache_stats: Optional[PrefixCacheStats] = field(
52+
default=None)
53+
5154
spec_decoding_stats: Optional[SpecDecodingStats] = None
5255
kv_connector_stats: Optional[dict[str, Any]] = None
5356

0 commit comments

Comments (0)