Refactor prefix cache stats tracking

ptovam · ptovam · commit 8e8ea29a78cb · 2025-10-16T17:19:57.000+03:00
Signed-off-by: tovam <tovam@pliops.com>
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -44,7 +44,6 @@
 
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
-from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.outputs import KVConnectorOutput
 
 if TYPE_CHECKING:
@@ -101,8 +100,6 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
         else:
             raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1")
         self._role = role
-        # FIXME: make prefix cache stats conditional on log_stats
-        self.prefix_cache_stats = PrefixCacheStats()
 
     @property
     def role(self) -> KVConnectorRole:
@@ -434,28 +431,3 @@ def build_kv_connector_stats(
         which can implement custom aggregation logic on the data dict.
         """
         return None
-
-    def update_prefix_cache_stats(
-        self, request_num_tokens: int, num_external_tokens: int
-    ) -> None:
-        """
-        Update prefix cache statistics for a request.
-
-        Args:
-            request_num_tokens (int): the number of tokens in the request.
-            num_external_tokens (int): the number of tokens that will be
-                loaded from the external KV cache.
-        """
-        self.prefix_cache_stats.requests += 1
-        self.prefix_cache_stats.queries += request_num_tokens
-        self.prefix_cache_stats.hits += num_external_tokens
-
-    def make_prefix_cache_stats(self) -> PrefixCacheStats | None:
-        """Get (and reset) the prefix cache stats.
-
-        Returns:
-            The current prefix caching stats.
-        """
-        stats = self.prefix_cache_stats
-        self.prefix_cache_stats = PrefixCacheStats()
-        return stats
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -28,7 +28,7 @@
 from vllm.v1.core.sched.utils import check_stop, remove_all
 from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats
 from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -84,6 +84,7 @@ def __init__(
         # will have a corresponding KVConnector with Role=WORKER.
         # KV Connector pushes/pull of remote KVs for P/D and offloading.
         self.connector = None
+        self.connector_prefix_cache_stats: PrefixCacheStats | None = None
         if self.vllm_config.kv_transfer_config is not None:
             assert len(self.kv_cache_config.kv_cache_groups) == 1, (
                 "Multiple KV cache groups are not currently supported "
@@ -95,6 +96,8 @@ def __init__(
             self.connector = KVConnectorFactory.create_connector(
                 config=self.vllm_config, role=KVConnectorRole.SCHEDULER
             )
+            if self.log_stats:
+                self.connector_prefix_cache_stats = PrefixCacheStats()
 
         self.kv_event_publisher = EventPublisherFactory.create(
             self.kv_events_config,
@@ -525,10 +528,9 @@ def schedule(self) -> SchedulerOutput:
                         new_computed_blocks + new_blocks,
                         num_external_computed_tokens,
                     )
-                    if self.log_stats:
-                        self.connector.update_prefix_cache_stats(
-                            request.num_tokens, num_external_computed_tokens
-                        )
+                    self._update_connector_prefix_cache_stats(
+                        request.num_tokens, num_external_computed_tokens
+                    )
 
                 # Request was already popped from self.waiting
                 # unless it was re-added above due to new_blocks being None.
@@ -1249,11 +1251,7 @@ def make_stats(
         if not self.log_stats:
             return None
         prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
-        connector_prefix_cache_stats = (
-            self.connector.make_prefix_cache_stats()
-            if self.connector is not None
-            else None
-        )
+        connector_prefix_cache_stats = self._make_connector_prefix_cache_stats()
         assert prefix_cache_stats is not None
         return SchedulerStats(
             num_running_reqs=len(self.running),
@@ -1291,6 +1289,22 @@ def shutdown(self) -> None:
     # KV Connector Related Methods
     ########################################################################
 
+    def _update_connector_prefix_cache_stats(
+        self, request_num_tokens: int, num_external_tokens: int
+    ) -> None:
+        if self.connector_prefix_cache_stats is None:
+            return
+        self.connector_prefix_cache_stats.requests += 1
+        self.connector_prefix_cache_stats.queries += request_num_tokens
+        self.connector_prefix_cache_stats.hits += num_external_tokens
+
+    def _make_connector_prefix_cache_stats(self) -> PrefixCacheStats | None:
+        if self.connector_prefix_cache_stats is None:
+            return None
+        stats = self.connector_prefix_cache_stats
+        self.connector_prefix_cache_stats = PrefixCacheStats()
+        return stats
+
     def get_kv_connector(self) -> KVConnectorBase_V1 | None:
         return self.connector