[KVConnector][Feature] Support KV connector cache reset via /reset_prefix_cache

ptovam · ptovam · commit f94cb1500b31 · 2025-10-30T09:29:17.000+02:00
Signed-off-by: tovam <tovam@pliops.com>
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -491,3 +491,12 @@ def build_prom_metrics(
         expose connector transfer stats via Prometheus.
         """
         return None
+
+    def reset_cache(self) -> bool:
+        """
+        Reset the connector's internal cache.
+
+        Returns:
+            bool: True if the cache was successfully reset, False otherwise.
+        """
+        return False
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -438,3 +438,18 @@ def build_prom_metrics(
             per_engine_labelvalues,
             prom_metrics,
         )
+
+    def reset_cache(self) -> bool:
+        """
+        Reset the cache of every connector in ``self._connectors``.
+
+        Returns:
+            bool: True if at least one connector reported a successful
+            reset, False otherwise (including when there are no
+            connectors).
+        """
+        # Build the full list before aggregating: passing a generator to
+        # any() would short-circuit on the first True and leave the
+        # remaining connectors' caches untouched.
+        results = [connector.reset_cache() for connector in self._connectors]
+        return any(results)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
@@ -124,8 +124,10 @@ async def reset_mm_cache(self) -> None:
         ...
 
     @abstractmethod
-    async def reset_prefix_cache(self, device: Device | None = None) -> None:
-        """Reset the prefix cache"""
+    async def reset_prefix_cache(
+        self, device: Device | None = None, reset_connector: bool = False
+    ) -> None:
+        """Reset the prefix cache and optionally any configured connector cache"""
         ...
 
     @abstractmethod
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
@@ -1491,8 +1491,12 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
-    def reset_prefix_cache(self, device: Device | None = None) -> None:
-        self.llm_engine.reset_prefix_cache(device)
+    def reset_prefix_cache(
+        self,
+        device: Device | None = None,
+        reset_connector: bool = False,
+    ) -> None:
+        self.llm_engine.reset_prefix_cache(device, reset_connector)
 
     def sleep(self, level: int = 1):
         """
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
@@ -991,15 +991,33 @@ async def show_server_info(
     @router.post("/reset_prefix_cache")
     async def reset_prefix_cache(raw_request: Request):
         """
-        Reset the prefix cache. Note that we currently do not check if the
-        prefix cache is successfully reset in the API server.
+        Reset the local prefix cache.
+
+        Optionally, if the query parameter `reset_external=true` is given,
+        also reset the external (connector-managed) prefix cache.
+
+        Note that we currently do not check if the prefix cache
+        is successfully reset in the API server.
+
+        Example:
+            POST /reset_prefix_cache?device=gpu&reset_external=true
         """
         device = None
         device_str = raw_request.query_params.get("device")
         if device_str is not None:
             device = Device[device_str.upper()]
-        logger.info("Resetting prefix cache with specific %s...", str(device))
-        await engine_client(raw_request).reset_prefix_cache(device)
+
+        reset_connector = (
+            raw_request.query_params.get("reset_external", "false").lower() == "true"
+        )
+
+        logger.info(
+            "Resetting prefix cache (device=%s, reset_external_cache=%s)",
+            str(device),
+            reset_connector,
+        )
+
+        await engine_client(raw_request).reset_prefix_cache(device, reset_connector)
         return Response(status_code=200)
 
     @router.post("/reset_mm_cache")
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
@@ -128,7 +128,7 @@ def has_requests(self) -> bool:
         return self.has_unfinished_requests() or self.has_finished_requests()
 
     @abstractmethod
-    def reset_prefix_cache(self) -> bool:
+    def reset_prefix_cache(self, reset_connector: bool = False) -> bool:
         """Reset the prefix cache for KV cache.
 
         This is particularly required when the model weights are live-updated.
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -1240,8 +1240,17 @@ def get_num_unfinished_requests(self) -> int:
     def has_finished_requests(self) -> bool:
         return len(self.finished_req_ids) > 0
 
-    def reset_prefix_cache(self) -> bool:
-        return self.kv_cache_manager.reset_prefix_cache()
+    def reset_prefix_cache(self, reset_connector: bool = False) -> bool:
+        reset_success = self.kv_cache_manager.reset_prefix_cache()
+        if reset_connector:
+            reset_success = reset_success and self.reset_connector_cache()
+        return reset_success
+
+    def reset_connector_cache(self) -> bool:
+        if self.connector is None:
+            logger.warning("reset_connector called but no KV connector configured.")
+            return False
+        return self.connector.reset_cache()
 
     def make_stats(
         self,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
@@ -680,10 +680,14 @@ async def reset_mm_cache(self) -> None:
         self.processor.clear_mm_cache()
         await self.engine_core.reset_mm_cache_async()
 
-    async def reset_prefix_cache(self, device: Device | None = None) -> None:
+    async def reset_prefix_cache(
+        self,
+        device: Device | None = None,
+        reset_connector: bool = False,
+    ) -> None:
         if device == Device.CPU:
             raise ValueError("Not supported on CPU.")
-        await self.engine_core.reset_prefix_cache_async()
+        await self.engine_core.reset_prefix_cache_async(reset_connector)
 
     async def sleep(self, level: int = 1) -> None:
         await self.reset_prefix_cache()
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -411,8 +411,8 @@ def reset_mm_cache(self):
 
         self.model_executor.reset_mm_cache()
 
-    def reset_prefix_cache(self):
-        self.scheduler.reset_prefix_cache()
+    def reset_prefix_cache(self, reset_connector: bool = False):
+        self.scheduler.reset_prefix_cache(reset_connector)
 
     def sleep(self, level: int = 1):
         self.model_executor.sleep(level)
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
@@ -138,7 +138,7 @@ def profile(self, is_start: bool = True) -> None:
     def reset_mm_cache(self) -> None:
         raise NotImplementedError
 
-    def reset_prefix_cache(self) -> None:
+    def reset_prefix_cache(self, reset_connector: bool = False) -> None:
         raise NotImplementedError
 
     def sleep(self, level: int = 1) -> None:
@@ -208,7 +208,7 @@ async def profile_async(self, is_start: bool = True) -> None:
     async def reset_mm_cache_async(self) -> None:
         raise NotImplementedError
 
-    async def reset_prefix_cache_async(self) -> None:
+    async def reset_prefix_cache_async(self, reset_connector: bool = False) -> None:
         raise NotImplementedError
 
     async def sleep_async(self, level: int = 1) -> None:
@@ -287,8 +287,8 @@ def profile(self, is_start: bool = True) -> None:
     def reset_mm_cache(self) -> None:
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self) -> None:
-        self.engine_core.reset_prefix_cache()
+    def reset_prefix_cache(self, reset_connector: bool = False) -> None:
+        self.engine_core.reset_prefix_cache(reset_connector)
 
     def sleep(self, level: int = 1) -> None:
         self.engine_core.sleep(level)
@@ -750,8 +750,8 @@ def profile(self, is_start: bool = True) -> None:
     def reset_mm_cache(self) -> None:
         self.call_utility("reset_mm_cache")
 
-    def reset_prefix_cache(self) -> None:
-        self.call_utility("reset_prefix_cache")
+    def reset_prefix_cache(self, reset_connector: bool = False) -> None:
+        self.call_utility("reset_prefix_cache", reset_connector)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.call_utility("add_lora", lora_request)
@@ -954,8 +954,8 @@ async def profile_async(self, is_start: bool = True) -> None:
     async def reset_mm_cache_async(self) -> None:
         await self.call_utility_async("reset_mm_cache")
 
-    async def reset_prefix_cache_async(self) -> None:
-        await self.call_utility_async("reset_prefix_cache")
+    async def reset_prefix_cache_async(self, reset_connector: bool = False) -> None:
+        await self.call_utility_async("reset_prefix_cache", reset_connector)
 
     async def sleep_async(self, level: int = 1) -> None:
         await self.call_utility_async("sleep", level)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
@@ -326,8 +326,10 @@ def reset_mm_cache(self):
         self.processor.clear_mm_cache()
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self, device: Device | None = None):
-        self.engine_core.reset_prefix_cache()
+    def reset_prefix_cache(
+        self, device: Device | None = None, reset_connector: bool = False
+    ):
+        self.engine_core.reset_prefix_cache(reset_connector=reset_connector)
 
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)

Original file line number	Diff line number	Diff line change
`@@ -438,3 +438,6 @@ def build_prom_metrics(`
`438`	`438`	`per_engine_labelvalues,`
`439`	`439`	`prom_metrics,`
`440`	`440`	`)`
	`441`	`+`
	`442`	`+ def reset_cache(self) -> bool:`
	`443`	`+ return any(connector.reset_cache() for connector in self._connectors)`