 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
                                            SchedulerOutput)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
+from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
+                            EngineCoreOutput, EngineCoreOutputs)
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
@@ -27,10 +28,12 @@ def __init__(
         model_config: ModelConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
+        log_stats: bool,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
         self.lora_config = lora_config
+        self.log_stats = log_stats

         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -46,7 +49,8 @@ def __init__(
             num_gpu_blocks=num_gpu_blocks,
             max_model_len=self.max_model_len,
             sliding_window=self.cache_config.sliding_window,
-            enable_caching=self.cache_config.enable_prefix_caching)
+            enable_caching=self.cache_config.enable_prefix_caching,
+            log_stats=self.log_stats)
         self.block_size = self.cache_config.block_size

         # req_id -> Request
@@ -249,7 +253,7 @@ def schedule(self) -> "SchedulerOutput":
             self.running.append(request)
             if request.status == RequestStatus.WAITING:
                 scheduled_new_reqs.append(request)
-                request.scheduled(scheduled_timestamp)
+                self.request_scheduled(request, scheduled_timestamp)
             elif request.status == RequestStatus.PREEMPTED:
                 scheduled_resumed_reqs.append(request)
             else:
@@ -546,7 +550,7 @@ def _check_stop(self, request: Request) -> bool:
     def add_request(self, request: Request) -> None:
         self.waiting.append(request)
         self.requests[request.request_id] = request
-        request.queued()
+        self.request_queued(request)

     def finish_requests(
         self,
@@ -594,7 +598,22 @@ def has_unfinished_requests(self) -> bool:
     def reset_prefix_cache(self) -> bool:
         return self.kv_cache_manager.reset_prefix_cache()

-    def make_stats(self) -> SchedulerStats:
+    def request_queued(self, request: Request):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED))
+
+    def request_scheduled(self, request: Request, timestamp: float):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED,
+                                      timestamp))
+
+    def make_stats(self) -> Optional[SchedulerStats]:
+        if not self.log_stats:
+            return None
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
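
Note: the pattern introduced in this diff gates all per-request event bookkeeping and stats construction behind a single log_stats flag, so the stats path costs nothing when disabled. A minimal standalone sketch of that gating is below; the Event, Req, and MiniScheduler names are simplified stand-ins for illustration, not the actual vLLM types.

    import time
    from dataclasses import dataclass, field
    from typing import List


    @dataclass
    class Event:
        # Stand-in for EngineCoreEvent: just a type tag and a timestamp.
        event_type: str
        timestamp: float


    @dataclass
    class Req:
        # Stand-in for Request: only the event list matters here.
        events: List[Event] = field(default_factory=list)


    class MiniScheduler:
        def __init__(self, log_stats: bool) -> None:
            self.log_stats = log_stats

        def request_queued(self, request: Req) -> None:
            # No-op when stats logging is disabled, mirroring the diff above.
            if not self.log_stats:
                return
            request.events.append(Event("QUEUED", time.monotonic()))

        def request_scheduled(self, request: Req, timestamp: float) -> None:
            if not self.log_stats:
                return
            request.events.append(Event("SCHEDULED", timestamp))


    req = Req()
    MiniScheduler(log_stats=True).request_scheduled(req, time.monotonic())
    assert len(req.events) == 1
    MiniScheduler(log_stats=False).request_queued(req)
    assert len(req.events) == 1  # nothing recorded when log_stats is False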