diff --git a/vllm/config.py b/vllm/config.py
index 25f841231ded..fa1ea4f089c8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1891,13 +1891,6 @@ def __post_init__(self):
                 "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                 f"installed. Original error:\n{otel_import_error_traceback}")
 
-        if ((self.collect_model_forward_time
-             or self.collect_model_execute_time)
-                and self.otlp_traces_endpoint is None):
-            raise ValueError(
-                "collect_model_forward_time or collect_model_execute_time "
-                "requires --otlp-traces-endpoint to be set.")
-
 
 @dataclass(frozen=True)
 class EngineConfig:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1dd0f097c74f..cea7833d7b55 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1643,6 +1643,9 @@ def _get_stats(self,
         # Request stats
         #   Latency
         time_e2e_requests: List[float] = []
+        time_in_queue_requests: List[float] = []
+        model_forward_time_requests: List[float] = []
+        model_execute_time_requests: List[float] = []
         #   Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1736,6 +1739,15 @@ def _get_stats(self,
 
                 # Latency timings
                 time_e2e_requests.append(now - seq_group.metrics.arrival_time)
+                if seq_group.metrics.time_in_queue is not None:
+                    time_in_queue_requests.append(
+                        seq_group.metrics.time_in_queue)
+                if seq_group.metrics.model_forward_time is not None:
+                    model_forward_time_requests.append(
+                        seq_group.metrics.model_forward_time)
+                if seq_group.metrics.model_execute_time is not None:
+                    model_execute_time_requests.append(
+                        seq_group.metrics.model_execute_time * 1000)
                 # Metadata
                 num_prompt_tokens_requests.append(
                     len(seq_group.prompt_token_ids))
@@ -1793,6 +1805,9 @@ def _get_stats(self,
             # Request stats
             #   Latency
             time_e2e_requests=time_e2e_requests,
+            time_in_queue_requests=time_in_queue_requests,
+            model_forward_time_requests=model_forward_time_requests,
+            model_execute_time_requests=model_execute_time_requests,
             #   Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index a46625eff1e4..0f5615ff14db 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -133,7 +133,31 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_time_in_queue_request = self._histogram_cls(
+            name="vllm:time_in_queue_requests",
+            documentation=
+            "Histogram of time the request spent in the queue in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_model_forward_time_request = self._histogram_cls(
+            name="vllm:model_forward_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model forward pass in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
+        self.histogram_model_execute_time_request = self._histogram_cls(
+            name="vllm:model_execute_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model execute function in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",
@@ -299,16 +323,12 @@ def _unregister_vllm_metrics(self) -> None:
         pass
 
 
-def build_1_2_5_buckets(max_value: int) -> List[int]:
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
     """
-    Builds a list of buckets with increasing powers of 10 multiplied by
-    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values until the value exceeds the specified maximum.
 
-    Example:
-    >>> build_1_2_5_buckets(100)
-    [1, 2, 5, 10, 20, 50, 100]
     """
-    mantissa_lst = [1, 2, 5]
     exponent = 0
     buckets: List[int] = []
     while True:
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
         exponent += 1
 
 
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
+
+
+def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_3_5_8_buckets(100)
+    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
+    """
+    return build_buckets([1, 2, 3, 5, 8], max_value)
+
+
 def local_interval_elapsed(now: float, last_log: float,
                            local_interval: float) -> bool:
     elapsed_time = now - last_log
@@ -486,6 +524,12 @@ def _log_prometheus(self, stats: Stats) -> None:
         #   Latency
         self._log_histogram(self.metrics.histogram_e2e_time_request,
                             stats.time_e2e_requests)
+        self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                            stats.time_in_queue_requests)
+        self._log_histogram(self.metrics.histogram_model_forward_time_request,
+                            stats.model_forward_time_requests)
+        self._log_histogram(self.metrics.histogram_model_execute_time_request,
+                            stats.model_execute_time_requests)
         #   Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index e9a5bd3b586b..510dd04bb3e5 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -46,6 +46,9 @@ class Stats:
     # Request stats (should have _requests suffix)
     #   Latency
     time_e2e_requests: List[float]
+    time_in_queue_requests: List[float]
+    model_forward_time_requests: List[float]
+    model_execute_time_requests: List[float]
     #   Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
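
Note (not part of the patch): a minimal, self-contained sketch of the refactored bucket helpers above, runnable on its own to sanity-check the new histogram boundaries. The inner loop of build_buckets is not shown in the hunk, so its body here is an assumption reconstructed from the docstring examples; the expected outputs in the assertions come straight from those examples.

# Standalone sketch; the build_buckets loop body is assumed, not copied
# verbatim from the repository.
from typing import List


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    """Multiply each mantissa by increasing powers of 10, collecting every
    value that does not exceed max_value."""
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 5], max_value)


def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 3, 5, 8], max_value)


if __name__ == "__main__":
    # Matches the docstring examples in the diff.
    assert build_1_2_5_buckets(100) == [1, 2, 5, 10, 20, 50, 100]
    assert build_1_2_3_5_8_buckets(100) == [
        1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100
    ]
    # Buckets used for the new forward/execute time histograms (ms, max 3000):
    # [1, 2, 3, 5, 8, 10, ..., 800, 1000, 2000, 3000]
    print(build_1_2_3_5_8_buckets(3000))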