7 changes: 0 additions & 7 deletions vllm/config.py
@@ -1891,13 +1891,6 @@ def __post_init__(self):
                 "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                 f"installed. Original error:\n{otel_import_error_traceback}")
 
-        if ((self.collect_model_forward_time
-             or self.collect_model_execute_time)
-                and self.otlp_traces_endpoint is None):
-            raise ValueError(
-                "collect_model_forward_time or collect_model_execute_time "
-                "requires --otlp-traces-endpoint to be set.")
-
 
 @dataclass(frozen=True)
 class EngineConfig:
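Note on the deleted check: with the validation removed, the forward/execute timing flags no longer require an OTLP endpoint, because the timings can now be exported through the Prometheus histograms added below. A minimal sketch of the new behaviour, assuming the `__post_init__` above belongs to vLLM's `ObservabilityConfig` dataclass (the class name is not visible in this hunk, so treat it as an assumption):

```python
from vllm.config import ObservabilityConfig

# Before this change, setting either collect flag without an OTLP endpoint
# raised ValueError("... requires --otlp-traces-endpoint to be set.").
# After the change this configuration is accepted, and the collected timings
# can feed the new Prometheus histograms instead of (or in addition to) traces.
config = ObservabilityConfig(collect_model_forward_time=True,
                             collect_model_execute_time=True,
                             otlp_traces_endpoint=None)
```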
15 changes: 15 additions & 0 deletions vllm/engine/llm_engine.py
@@ -1643,6 +1643,9 @@ def _get_stats(self,
         # Request stats
         # Latency
         time_e2e_requests: List[float] = []
+        time_in_queue_requests: List[float] = []
+        model_forward_time_requests: List[float] = []
+        model_execute_time_requests: List[float] = []
         # Metadata
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
@@ -1736,6 +1739,15 @@ def _get_stats(self,
                     # Latency timings
                     time_e2e_requests.append(now -
                                              seq_group.metrics.arrival_time)
+                    if seq_group.metrics.time_in_queue is not None:
+                        time_in_queue_requests.append(
+                            seq_group.metrics.time_in_queue)
+                    if seq_group.metrics.model_forward_time is not None:
+                        model_forward_time_requests.append(
+                            seq_group.metrics.model_forward_time)
+                    if seq_group.metrics.model_execute_time is not None:
+                        model_execute_time_requests.append(
+                            seq_group.metrics.model_execute_time * 1000)
                     # Metadata
                     num_prompt_tokens_requests.append(
                         len(seq_group.prompt_token_ids))
@@ -1793,6 +1805,9 @@ def _get_stats(self,
             # Request stats
             # Latency
             time_e2e_requests=time_e2e_requests,
+            time_in_queue_requests=time_in_queue_requests,
+            model_forward_time_requests=model_forward_time_requests,
+            model_execute_time_requests=model_execute_time_requests,
             # Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
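The collection logic above only records requests whose optional timing fields are populated, and `model_execute_time` is multiplied by 1000, which suggests it is tracked in seconds while the matching histogram below is in milliseconds (and that `model_forward_time` is already in milliseconds). A self-contained sketch of the same filtering and conversion, with a hypothetical `RequestMetrics` standing in for `seq_group.metrics`; the field units are assumptions inferred from the conversion:

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class RequestMetrics:                            # stand-in for seq_group.metrics
    time_in_queue: Optional[float] = None        # seconds
    model_forward_time: Optional[float] = None   # assumed already in ms
    model_execute_time: Optional[float] = None   # assumed seconds, hence * 1000

finished = [RequestMetrics(0.12, 35.0, 0.048), RequestMetrics()]  # second one has no timings

time_in_queue_requests: List[float] = []
model_forward_time_requests: List[float] = []
model_execute_time_requests: List[float] = []

for metrics in finished:
    if metrics.time_in_queue is not None:
        time_in_queue_requests.append(metrics.time_in_queue)
    if metrics.model_forward_time is not None:
        model_forward_time_requests.append(metrics.model_forward_time)
    if metrics.model_execute_time is not None:
        # seconds -> milliseconds, to match vllm:model_execute_time_milliseconds
        model_execute_time_requests.append(metrics.model_execute_time * 1000)

print(time_in_queue_requests, model_forward_time_requests,
      model_execute_time_requests)  # [0.12] [35.0] [48.0]
```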
60 changes: 52 additions & 8 deletions vllm/engine/metrics.py
@@ -133,7 +133,31 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             name="vllm:e2e_request_latency_seconds",
             documentation="Histogram of end to end request latency in seconds.",
             labelnames=labelnames,
-            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_time_in_queue_request = self._histogram_cls(
+            name="vllm:time_in_queue_requests",
+            documentation=
+            "Histogram of time the request spent in the queue in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+                40.0, 50.0, 60.0
+            ])
+        self.histogram_model_forward_time_request = self._histogram_cls(
+            name="vllm:model_forward_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model forward pass in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
+        self.histogram_model_execute_time_request = self._histogram_cls(
+            name="vllm:model_execute_time_milliseconds",
+            documentation=
+            "Histogram of time spent in the model execute function in ms.",
+            labelnames=labelnames,
+            buckets=build_1_2_3_5_8_buckets(3000))
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
             name="vllm:request_prompt_tokens",
@@ -299,16 +323,12 @@ def _unregister_vllm_metrics(self) -> None:
         pass
 
 
-def build_1_2_5_buckets(max_value: int) -> List[int]:
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
     """
-    Builds a list of buckets with increasing powers of 10 multiplied by
-    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values until the value exceeds the specified maximum.
 
-    Example:
-    >>> build_1_2_5_buckets(100)
-    [1, 2, 5, 10, 20, 50, 100]
     """
-    mantissa_lst = [1, 2, 5]
     exponent = 0
     buckets: List[int] = []
     while True:
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
         exponent += 1
 
 
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
+
+
+def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_3_5_8_buckets(100)
+    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
+    """
+    return build_buckets([1, 2, 3, 5, 8], max_value)
+
+
 def local_interval_elapsed(now: float, last_log: float,
                            local_interval: float) -> bool:
     elapsed_time = now - last_log
@@ -486,6 +524,12 @@ def _log_prometheus(self, stats: Stats) -> None:
         # Latency
         self._log_histogram(self.metrics.histogram_e2e_time_request,
                             stats.time_e2e_requests)
+        self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                            stats.time_in_queue_requests)
+        self._log_histogram(self.metrics.histogram_model_forward_time_request,
+                            stats.model_forward_time_requests)
+        self._log_histogram(self.metrics.histogram_model_execute_time_request,
+                            stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
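The loop body of `build_buckets` is collapsed out of the hunk above. A runnable sketch of the refactor, with the loop body reconstructed to be consistent with the docstring examples (an assumption, not copied from the file):

```python
from typing import List

def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    # Multiply each mantissa by increasing powers of 10 until max_value is exceeded.
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1

def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    return build_buckets([1, 2, 3, 5, 8], max_value)

# Buckets used by the two millisecond histograms above:
print(build_1_2_3_5_8_buckets(3000))
# [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100, 200, 300, 500, 800, 1000, 2000, 3000]
```

With max_value=3000, the forward/execute histograms get 18 buckets spanning 1 ms to 3 s.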
3 changes: 3 additions & 0 deletions vllm/engine/metrics_types.py
@@ -46,6 +46,9 @@ class Stats:
     # Request stats (should have _requests suffix)
     # Latency
     time_e2e_requests: List[float]
     time_in_queue_requests: List[float]
+    model_forward_time_requests: List[float]
+    model_execute_time_requests: List[float]
     # Metadata
     num_prompt_tokens_requests: List[int]
     num_generation_tokens_requests: List[int]
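For reference, a hedged sketch of what the new `Stats` lists turn into on the Prometheus side once `_log_prometheus` runs. It assumes `_log_histogram` simply calls `.observe()` for each value and that `model_name` is the only label; both are assumptions, not shown in this diff:

```python
from prometheus_client import Histogram

histogram_time_in_queue_request = Histogram(
    name="vllm:time_in_queue_requests",
    documentation="Histogram of time the request spent in the queue in seconds.",
    labelnames=["model_name"],
    buckets=[
        0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
        40.0, 50.0, 60.0
    ])

def log_histogram(histogram: Histogram, data: list) -> None:
    # Mirrors the per-value observation loop assumed for _log_histogram.
    for value in data:
        histogram.labels(model_name="example-model").observe(value)

# Stats.time_in_queue_requests collected for one logging interval:
log_histogram(histogram_time_in_queue_request, [0.12, 0.45, 1.8])
```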