[V1][Metrics] Hook up IterationStats for Prometheus metrics #12478

Merged
tests/entrypoints/openai/test_metrics.py (7 changes: 5 additions & 2 deletions)
@@ -105,8 +105,6 @@ async def client(server):
 @pytest.mark.asyncio
 async def test_metrics_counts(server: RemoteOpenAIServer,
                               client: openai.AsyncClient, use_v1: bool):
-    if use_v1:
-        pytest.skip("Skipping test on vllm V1")
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
@@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
+        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+            continue
+
         found_metric = False
 
         # Check to see if the metric_family is found in the prom endpoint.
@@ -199,6 +200,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
 ]
 
 
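As a quick illustration of what the V1 gate above amounts to, the snippet below checks that the expected V1 metric families appear in a Prometheus scrape of a running server. This is a hedged sketch, not the PR's test code: the base URL, the helper name, and the use of requests plus prometheus_client's text parser are assumptions made for illustration.

# Hypothetical helper (not part of the PR): scrape /metrics and assert that
# the V1 metric families, including the two new token counters, are present.
import requests
from prometheus_client.parser import text_string_to_metric_families

EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
]


def check_v1_metrics(base_url: str = "http://localhost:8000") -> None:
    text = requests.get(f"{base_url}/metrics").text
    # The parser strips the "_total" suffix from counter family names,
    # so accept either form when comparing.
    families = {family.name for family in text_string_to_metric_families(text)}
    for metric in EXPECTED_METRICS_V1:
        assert metric in families or metric.removesuffix("_total") in families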
vllm/v1/engine/async_llm.py (3 changes: 2 additions & 1 deletion)
@@ -305,7 +305,8 @@ def _log_stats(
             return
 
         for logger in self.stat_loggers:
-            logger.log(scheduler_stats=scheduler_stats)
+            logger.log(scheduler_stats=scheduler_stats,
+                       iteration_stats=iteration_stats)
 
     def encode(
         self,
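The _log_stats loop now forwards both stats objects to every logger. The diff does not include vllm/v1/metrics/stats.py, but from the way the loggers below consume these objects, their shape is roughly the following; this is a sketch inferred from usage, not the actual definitions.

# Inferred sketch of the stats objects passed to the loggers (the real
# classes live in vllm/v1/metrics/stats.py and may carry more fields).
from dataclasses import dataclass


@dataclass
class SchedulerStats:
    num_running_reqs: int = 0   # drives vllm:num_requests_running
    num_waiting_reqs: int = 0   # drives vllm:num_requests_waiting


@dataclass
class IterationStats:
    num_prompt_tokens: int = 0      # prefill tokens processed this iteration
    num_generation_tokens: int = 0  # new tokens generated this iteration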
vllm/v1/metrics/loggers.py (68 changes: 59 additions & 9 deletions)
@@ -1,11 +1,12 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, List
 
+import numpy as np
 import prometheus_client
 
 from vllm.logger import init_logger
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -15,27 +16,61 @@
 class StatLoggerBase(ABC):
 
     @abstractmethod
-    def log(self, scheduler_stats: SchedulerStats):
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         ...
 
 
 class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self):
-        self.last_log_time = time.monotonic()
+        self._reset(time.monotonic())
 
-    def log(self, scheduler_stats: SchedulerStats):
-        """Log Stats to standard output."""
+    def _reset(self, now):
+        self.last_log_time = now
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        # Log every _LOCAL_LOGGING_INTERVAL_SEC.
+        elapsed_time = now - self.last_log_time
+        return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC
+
+    def _track_iteration_stats(self, iteration_stats: IterationStats):
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
+        self.num_generation_tokens.append(
+            iteration_stats.num_generation_tokens)
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        # Compute summary metrics for tracked stats
+        return float(np.sum(tracked_stats) / (now - self.last_log_time))
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
+        """Log Stats to standard output."""
+
+        self._track_iteration_stats(iteration_stats)
+
         now = time.monotonic()
-        if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC:
+        if not self._local_interval_elapsed(now):
             return
-        self.last_log_time = now
+
+        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
+        generation_throughput = self._get_throughput(
+            self.num_generation_tokens, now)
+
+        self._reset(now)
 
         # Format and print output.
         logger.info(
+            "Avg prompt throughput: %.1f tokens/s, "
+            "Avg generation throughput: %.1f tokens/s, "
             "Running: %d reqs, Waiting: %d reqs ",
+            prompt_throughput,
+            generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
         )
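The throughput figures in the log line above are plain interval averages: the token counts tracked since the last log, summed and divided by the elapsed time. A small worked example with assumed numbers and the default 5-second interval:

# Worked example of the interval-average throughput (assumed numbers).
import numpy as np

num_prompt_tokens = [512, 256, 1024]  # per-iteration counts since last log
elapsed = 5.0                         # seconds since last_log_time
prompt_throughput = float(np.sum(num_prompt_tokens) / elapsed)
print(f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s")  # 358.4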
@@ -61,11 +96,26 @@ def __init__(self, labels: Dict[str, str]):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
-    def log(self, scheduler_stats: SchedulerStats):
+        self.counter_prompt_tokens = prometheus_client.Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_generation_tokens = prometheus_client.Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         """Log to prometheus."""
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
+        self.counter_generation_tokens.inc(
+            iteration_stats.num_generation_tokens)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
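For reference, the counter pattern used by PrometheusStatLogger can be exercised on its own as below. This is a standalone sketch: the label name and sample values are placeholders, not the labels vLLM actually attaches.

# Standalone sketch of the prometheus_client counter pattern used above.
# "model_name" and the increment values are illustrative placeholders.
import prometheus_client

labelnames = ["model_name"]
labelvalues = ["my-model"]

counter_prompt_tokens = prometheus_client.Counter(
    name="vllm:prompt_tokens_total",
    documentation="Number of prefill tokens processed.",
    labelnames=labelnames).labels(*labelvalues)

counter_generation_tokens = prometheus_client.Counter(
    name="vllm:generation_tokens_total",
    documentation="Number of generation tokens processed.",
    labelnames=labelnames).labels(*labelvalues)

# Each engine iteration would increment by that iteration's token counts.
counter_prompt_tokens.inc(512)
counter_generation_tokens.inc(32)

# Show the exposition text a /metrics scrape would return.
print(prometheus_client.generate_latest().decode())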