|
7 | 7 | import numpy as np |
8 | 8 | import prometheus_client |
9 | 9 |
|
10 | | -from vllm.config import ModelConfig |
| 10 | +from vllm.config import VllmConfig |
11 | 11 | from vllm.logger import init_logger |
12 | 12 | from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics |
13 | 13 | from vllm.v1.engine import FinishReason |
@@ -92,13 +92,13 @@ def log(self, scheduler_stats: SchedulerStats, |
92 | 92 |
|
93 | 93 | class PrometheusStatLogger(StatLoggerBase): |
94 | 94 |
|
95 | | - def __init__(self, model_config: ModelConfig): |
| 95 | + def __init__(self, vllm_config: VllmConfig): |
96 | 96 | self._unregister_vllm_metrics() |
97 | 97 |
|
98 | 98 | labelnames = ["model_name"] |
99 | | - labelvalues = [model_config.served_model_name] |
| 99 | + labelvalues = [vllm_config.model_config.served_model_name] |
100 | 100 |
|
101 | | - max_model_len = model_config.max_model_len |
| 101 | + max_model_len = vllm_config.model_config.max_model_len |
102 | 102 |
|
103 | 103 | self.gauge_scheduler_running = prometheus_client.Gauge( |
104 | 104 | name="vllm:num_requests_running", |
@@ -162,6 +162,13 @@ def __init__(self, model_config: ModelConfig): |
162 | 162 | buckets=build_1_2_5_buckets(max_model_len), |
163 | 163 | labelnames=labelnames).labels(*labelvalues) |
164 | 164 |
|
| 165 | + self.histogram_iteration_tokens = \ |
| 166 | + prometheus_client.Histogram( |
| 167 | + name="vllm:iteration_tokens_total", |
| 168 | + documentation="Histogram of number of tokens per engine_step.", |
| 169 | + buckets=build_cudagraph_buckets(vllm_config), |
| 170 | + labelnames=labelnames).labels(*labelvalues) |
| 171 | + |
165 | 172 | self.histogram_time_to_first_token = \ |
166 | 173 | prometheus_client.Histogram( |
167 | 174 | name="vllm:time_to_first_token_seconds", |
@@ -237,6 +244,9 @@ def log(self, scheduler_stats: SchedulerStats, |
237 | 244 | self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) |
238 | 245 | self.counter_generation_tokens.inc( |
239 | 246 | iteration_stats.num_generation_tokens) |
| 247 | + self.histogram_iteration_tokens.observe( |
| 248 | + iteration_stats.num_prompt_tokens + \ |
| 249 | + iteration_stats.num_generation_tokens) |
240 | 250 |
|
241 | 251 | for finished_request in iteration_stats.finished_requests: |
242 | 252 | self.counter_request_success[finished_request.finish_reason].inc() |
@@ -293,3 +303,13 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: |
293 | 303 | [1, 2, 5, 10, 20, 50, 100] |
294 | 304 | """ |
295 | 305 | return build_buckets([1, 2, 5], max_value) |
| 306 | + |
| 307 | + |
| 308 | +def build_cudagraph_buckets(vllm_config: VllmConfig) -> List[int]: |
| 309 | + if not vllm_config.model_config.enforce_eager: |
| 310 | + buckets = vllm_config.compilation_config.\ |
| 311 | + cudagraph_capture_sizes.copy() |
| 312 | + buckets.sort() |
| 313 | + return buckets |
| 314 | + else: |
| 315 | +        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
0 commit comments