1212from vllm .v1 .core .kv_cache_utils import PrefixCachingMetrics
1313from vllm .v1 .engine import FinishReason
1414from vllm .v1 .metrics .stats import IterationStats , SchedulerStats
15+ from vllm .v1 .spec_decode .metrics import SpecDecodingMetrics
1516
1617logger = init_logger (__name__ )
1718
@@ -31,13 +32,15 @@ def log(self): # noqa
3132
3233class LoggingStatLogger (StatLoggerBase ):
3334
34- def __init__ (self , engine_index : int = 0 ):
def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
    """Initialize per-engine logging state.

    Args:
        vllm_config: Engine configuration; only ``speculative_config``
            is read here, to set up speculative-decoding metrics.
        engine_index: Index identifying this engine instance
            (presumably for multi-engine deployments — confirm with callers).
    """
    self.engine_index = engine_index
    # Seed the rolling-interval state from a monotonic clock so elapsed
    # times are immune to wall-clock adjustments.
    self._reset(time.monotonic())
    self.last_scheduler_stats = SchedulerStats()
    # Prefix cache metrics. This cannot be reset.
    # TODO: Make the interval configurable.
    self.prefix_caching_metrics = PrefixCachingMetrics()
    # NOTE(review): accumulates speculative-decoding stats observed via
    # record(); unlike the interval state above, it is not reset here.
    self.spec_decoding_metrics = SpecDecodingMetrics(
        vllm_config.speculative_config)
4144
4245 def _reset (self , now ):
4346 self .last_log_time = now
@@ -65,6 +68,10 @@ def record(self, scheduler_stats: SchedulerStats,
6568
6669 self .prefix_caching_metrics .observe (scheduler_stats .prefix_cache_stats )
6770
71+ if scheduler_stats .spec_decoding_stats is not None :
72+ self .spec_decoding_metrics .observe (
73+ scheduler_stats .spec_decoding_stats )
74+
6875 self .last_scheduler_stats = scheduler_stats
6976
7077 def log (self ):
@@ -94,6 +101,9 @@ def log(self):
94101 self .prefix_caching_metrics .hit_rate * 100 ,
95102 )
96103
104+ if scheduler_stats .spec_decoding_stats is not None :
105+ self .spec_decoding_metrics .log ()
106+
97107
98108class PrometheusStatLogger (StatLoggerBase ):
99109
@@ -302,6 +312,29 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
302312 self .labelname_running_lora_adapters ,
303313 ])
304314
315+ #
316+ # Speculative Decoding metrics
317+ # The acceptance rate can be calculated using a PromQL query:
318+ #
319+ # rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
320+ # rate(vllm:spec_decode_num_draft_tokens_total[$interval])
321+ #
322+ self .counter_spec_decode_num_draft_tokens = \
323+ prometheus_client .Counter (
324+ name = "vllm:spec_decode_num_draft_tokens_total" ,
325+ documentation = "Number of draft tokens." ,
326+ labelnames = labelnames ).labels (* labelvalues )
327+ self .counter_spec_decode_num_accepted_tokens = \
328+ prometheus_client .Counter (
329+ name = "vllm:spec_decode_num_accepted_tokens_total" ,
330+ documentation = "Number of accepted tokens." ,
331+ labelnames = labelnames ).labels (* labelvalues )
332+ self .counter_spec_decode_num_emitted_tokens = \
333+ prometheus_client .Counter (
334+ name = "vllm:spec_decode_num_emitted_tokens_total" ,
335+ documentation = "Number of emitted tokens." ,
336+ labelnames = labelnames ).labels (* labelvalues )
337+
305338 #
306339 # Cache config info metric
307340 #
@@ -338,6 +371,14 @@ def record(self, scheduler_stats: SchedulerStats,
338371 self .counter_gpu_prefix_cache_hits .inc (
339372 scheduler_stats .prefix_cache_stats .hits )
340373
374+ if scheduler_stats .spec_decoding_stats is not None :
375+ self .counter_spec_decode_num_draft_tokens .inc (
376+ scheduler_stats .spec_decoding_stats .num_draft_tokens )
377+ self .counter_spec_decode_num_accepted_tokens .inc (
378+ scheduler_stats .spec_decoding_stats .num_accepted_tokens )
379+ self .counter_spec_decode_num_emitted_tokens .inc (
380+ scheduler_stats .spec_decoding_stats .num_emitted_tokens )
381+
341382 if iteration_stats is None :
342383 return
343384
0 commit comments