
Commit 85ce056

[V1][Spec Decoding] Strip metrics back to acceptance rate
Now just num_accepted_tokens, num_draft_tokens, and acceptance rate.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent 1647956 commit 85ce056
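
As the commit message says, only the num_draft_tokens and num_accepted_tokens counts survive; the acceptance rate is derived from them rather than exported separately. A minimal illustrative helper (not code from this commit) showing that relationship, including the NaN convention the trimmed SpecDecodingMetrics.log() uses when no drafts were proposed:

    def draft_acceptance_rate(num_accepted_tokens: int,
                              num_draft_tokens: int) -> float:
        # Undefined when no draft tokens were proposed in the interval.
        if num_draft_tokens <= 0:
            return float("nan")
        return num_accepted_tokens / num_draft_tokens

    # e.g. 3 of 4 proposed draft tokens accepted -> 0.75
    assert draft_acceptance_rate(3, 4) == 0.75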

File tree

5 files changed: +13 -39 lines changed

vllm/v1/core/sched/scheduler.py

Lines changed: 1 addition & 5 deletions

@@ -602,13 +602,9 @@ def update_from_output(
             request.num_computed_tokens -= num_tokens_rejected

             if spec_decoding_stats is not None:
-                # FIXME: If a drafter proposes zero tokens, we should
-                # treat this as if num_spec_tokens were proposed and
-                # all rejected to allow fair comparisons between drafters
                 spec_decoding_stats.observe(
                     num_draft_tokens=len(scheduled_spec_token_ids),
-                    num_accepted_tokens=len(generated_token_ids) - 1,
-                    num_emitted_tokens=len(generated_token_ids))
+                    num_accepted_tokens=len(generated_token_ids) - 1)

             cached_encoder_input_ids = (
                 self.encoder_cache_manager.get_cached_input_ids(request))
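
A small, self-contained illustration (made-up token IDs, not from the commit) of the accounting above for one request in one step: the drafter's proposals are scheduled_spec_token_ids, verification returns generated_token_ids, and the "- 1" reflects that each step emits one token that is not an accepted draft (the token sampled after the last accepted draft):

    scheduled_spec_token_ids = [101, 102, 103]   # drafter proposed 3 tokens
    generated_token_ids = [101, 102, 777]        # 2 drafts accepted, then 1 newly sampled token

    num_draft_tokens = len(scheduled_spec_token_ids)     # 3
    num_accepted_tokens = len(generated_token_ids) - 1   # 2

    assert (num_draft_tokens, num_accepted_tokens) == (3, 2)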

vllm/v1/engine/async_llm.py

Lines changed: 1 addition & 2 deletions

@@ -74,8 +74,7 @@ def __init__(
         for i in range(vllm_config.parallel_config.data_parallel_size):
             loggers: list[StatLoggerBase] = []
             if logger.isEnabledFor(logging.INFO):
-                loggers.append(
-                    LoggingStatLogger(vllm_config, engine_index=i))
+                loggers.append(LoggingStatLogger(engine_index=i))
             loggers.append(
                 PrometheusStatLogger(vllm_config, engine_index=i))
             self.stat_loggers.append(loggers)
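
For context, a toy sketch of the per-engine fan-out this constructor builds (class and variable names here are illustrative, not the vLLM API): one list of stat loggers per data-parallel engine, so stats from engine i are recorded by every logger in stat_loggers[i]:

    class ToyLogger:
        def __init__(self, engine_index: int):
            self.engine_index = engine_index

        def record(self, stats: dict) -> None:
            print(f"engine {self.engine_index}: {stats}")

    data_parallel_size = 2
    stat_loggers = [[ToyLogger(engine_index=i)] for i in range(data_parallel_size)]

    # Later, when engine 1 reports a scheduler step:
    for stat_logger in stat_loggers[1]:
        stat_logger.record({"num_draft_tokens": 4, "num_accepted_tokens": 3})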

vllm/v1/metrics/loggers.py

Lines changed: 2 additions & 10 deletions

@@ -32,15 +32,14 @@ def log(self): # noqa

 class LoggingStatLogger(StatLoggerBase):

-    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
+    def __init__(self, engine_index: int = 0):
         self.engine_index = engine_index
         self._reset(time.monotonic())
         self.last_scheduler_stats = SchedulerStats()
         # Prefix cache metrics. This cannot be reset.
         # TODO: Make the interval configurable.
         self.prefix_caching_metrics = PrefixCachingMetrics()
-        self.spec_decoding_metrics = SpecDecodingMetrics(
-            vllm_config.speculative_config)
+        self.spec_decoding_metrics = SpecDecodingMetrics()

     def _reset(self, now):
         self.last_log_time = now

@@ -329,11 +328,6 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 name="vllm:spec_decode_num_accepted_tokens_total",
                 documentation="Number of accepted tokens.",
                 labelnames=labelnames).labels(*labelvalues)
-        self.counter_spec_decode_num_emitted_tokens = \
-            prometheus_client.Counter(
-                name="vllm:spec_decode_num_emitted_tokens_total",
-                documentation="Number of emitted tokens.",
-                labelnames=labelnames).labels(*labelvalues)

         #
         # Cache config info metric

@@ -376,8 +370,6 @@ def record(self, scheduler_stats: SchedulerStats,
                 scheduler_stats.spec_decoding_stats.num_draft_tokens)
             self.counter_spec_decode_num_accepted_tokens.inc(
                 scheduler_stats.spec_decoding_stats.num_accepted_tokens)
-            self.counter_spec_decode_num_emitted_tokens.inc(
-                scheduler_stats.spec_decoding_stats.num_emitted_tokens)

         if iteration_stats is None:
             return
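
After this change only two speculative-decoding counters remain on the Prometheus side. A standalone sketch using prometheus_client (the draft-token metric name is assumed by symmetry with the accepted-token name shown above, and labels are omitted for brevity); acceptance rate becomes a ratio computed at query time rather than a separately exported series:

    import prometheus_client

    counter_draft = prometheus_client.Counter(
        name="vllm:spec_decode_num_draft_tokens_total",   # assumed name, by symmetry
        documentation="Number of draft tokens.")
    counter_accepted = prometheus_client.Counter(
        name="vllm:spec_decode_num_accepted_tokens_total",
        documentation="Number of accepted tokens.")

    # Mirroring PrometheusStatLogger.record() for one scheduler step:
    counter_draft.inc(4)      # drafter proposed 4 tokens
    counter_accepted.inc(3)   # target model accepted 3 of them

    # Acceptance rate in PromQL would then be, e.g.:
    #   rate(vllm:spec_decode_num_accepted_tokens_total[5m])
    #     / rate(vllm:spec_decode_num_draft_tokens_total[5m])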

vllm/v1/spec_decode/metrics.py

Lines changed: 5 additions & 18 deletions

@@ -4,7 +4,6 @@

 import numpy as np

-from vllm.config import SpeculativeConfig
 from vllm.logger import init_logger

 logger = init_logger(__name__)

@@ -14,59 +13,47 @@
 class SpecDecodingStats:
     num_draft_tokens: int = 0
     num_accepted_tokens: int = 0
-    num_emitted_tokens: int = 0

     def take(self):
         copied = SpecDecodingStats(self.num_draft_tokens,
-                                   self.num_accepted_tokens,
-                                   self.num_emitted_tokens)
+                                   self.num_accepted_tokens)
         self.reset()
         return copied

     def reset(self):
         self.num_draft_tokens = 0
         self.num_accepted_tokens = 0
-        self.num_emitted_tokens = 0

-    def observe(self, num_draft_tokens: int, num_accepted_tokens: int,
-                num_emitted_tokens: int):
+    def observe(self, num_draft_tokens: int, num_accepted_tokens: int):
         self.num_draft_tokens += num_draft_tokens
         self.num_accepted_tokens += num_accepted_tokens
-        self.num_emitted_tokens += num_emitted_tokens


 class SpecDecodingMetrics:

-    def __init__(self, speculative_config: SpeculativeConfig):
-        self.num_spec_tokens = (speculative_config.num_speculative_tokens
-                                if speculative_config is not None else 0)
+    def __init__(self):
         self.reset()

     def reset(self):
         self.num_draft_tokens: list[int] = []
         self.num_accepted_tokens: list[int] = []
-        self.num_emitted_tokens: list[int] = []

     def observe(self, spec_decoding_stats: SpecDecodingStats):
         self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
         self.num_accepted_tokens.append(
             spec_decoding_stats.num_accepted_tokens)
-        self.num_emitted_tokens.append(spec_decoding_stats.num_emitted_tokens)

     def log(self):
         num_draft_tokens = np.sum(self.num_draft_tokens)
         num_accepted_tokens = np.sum(self.num_accepted_tokens)
-        num_emitted_tokens = np.sum(self.num_emitted_tokens)

         draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens
                                  if num_draft_tokens > 0 else float("nan"))

         logger.info(
             "Speculative metrics: "
             "Draft acceptance rate: %.3f, "
-            "Number of speculative tokens: %d, "
             "Number of accepted tokens: %d, "
-            "Number of draft tokens: %d, "
-            "Number of emitted tokens: %d.", draft_acceptance_rate,
-            num_accepted_tokens, num_draft_tokens, num_emitted_tokens)
+            "Number of draft tokens: %d, ", draft_acceptance_rate,
+            num_accepted_tokens, num_draft_tokens)
         self.reset()
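
A toy, self-contained version of the take()/reset() handoff visible in the trimmed SpecDecodingStats above (not the vLLM class itself): counts accumulate during a step, then a copy is handed to the logging side and the accumulator starts over, so nothing is double-counted:

    from dataclasses import dataclass

    @dataclass
    class ToyStats:
        num_draft_tokens: int = 0
        num_accepted_tokens: int = 0

        def observe(self, num_draft_tokens: int, num_accepted_tokens: int) -> None:
            self.num_draft_tokens += num_draft_tokens
            self.num_accepted_tokens += num_accepted_tokens

        def take(self) -> "ToyStats":
            # Snapshot the current counts, then reset for the next step.
            copied = ToyStats(self.num_draft_tokens, self.num_accepted_tokens)
            self.num_draft_tokens = 0
            self.num_accepted_tokens = 0
            return copied

    stats = ToyStats()
    stats.observe(num_draft_tokens=4, num_accepted_tokens=3)
    snapshot = stats.take()
    assert (snapshot.num_draft_tokens, snapshot.num_accepted_tokens) == (4, 3)
    assert (stats.num_draft_tokens, stats.num_accepted_tokens) == (0, 0)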

vllm/v1/worker/gpu_model_runner.py

Lines changed: 4 additions & 4 deletions

@@ -1154,20 +1154,20 @@ def generate_draft_token_ids(
         self,
         sampled_token_ids: list[list[int]],
         sampling_metadata: SamplingMetadata,
-    ) -> list[Optional[list[int]]]:
+    ) -> list[list[int]]:
         # TODO(woosuk): Optimize.
-        draft_token_ids: list[Optional[list[int]]] = []
+        draft_token_ids: list[list[int]] = []
         for i, sampled_ids in enumerate(sampled_token_ids):
             num_sampled_ids = len(sampled_ids)
             if not num_sampled_ids:
                 # Skip speculative decoding.
-                draft_token_ids.append(None)
+                draft_token_ids.append([])
                 continue

             # Skip requests that require top-p, top-k, etc.
             req_id = self.input_batch.req_ids[i]
             if not is_spec_decode_supported(req_id, self.input_batch):
-                draft_token_ids.append(None)
+                draft_token_ids.append([])
                 continue

             # Add sampled_token_ids to token_ids_cpu.
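
A hypothetical downstream view (not code from the commit) of why the return type tightens from list[Optional[list[int]]] to list[list[int]]: "no draft for this request" is now an empty list rather than None, so consumers can take len() uniformly without a None check:

    draft_token_ids: list[list[int]] = [
        [7, 8, 9],   # request 0: three draft tokens proposed
        [],          # request 1: speculation skipped (e.g. unsupported sampling params)
    ]

    for i, drafts in enumerate(draft_token_ids):
        print(f"request {i}: {len(drafts)} draft tokens proposed")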
