Skip to content

Commit ae2b26e

Browse files
committed
[Metrics][KVConnector] Add connector prefix cache hit rate stats
Signed-off-by: tovam <tovam@pliops.com>
1 parent 432e1cb commit ae2b26e

File tree

4 files changed

+95
-17
lines changed

4 files changed

+95
-17
lines changed

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343

4444
from vllm.logger import init_logger
4545
from vllm.v1.core.sched.output import SchedulerOutput
46+
from vllm.v1.metrics.stats import PrefixCacheStats
4647
from vllm.v1.outputs import KVConnectorOutput
4748

4849
if TYPE_CHECKING:
@@ -89,6 +90,8 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
8990
self._connector_metadata: Optional[KVConnectorMetadata] = None
9091
self._vllm_config = vllm_config
9192
self._role = role
93+
# FIXME: make prefix cache stats conditional on log_stats
94+
self.prefix_cache_stats = PrefixCacheStats()
9295

9396
@property
9497
def role(self) -> KVConnectorRole:
@@ -413,3 +416,27 @@ def build_kv_connector_stats(
413416
which can implement custom aggregation logic on the data dict.
414417
"""
415418
return None
419+
420+
def update_prefix_cache_stats(self, request_num_tokens: int,
                              num_external_tokens: int) -> None:
    """
    Record one request's contribution to the connector prefix cache stats.

    Args:
        request_num_tokens (int): the number of tokens in the request
            (accumulated as queried tokens).
        num_external_tokens (int): the number of tokens that will be
            loaded from the external KV cache (accumulated as hits).
    """
    stats = self.prefix_cache_stats
    stats.requests += 1
    stats.queries += request_num_tokens
    stats.hits += num_external_tokens
433+
434+
def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
    """Return the accumulated prefix cache stats and start a fresh window.

    Returns:
        The prefix caching stats gathered since the previous call
        (or since construction).
    """
    collected, self.prefix_cache_stats = (self.prefix_cache_stats,
                                          PrefixCacheStats())
    return collected

vllm/v1/core/sched/scheduler.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,9 @@ def schedule(self) -> SchedulerOutput:
495495
new_computed_blocks + new_blocks,
496496
num_external_computed_tokens,
497497
)
498+
if self.log_stats:
499+
self.connector.update_prefix_cache_stats(
500+
request.num_tokens, num_external_computed_tokens)
498501

499502
# Request was already popped from self.waiting
500503
# unless it was re-added above due to new_blocks being None.
@@ -1197,16 +1200,20 @@ def make_stats(
11971200
if not self.log_stats:
11981201
return None
11991202
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
1203+
connector_prefix_cache_stats = self.connector.make_prefix_cache_stats(
1204+
) if self.connector is not None else None
12001205
assert prefix_cache_stats is not None
1201-
return SchedulerStats(num_running_reqs=len(self.running),
1202-
num_waiting_reqs=len(self.waiting),
1203-
kv_cache_usage=self.kv_cache_manager.usage,
1204-
prefix_cache_stats=prefix_cache_stats,
1205-
spec_decoding_stats=spec_decoding_stats,
1206-
num_corrupted_reqs=sum(req.is_output_corrupted
1207-
for req in self.running),
1208-
kv_connector_stats=kv_connector_stats.data
1209-
if kv_connector_stats else None)
1206+
return SchedulerStats(
1207+
num_running_reqs=len(self.running),
1208+
num_waiting_reqs=len(self.waiting),
1209+
kv_cache_usage=self.kv_cache_manager.usage,
1210+
prefix_cache_stats=prefix_cache_stats,
1211+
connector_prefix_cache_stats=connector_prefix_cache_stats,
1212+
spec_decoding_stats=spec_decoding_stats,
1213+
num_corrupted_reqs=sum(req.is_output_corrupted
1214+
for req in self.running),
1215+
kv_connector_stats=kv_connector_stats.data
1216+
if kv_connector_stats else None)
12101217

12111218
def make_spec_decoding_stats(
12121219
self,

vllm/v1/metrics/loggers.py

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
6363
self.spec_decoding_logging = SpecDecodingLogging()
6464
kv_tranfer_config = self.vllm_config.kv_transfer_config
6565
self.kv_connector_logging = KVConnectorLogging(kv_tranfer_config)
66+
self.connector_prefix_caching_metrics = PrefixCachingMetrics(
67+
) if kv_tranfer_config else None
6668
self.last_prompt_throughput: float = 0.0
6769
self.last_generation_throughput: float = 0.0
6870

@@ -97,6 +99,11 @@ def record(self,
9799
self.prefix_caching_metrics.observe(
98100
scheduler_stats.prefix_cache_stats)
99101

102+
if (scheduler_stats.connector_prefix_cache_stats is not None
103+
and self.connector_prefix_caching_metrics is not None):
104+
self.connector_prefix_caching_metrics.observe(
105+
scheduler_stats.connector_prefix_cache_stats)
106+
100107
if scheduler_stats.spec_decoding_stats is not None:
101108
self.spec_decoding_logging.observe(
102109
scheduler_stats.spec_decoding_stats)
@@ -124,21 +131,30 @@ def log(self):
124131
self.last_prompt_throughput = prompt_throughput
125132

126133
# Format and print output.
127-
log_fn(
128-
"Engine %03d: "
129-
"Avg prompt throughput: %.1f tokens/s, "
130-
"Avg generation throughput: %.1f tokens/s, "
131-
"Running: %d reqs, Waiting: %d reqs, "
132-
"GPU KV cache usage: %.1f%%, "
133-
"Prefix cache hit rate: %.1f%%",
134+
log_msg = ("Engine %03d: "
135+
"Avg prompt throughput: %.1f tokens/s, "
136+
"Avg generation throughput: %.1f tokens/s, "
137+
"Running: %d reqs, Waiting: %d reqs, "
138+
"GPU KV cache usage: %.1f%%, "
139+
"Prefix cache hit rate: %.1f%%")
140+
141+
log_args = [
134142
self.engine_index,
135143
prompt_throughput,
136144
generation_throughput,
137145
scheduler_stats.num_running_reqs,
138146
scheduler_stats.num_waiting_reqs,
139147
scheduler_stats.kv_cache_usage * 100,
140148
self.prefix_caching_metrics.hit_rate * 100,
141-
)
149+
]
150+
151+
if self.connector_prefix_caching_metrics is not None:
152+
log_msg += ", KV connector prefix cache hit rate: %.1f%%"
153+
log_args.append(self.connector_prefix_caching_metrics.hit_rate *
154+
100)
155+
156+
log_fn(log_msg, *log_args)
157+
142158
self.spec_decoding_logging.log(log_fn=log_fn)
143159
self.kv_connector_logging.log(log_fn=log_fn)
144160

@@ -271,6 +287,25 @@ def __init__(self,
271287
self.counter_prefix_cache_hits = make_per_engine(
272288
counter_prefix_cache_hits, engine_indexes, model_name)
273289

290+
#
291+
# KV connector cache
292+
#
293+
counter_connector_prefix_cache_queries = self._counter_cls(
294+
name="vllm:connector_prefix_cache_queries",
295+
documentation=("KV connector prefix cache queries, "
296+
"in terms of number of queried tokens."),
297+
labelnames=labelnames)
298+
self.counter_connector_prefix_cache_queries = make_per_engine(
299+
counter_connector_prefix_cache_queries, engine_indexes, model_name)
300+
301+
counter_connector_prefix_cache_hits = self._counter_cls(
302+
name="vllm:connector_prefix_cache_hits",
303+
documentation=("KV connector prefix cache hits, "
304+
"in terms of number of cached tokens."),
305+
labelnames=labelnames)
306+
self.counter_connector_prefix_cache_hits = make_per_engine(
307+
counter_connector_prefix_cache_hits, engine_indexes, model_name)
308+
274309
#
275310
# Counters
276311
#
@@ -550,6 +585,12 @@ def record(self,
550585
self.counter_prefix_cache_hits[engine_idx].inc(
551586
scheduler_stats.prefix_cache_stats.hits)
552587

588+
if scheduler_stats.connector_prefix_cache_stats is not None:
589+
self.counter_connector_prefix_cache_queries[engine_idx].inc(
590+
scheduler_stats.connector_prefix_cache_stats.queries)
591+
self.counter_connector_prefix_cache_hits[engine_idx].inc(
592+
scheduler_stats.connector_prefix_cache_stats.hits)
593+
553594
if scheduler_stats.spec_decoding_stats is not None:
554595
self.spec_decoding_prom.observe(
555596
scheduler_stats.spec_decoding_stats, engine_idx)

vllm/v1/metrics/stats.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class SchedulerStats:
4848
prefix_cache_stats: PrefixCacheStats = field(
4949
default_factory=PrefixCacheStats)
5050

51+
connector_prefix_cache_stats: Optional[PrefixCacheStats] = field(
52+
default=None)
53+
5154
spec_decoding_stats: Optional[SpecDecodingStats] = None
5255
kv_connector_stats: Optional[dict[str, Any]] = None
5356

0 commit comments

Comments (0)