@@ -9,6 +9,7 @@
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
@@ -37,6 +38,9 @@ def _reset(self, now):
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []
 
+        # Prefix cache metrics. TODO: Make the interval configurable.
+        self.prefix_caching_metrics = PrefixCachingMetrics()
+
     def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
         elapsed_time = now - self.last_log_time
@@ -58,6 +62,8 @@ def log(self, scheduler_stats: SchedulerStats,
 
         self._track_iteration_stats(iteration_stats)
 
+        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
         now = time.monotonic()
         if not self._local_interval_elapsed(now):
             return
@@ -72,13 +78,15 @@ def log(self, scheduler_stats: SchedulerStats,
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs "
-            "GPU KV cache usage: %.1f%%.",
+            "Running: %d reqs, Waiting: %d reqs, "
+            "GPU KV cache usage: %.1f%%, "
+            "Prefix cache hit rate: %.1f%%",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
             scheduler_stats.gpu_cache_usage * 100,
+            self.prefix_caching_metrics.hit_rate * 100,
         )
 
 
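For context, here is a minimal sketch of what `PrefixCachingMetrics` could look like, assuming it aggregates recent `PrefixCacheStats` observations over a fixed-size sliding window. Only the names visible in this diff (`observe`, `hit_rate`, the `queries`/`hits` fields, and the existence of an interval per the TODO) are grounded; the internals below are assumptions, not the actual implementation in `vllm.v1.core.kv_cache_utils`:

```python
from collections import deque
from dataclasses import dataclass
from typing import Deque, Tuple


@dataclass
class PrefixCacheStats:
    """Per-step counters; field names match the attributes used in the diff."""
    queries: int = 0  # blocks looked up in the prefix cache this step
    hits: int = 0  # blocks that were already cached


class PrefixCachingMetrics:
    """Aggregates hit/query counts over the most recent observations."""

    def __init__(self, interval: int = 1000):
        # The diff's TODO suggests this window size should become
        # configurable; 1000 is an assumed default.
        self.interval = interval
        self.aggregated_queries = 0
        self.aggregated_hits = 0
        self.window: Deque[Tuple[int, int]] = deque()

    def observe(self, stats: PrefixCacheStats) -> None:
        self.window.append((stats.queries, stats.hits))
        self.aggregated_queries += stats.queries
        self.aggregated_hits += stats.hits
        # Drop the oldest observations once the window is full.
        while len(self.window) > self.interval:
            old_queries, old_hits = self.window.popleft()
            self.aggregated_queries -= old_queries
            self.aggregated_hits -= old_hits

    @property
    def hit_rate(self) -> float:
        if self.aggregated_queries == 0:
            return 0.0
        return self.aggregated_hits / self.aggregated_queries
```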
@@ -107,6 +115,18 @@ def __init__(self, model_config: ModelConfig):
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=
+            "GPU prefix cache queries, in terms of number of queried blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=
+            "GPU prefix cache hits, in terms of number of cached blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
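Exposing raw query and hit counters, rather than a precomputed ratio, follows standard Prometheus practice: the hit rate over any window can be derived at query time, e.g. `rate(vllm:gpu_prefix_cache_hits[5m]) / rate(vllm:gpu_prefix_cache_queries[5m])` in PromQL, and counters aggregate correctly across replicas where a ratio would not.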
@@ -170,6 +190,11 @@ def log(self, scheduler_stats: SchedulerStats,
 
         self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
+        self.counter_gpu_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
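A quick, illustrative exercise of the sketch above (the numbers are arbitrary, chosen only to show how the windowed hit rate falls out of the per-step stats):

```python
m = PrefixCachingMetrics()
m.observe(PrefixCacheStats(queries=16, hits=12))
m.observe(PrefixCacheStats(queries=8, hits=2))
print(f"Prefix cache hit rate: {m.hit_rate * 100:.1f}%")  # 58.3% (14/24)
```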