diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 8e8cb27d42cf..e73e08e74b0d 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -310,7 +310,7 @@ def test_metrics(): def stats(requests, queries, hits): return PrefixCacheStats(requests=requests, queries=queries, hits=hits) - metrics = PrefixCachingMetrics(interval=5) + metrics = PrefixCachingMetrics(max_recent_requests=5) assert metrics.hit_rate == 0.0 metrics.observe(stats(1, 20, 9)) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 521fea70fc98..3026ecc1c968 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -47,15 +47,15 @@ class BlockHashType(NamedTuple): class PrefixCachingMetrics: - """Metrics for prefix caching with a hit rate of the most recent N requests. + """Metrics for prefix caching with a hit rate of at most N recent requests. Args: - interval: The number of the most recent requests to aggregate. + max_recent_requests: The max number of recent requests to aggregate. Defaults to 1000. """ - def __init__(self, interval: int = 1000): - self.interval = interval + def __init__(self, max_recent_requests: int = 1000): + self.max_recent_requests = max_recent_requests # The current aggregated values. self.aggregated_requests = 0 self.aggregated_query_total = 0 @@ -70,7 +70,7 @@ def observe(self, stats: PrefixCacheStats): are being scheduled and are looking for computed blocks. When there are more than `interval` requests, the oldest set of - requestsare removed from the metrics. + requests are removed from the metrics. Args: stats: The prefix cache stats. @@ -87,7 +87,7 @@ def observe(self, stats: PrefixCacheStats): self.aggregated_query_hit += stats.hits # Remove the oldest stats if the number of requests exceeds. 
- if self.aggregated_requests > self.interval: + if self.aggregated_requests > self.max_recent_requests: old_requests, old_queries, old_hits = self.query_queue.popleft() self.aggregated_requests -= old_requests self.aggregated_query_total -= old_queries