
Commit 43efac0

DarkLight1337 authored and alhridoy committed
[Metrics] Log multi-modal cache stats and fix reset (vllm-project#26285)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 0ccaf77 commit 43efac0
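
Note on the change: per the new tests below, the multi-modal cache stats surface as two cumulative counters, vllm:mm_cache_queries and vllm:mm_cache_hits, which can also be read offline through LLM.get_metrics(). A minimal sketch of deriving a hit rate from them follows; only the counter names and the get_metrics()/Counter interface come from this diff, the helper itself is an illustrative assumption.

# Sketch: compute a multi-modal cache hit rate from the counters added in this
# commit. The helper name and the division are illustrative, not vLLM API.
from vllm import LLM
from vllm.v1.metrics.reader import Counter


def mm_cache_hit_rate(llm: LLM) -> float:
    values = {
        m.name: m.value
        for m in llm.get_metrics()
        if isinstance(m, Counter)
    }
    queries = values.get("vllm:mm_cache_queries", 0)
    hits = values.get("vllm:mm_cache_hits", 0)
    return hits / queries if queries else 0.0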

File tree

25 files changed: +588, -237 lines

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm import LLM
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.v1.metrics.reader import Counter, Metric
+
+from ..openai.test_vision import TEST_IMAGE_ASSETS
+
+
+def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": image_url},
+                },
+            ],
+        }
+    ]
+
+
+def _get_counter_value(metrics: list[Metric], name: str):
+    metric = next(m for m in metrics if m.name == name)
+    assert isinstance(metric, Counter)
+    return metric.value
+
+
+def _get_mm_cache_stats(metrics: list[Metric]):
+    mm_cache_queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
+    mm_cache_hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
+
+    return mm_cache_queries, mm_cache_hits
+
+
+@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
+@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
+def test_mm_cache_stats(
+    num_gpus_available,
+    image_urls,
+    mm_processor_cache_type,
+):
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        mm_processor_cache_type=mm_processor_cache_type,
+        disable_log_stats=False,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
+
+    llm.chat(_make_messages(image_urls[1]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
+
+    # NOTE: This only resets hit rate stats in CachingMetrics
+    # The raw queries and hits counts remain unaffected
+    llm.reset_mm_cache()
+
+    llm.chat(_make_messages(image_urls[0]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
+
+    llm.chat(_make_messages(image_urls[1]))
+    assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
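
Note on the reset semantics exercised above: the counters are cumulative, so llm.reset_mm_cache() clears the cache (and the windowed hit-rate tracking in CachingMetrics) but does not rewind vllm:mm_cache_queries or vllm:mm_cache_hits. Measuring the hit rate for a specific interval therefore means diffing two readings, as in this small sketch built on the values asserted by the test (the helper is hypothetical):

# Sketch: interval hit rate from two cumulative (queries, hits) readings, such
# as the tuples returned by the _get_mm_cache_stats helper above.
def delta_hit_rate(before: tuple[int, int], after: tuple[int, int]) -> float:
    queries = after[0] - before[0]
    hits = after[1] - before[1]
    return hits / queries if queries else 0.0


# Values asserted in the test: (3, 1) right before reset_mm_cache(), (5, 1)
# after two more requests -- both requests miss because the cache was cleared.
assert delta_hit_rate((3, 1), (5, 1)) == 0.0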

tests/entrypoints/openai/test_metrics.py

Lines changed: 115 additions & 65 deletions
@@ -18,10 +18,18 @@

 from ...utils import RemoteOpenAIServer

-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODELS = {
+    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
+}
 PREV_MINOR_VERSION = version._prev_minor_version()


+@pytest.fixture(scope="module", params=list(MODELS.keys()))
+def model_key(request):
+    yield request.param
+
+
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
@@ -45,11 +53,12 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(default_server_args, request):
+def server(model_key, default_server_args, request):
     if request.param:
         default_server_args.append(request.param)

-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+    model_name = MODELS[model_key]
+    with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
         yield remote_server


@@ -60,73 +69,80 @@ async def client(server):


 _PROMPT = "Hello my name is Robert and I love magic"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
-
-_NUM_REQUESTS = 10
-_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
-_NUM_GENERATION_TOKENS_PER_REQUEST = 10
-
-# {metric_family: [(suffix, expected_value)]}
-EXPECTED_VALUES = {
-    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:time_per_output_token_seconds": [
-        ("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
-    ],
-    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
-    "vllm:request_prompt_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_generation_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens": [
-        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ("_count", _NUM_REQUESTS),
-    ],
-    "vllm:iteration_tokens_total": [
-        (
-            "_sum",
-            _NUM_REQUESTS
-            * (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
-        ),
-        ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-    ],
-    "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
-    "vllm:generation_tokens": [
-        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
-    ],
-    "vllm:request_success": [("_total", _NUM_REQUESTS)],
-}
+_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+
+def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
+    num_prompt_tokens = len(prompt_ids)
+
+    # {metric_family: [(suffix, expected_value)]}
+    return {
+        "vllm:time_to_first_token_seconds": [("_count", num_requests)],
+        "vllm:time_per_output_token_seconds": [
+            ("_count", num_requests * (max_tokens - 1))
+        ],
+        "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
+        "vllm:request_queue_time_seconds": [("_count", num_requests)],
+        "vllm:request_inference_time_seconds": [("_count", num_requests)],
+        "vllm:request_prefill_time_seconds": [("_count", num_requests)],
+        "vllm:request_decode_time_seconds": [("_count", num_requests)],
+        "vllm:request_prompt_tokens": [
+            ("_sum", num_requests * num_prompt_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_generation_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:request_params_n": [("_count", num_requests)],
+        "vllm:request_params_max_tokens": [
+            ("_sum", num_requests * max_tokens),
+            ("_count", num_requests),
+        ],
+        "vllm:iteration_tokens_total": [
+            (
+                "_sum",
+                num_requests * (num_prompt_tokens + max_tokens),
+            ),
+            ("_count", num_requests * max_tokens),
+        ],
+        "vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
+        "vllm:generation_tokens": [("_total", num_requests * max_tokens)],
+        "vllm:request_success": [("_total", num_requests)],
+    }


 @pytest.mark.asyncio
 async def test_metrics_counts(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
-    for _ in range(_NUM_REQUESTS):
+    if model_key == "multimodal":
+        pytest.skip("Unnecessary test")
+
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+    num_requests = 10
+    max_tokens = 10
+
+    for _ in range(num_requests):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
-            model=MODEL_NAME,
-            prompt=_TOKENIZED_PROMPT,
-            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
+            model=model_name,
+            prompt=prompt_ids,
+            max_tokens=max_tokens,
         )

     response = requests.get(server.url_for("metrics"))
     print(response.text)
     assert response.status_code == HTTPStatus.OK

     # Loop over all expected metric_families
-    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (metric_family not in EXPECTED_METRICS_V1) or (
+    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
+    for metric_family, suffix_values_list in expected_values.items():
+        if metric_family not in EXPECTED_METRICS_V1 or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -217,6 +233,11 @@ async def test_metrics_counts(
     "vllm:request_decode_time_seconds_count",
 ]

+EXPECTED_METRICS_MM = [
+    "vllm:mm_cache_queries",
+    "vllm:mm_cache_hits",
+]
+
 HIDDEN_DEPRECATED_METRICS: list[str] = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
@@ -231,19 +252,43 @@ async def test_metrics_counts(
 async def test_metrics_exist(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+
     # sending a request triggers the metrics to be logged.
-    await client.completions.create(
-        model=MODEL_NAME,
-        prompt="Hello, my name is",
-        max_tokens=5,
-        temperature=0.0,
-    )
+    if model_key == "text":
+        await client.completions.create(
+            model=model_name,
+            prompt="Hello, my name is",
+            max_tokens=5,
+            temperature=0.0,
+        )
+    else:
+        await client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": _IMAGE_URL}},
+                        {"type": "text", "text": "What's in this image?"},
+                    ],
+                }
+            ],
+            max_tokens=5,
+            temperature=0.0,
+        )

     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS_V1:
+    expected_metrics = EXPECTED_METRICS_V1
+    if model_key == "multimodal":
+        # NOTE: Don't use in-place assignment
+        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
+
+    for metric in expected_metrics:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
@@ -253,9 +298,14 @@ async def test_metrics_exist(
 async def test_abort_metrics_reset(
     server: RemoteOpenAIServer,
     client: openai.AsyncClient,
+    model_key: str,
 ):
+    model_name = MODELS[model_key]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    prompt_ids = tokenizer.encode(_PROMPT)
+
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server
+        server,
     )

     # Expect no running requests or kvcache usage
@@ -268,8 +318,8 @@ async def test_abort_metrics_reset(
     for _ in range(3):
         task = asyncio.create_task(
             client.completions.create(
-                model=MODEL_NAME,
-                prompt=_TOKENIZED_PROMPT,
+                model=model_name,
+                prompt=prompt_ids,
                 max_tokens=100,  # Long generation to give time to abort
                 temperature=0.0,
             )
@@ -281,7 +331,7 @@ async def test_abort_metrics_reset(

     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server
+        server,
     )

     # Expect running requests and kvcache usage
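
For the server path covered by test_metrics_exist, the same counters are exposed on the Prometheus /metrics endpoint. A standalone spot check, assuming an OpenAI-compatible vLLM server is already running (the URL and port are illustrative):

# Sketch: confirm the new multi-modal cache metrics appear on a running
# server's /metrics endpoint. The base URL is an assumption for illustration.
import requests

resp = requests.get("http://localhost:8000/metrics", timeout=10)
resp.raise_for_status()

for name in ("vllm:mm_cache_queries", "vllm:mm_cache_hits"):
    # Substring check, mirroring the test; the exposition line may carry a
    # suffix (e.g. "_total") and labels.
    print(name, "present" if name in resp.text else "missing")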

tests/v1/core/test_kv_cache_utils.py

Lines changed: 3 additions & 4 deletions
@@ -20,7 +20,6 @@
     BlockHash,
     FreeKVCacheBlockQueue,
     KVCacheBlock,
-    PrefixCachingMetrics,
     estimate_max_model_len,
     generate_block_hash_extra_keys,
     generate_scheduler_kv_cache_config,
@@ -42,7 +41,7 @@
     SlidingWindowSpec,
     UniformTypeKVCacheSpecs,
 )
-from vllm.v1.metrics.stats import PrefixCacheStats
+from vllm.v1.metrics.stats import CachingMetrics, PrefixCacheStats
 from vllm.v1.request import Request

 pytestmark = pytest.mark.cpu_test
@@ -536,7 +535,7 @@ def test_metrics():
     """
     Test the prefix caching metrics.
     """
-    metrics = PrefixCachingMetrics(max_recent_requests=5)
+    metrics = CachingMetrics(max_recent_requests=5)
     assert metrics.hit_rate == 0.0

     metrics.observe(_stats(1, 20, 9))
@@ -568,7 +567,7 @@ def test_metrics_empty_stats():
     """
     Test the prefix caching metrics with empty stats.
     """
-    metrics = PrefixCachingMetrics(max_recent_requests=5)
+    metrics = CachingMetrics(max_recent_requests=5)
     metrics.observe(_stats(0, 0, 0))
     metrics.observe(_stats(1, 20, 9))
     metrics.observe(_stats(0, 0, 0))
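
The switch from PrefixCachingMetrics to CachingMetrics reflects that the same windowed hit-rate tracking now backs both the prefix cache and the multi-modal cache. As an illustration of the idea only (not vLLM's implementation), max_recent_requests plausibly bounds a sliding window along these lines:

# Sketch of the idea behind max_recent_requests: keep aggregate query/hit
# counts for roughly the most recent N requests and report their hit rate.
# This is an illustration, not the CachingMetrics implementation.
from collections import deque


class WindowedCacheMetrics:
    def __init__(self, max_recent_requests: int = 1000) -> None:
        self.max_recent_requests = max_recent_requests
        self._window: deque[tuple[int, int, int]] = deque()
        self._requests = self._queries = self._hits = 0

    def observe(self, requests: int, queries: int, hits: int) -> None:
        self._window.append((requests, queries, hits))
        self._requests += requests
        self._queries += queries
        self._hits += hits
        # Drop the oldest batches once the window covers too many requests.
        while self._requests > self.max_recent_requests and self._window:
            r, q, h = self._window.popleft()
            self._requests -= r
            self._queries -= q
            self._hits -= h

    @property
    def hit_rate(self) -> float:
        return self._hits / self._queries if self._queries else 0.0


# Loosely mirrors the usage in test_metrics() above; the _stats argument
# order is assumed here to be (requests, queries, hits).
m = WindowedCacheMetrics(max_recent_requests=5)
assert m.hit_rate == 0.0
m.observe(1, 20, 9)
assert m.hit_rate == 0.45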

tests/v1/distributed/test_async_llm_dp.py

Lines changed: 2 additions & 1 deletion
@@ -17,7 +17,7 @@
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import DPAsyncMPClient
 from vllm.v1.metrics.loggers import StatLoggerBase
-from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, MultiModalCacheStats, SchedulerStats

 DP_SIZE = int(os.getenv("DP_SIZE", 2))

@@ -93,6 +93,7 @@ def record(
         self,
         scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
+        mm_cache_stats: Optional[MultiModalCacheStats] = None,
         engine_idx: int = 0,
     ):
         if iteration_stats:
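
Downstream custom stat loggers need the same signature update as this test's logger: record() now receives an optional mm_cache_stats. A sketch of what such an override might look like after this commit (the class itself is illustrative; StatLoggerBase may declare further methods a real subclass must provide):

# Sketch: a custom stat logger accepting the new mm_cache_stats argument.
# Only the record() signature and the imported types come from the diff;
# everything else (class name, printing) is illustrative.
from typing import Optional

from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.stats import IterationStats, MultiModalCacheStats, SchedulerStats


class PrintingStatLogger(StatLoggerBase):
    def record(
        self,
        scheduler_stats: Optional[SchedulerStats],
        iteration_stats: Optional[IterationStats],
        mm_cache_stats: Optional[MultiModalCacheStats] = None,
        engine_idx: int = 0,
    ):
        if mm_cache_stats is not None:
            # Print the whole stats object rather than assuming its field names.
            print(f"[engine {engine_idx}] mm cache stats: {mm_cache_stats}")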

0 commit comments
