Merged

52 commits
db606d9
[Metrics] Log multi-modal cache stats
DarkLight1337 Oct 6, 2025
425cc85
Format
DarkLight1337 Oct 6, 2025
1f7ef40
Clean up
DarkLight1337 Oct 6, 2025
2ae0ea1
Actually record the stats
DarkLight1337 Oct 6, 2025
51fdb0e
Fix missing fields
DarkLight1337 Oct 6, 2025
346f61c
Rename
DarkLight1337 Oct 6, 2025
7b4edeb
Fix
DarkLight1337 Oct 6, 2025
4fd7b9c
Log once
DarkLight1337 Oct 7, 2025
294f817
Merge branch 'main' into log-mm-cache-stats-new
DarkLight1337 Oct 8, 2025
932653f
Add test
DarkLight1337 Oct 8, 2025
1fe3ce8
Merge branch 'main' into log-mm-cache-stats-new
DarkLight1337 Oct 8, 2025
3b6c535
Add FIXME
DarkLight1337 Oct 8, 2025
4129938
[Bugfix] Fix `reset_mm_cache` not working for SHM impl
DarkLight1337 Oct 8, 2025
387b282
Back-compatibility
DarkLight1337 Oct 8, 2025
ed26d2f
Merge branch 'fix-mm-reset' into log-mm-cache-stats-new
DarkLight1337 Oct 8, 2025
a7f5197
Get cache stats from P1 instead of P0 to pass the test
DarkLight1337 Oct 8, 2025
1e2e572
[Bugfix] Fix duplicate SHM cache initialization
DarkLight1337 Oct 8, 2025
26a0493
Fix
DarkLight1337 Oct 8, 2025
019bbcd
Address comment
DarkLight1337 Oct 8, 2025
8cf4393
Fix initialization
DarkLight1337 Oct 8, 2025
074ddc8
Fix
DarkLight1337 Oct 8, 2025
aa54cd6
Update
DarkLight1337 Oct 8, 2025
64c6c5e
Address comment
DarkLight1337 Oct 8, 2025
344bffd
Simplify
DarkLight1337 Oct 8, 2025
6856a8b
mypy
DarkLight1337 Oct 8, 2025
0cf7202
Remove unnecessary attrs
DarkLight1337 Oct 8, 2025
cf30c2d
Improve error msg
DarkLight1337 Oct 8, 2025
bdefdc4
Simplify
DarkLight1337 Oct 8, 2025
a927abf
Fix
DarkLight1337 Oct 8, 2025
6053b4d
Fix
DarkLight1337 Oct 8, 2025
fe02be8
mypy
DarkLight1337 Oct 8, 2025
1b12039
Merge branch 'main' into fix-shm-init
DarkLight1337 Oct 8, 2025
99460f6
mypy
DarkLight1337 Oct 8, 2025
cc08c65
Fix
DarkLight1337 Oct 8, 2025
16cc17c
Merge branch 'fix-shm-init' into log-mm-cache-stats-new
DarkLight1337 Oct 8, 2025
16cb36b
Address comment
DarkLight1337 Oct 8, 2025
70cf395
Fix request count
DarkLight1337 Oct 8, 2025
c4d8f4a
Fix default
DarkLight1337 Oct 8, 2025
3702944
Avoid hang
DarkLight1337 Oct 9, 2025
4ec44c6
Fix mocks
DarkLight1337 Oct 9, 2025
91cd61c
Merge branch 'fix-shm-init' into log-mm-cache-stats-new
DarkLight1337 Oct 9, 2025
5809b9d
Test passes
DarkLight1337 Oct 9, 2025
f7beaeb
Fix request count
DarkLight1337 Oct 9, 2025
610625d
Merge branch 'main' into log-mm-cache-stats-new
DarkLight1337 Oct 9, 2025
4a76fcf
Unused
DarkLight1337 Oct 9, 2025
11cf5f9
Unnecessary param
DarkLight1337 Oct 9, 2025
14bf073
Clear cache inside `MultiModalBudget`
DarkLight1337 Oct 9, 2025
c2e95a7
Fix test
DarkLight1337 Oct 9, 2025
0bbc9dd
Merge branch 'main' into log-mm-cache-stats-new
DarkLight1337 Oct 9, 2025
feb5831
Fix
DarkLight1337 Oct 10, 2025
120bbb5
Reduce diff
DarkLight1337 Oct 10, 2025
edb2248
Fix GC
DarkLight1337 Oct 10, 2025
74 changes: 74 additions & 0 deletions tests/entrypoints/llm/test_mm_cache_stats.py
@@ -0,0 +1,74 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics.reader import Counter, Metric

from ..openai.test_vision import TEST_IMAGE_ASSETS


def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
return [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]


def _get_counter_value(metrics: list[Metric], name: str):
metric = next(m for m in metrics if m.name == name)
assert isinstance(metric, Counter)
return metric.value


def _get_mm_cache_stats(metrics: list[Metric]):
mm_cache_queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
mm_cache_hits = _get_counter_value(metrics, "vllm:mm_cache_hits")

return mm_cache_queries, mm_cache_hits


@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
num_gpus_available,
image_urls,
mm_processor_cache_type,
):
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
mm_processor_cache_type=mm_processor_cache_type,
disable_log_stats=False,
limit_mm_per_prompt={"image": 2},
)

llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)

llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)

llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)

# NOTE: This only resets hit rate stats in CachingMetrics
# The raw queries and hits counts remain unaffected
llm.reset_mm_cache()

llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)

llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
180 changes: 115 additions & 65 deletions tests/entrypoints/openai/test_metrics.py
@@ -18,10 +18,18 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
PREV_MINOR_VERSION = version._prev_minor_version()


@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
yield request.param


@pytest.fixture(scope="module")
def default_server_args():
return [
@@ -45,11 +53,12 @@ def default_server_args():
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
def server(default_server_args, request):
def server(model_key, default_server_args, request):
if request.param:
default_server_args.append(request.param)

with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
model_name = MODELS[model_key]
with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
yield remote_server


@@ -60,73 +69,80 @@ async def client(server):


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds": [
("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens": [
("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:request_generation_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:iteration_tokens_total": [
(
"_sum",
_NUM_REQUESTS
* (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
],
"vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
num_prompt_tokens = len(prompt_ids)

# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [
("_count", num_requests * (max_tokens - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
"vllm:request_queue_time_seconds": [("_count", num_requests)],
"vllm:request_inference_time_seconds": [("_count", num_requests)],
"vllm:request_prefill_time_seconds": [("_count", num_requests)],
"vllm:request_decode_time_seconds": [("_count", num_requests)],
"vllm:request_prompt_tokens": [
("_sum", num_requests * num_prompt_tokens),
("_count", num_requests),
],
"vllm:request_generation_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:request_params_n": [("_count", num_requests)],
"vllm:request_params_max_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:iteration_tokens_total": [
(
"_sum",
num_requests * (num_prompt_tokens + max_tokens),
),
("_count", num_requests * max_tokens),
],
"vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
"vllm:generation_tokens": [("_total", num_requests * max_tokens)],
"vllm:request_success": [("_total", num_requests)],
}


@pytest.mark.asyncio
async def test_metrics_counts(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
for _ in range(_NUM_REQUESTS):
if model_key == "multimodal":
pytest.skip("Unnecessary test")

model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)
num_requests = 10
max_tokens = 10

for _ in range(num_requests):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
model=model_name,
prompt=prompt_ids,
max_tokens=max_tokens,
)

response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK

# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if (metric_family not in EXPECTED_METRICS_V1) or (
expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
for metric_family, suffix_values_list in expected_values.items():
if metric_family not in EXPECTED_METRICS_V1 or (
not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS
):
@@ -217,6 +233,11 @@ async def test_metrics_counts(
"vllm:request_decode_time_seconds_count",
]

EXPECTED_METRICS_MM = [
"vllm:mm_cache_queries",
"vllm:mm_cache_hits",
]

HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
@@ -231,19 +252,43 @@
async def test_metrics_exist(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
model_name = MODELS[model_key]

# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
if model_key == "text":
await client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
else:
await client.chat.completions.create(
model=model_name,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": _IMAGE_URL}},
{"type": "text", "text": "What's in this image?"},
],
}
],
max_tokens=5,
temperature=0.0,
)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

for metric in EXPECTED_METRICS_V1:
expected_metrics = EXPECTED_METRICS_V1
if model_key == "multimodal":
# NOTE: Don't use in-place assignment
expected_metrics = expected_metrics + EXPECTED_METRICS_MM

for metric in expected_metrics:
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
continue
assert metric in response.text
@@ -253,9 +298,14 @@ async def test_metrics_exist(
async def test_abort_metrics_reset(
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)

running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server
server,
)

# Expect no running requests or kvcache usage
@@ -268,8 +318,8 @@
for _ in range(3):
task = asyncio.create_task(
client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
model=model_name,
prompt=prompt_ids,
max_tokens=100, # Long generation to give time to abort
temperature=0.0,
)
@@ -281,7 +331,7 @@

# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server
server,
)

# Expect running requests and kvcache usage
7 changes: 3 additions & 4 deletions tests/v1/core/test_kv_cache_utils.py
@@ -20,7 +20,6 @@
BlockHash,
FreeKVCacheBlockQueue,
KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
generate_scheduler_kv_cache_config,
@@ -42,7 +41,7 @@
SlidingWindowSpec,
UniformTypeKVCacheSpecs,
)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.metrics.stats import CachingMetrics, PrefixCacheStats
from vllm.v1.request import Request

pytestmark = pytest.mark.cpu_test
@@ -536,7 +535,7 @@ def test_metrics():
"""
Test the prefix caching metrics.
"""
metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics = CachingMetrics(max_recent_requests=5)
assert metrics.hit_rate == 0.0

metrics.observe(_stats(1, 20, 9))
@@ -568,7 +567,7 @@ def test_metrics_empty_stats():
"""
Test the prefix caching metrics with empty stats.
"""
metrics = PrefixCachingMetrics(max_recent_requests=5)
metrics = CachingMetrics(max_recent_requests=5)
metrics.observe(_stats(0, 0, 0))
metrics.observe(_stats(1, 20, 9))
metrics.observe(_stats(0, 0, 0))
3 changes: 2 additions & 1 deletion tests/v1/distributed/test_async_llm_dp.py
@@ -17,7 +17,7 @@
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient
from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
from vllm.v1.metrics.stats import IterationStats, MultiModalCacheStats, SchedulerStats

DP_SIZE = int(os.getenv("DP_SIZE", 2))

@@ -93,6 +93,7 @@ def record(
self,
scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats],
mm_cache_stats: Optional[MultiModalCacheStats] = None,
engine_idx: int = 0,
):
if iteration_stats: