
Commit 55e385b

njhill and amd-xiaoyu12 authored and committed
[BugFix] Fix stuck stats/metrics after requests are aborted (vllm-project#22995)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Xiao Yu <xiao.yu@amd.com>
1 parent b4cfe83 commit 55e385b
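
For context, the stuck-metrics symptom this commit fixes can be observed against a running server by aborting an in-flight completion and then scraping /metrics. The sketch below is not part of the commit; it assumes an OpenAI-compatible vLLM server at http://localhost:8000 and uses a placeholder model name.

    # Reproduction sketch (hypothetical; not from the commit). Assumes a vLLM
    # OpenAI-compatible server at http://localhost:8000 serving MODEL.
    import asyncio
    import contextlib

    import openai
    import requests
    from prometheus_client.parser import text_string_to_metric_families

    MODEL = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model name


    async def main() -> None:
        client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                    api_key="EMPTY")

        # Start a long generation, then abort it from the client side.
        task = asyncio.create_task(
            client.completions.create(model=MODEL, prompt="Hello",
                                      max_tokens=512, temperature=0.0))
        await asyncio.sleep(0.5)
        task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await task
        await asyncio.sleep(1.0)

        # Scrape the Prometheus endpoint. Before this fix, these gauges could
        # remain stuck at their pre-abort values; after it they return to zero.
        text = requests.get("http://localhost:8000/metrics").text
        for family in text_string_to_metric_families(text):
            if family.name in ("vllm:num_requests_running",
                               "vllm:num_requests_waiting",
                               "vllm:gpu_cache_usage_perc"):
                for sample in family.samples:
                    print(sample.name, sample.value)


    if __name__ == "__main__":
        asyncio.run(main())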

File tree

3 files changed: +106 −5 lines changed

tests/entrypoints/openai/test_metrics.py
vllm/v1/core/block_pool.py
vllm/v1/core/sched/scheduler.py

tests/entrypoints/openai/test_metrics.py

Lines changed: 94 additions & 1 deletion
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+import asyncio
 import subprocess
 import sys
 import tempfile
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
         assert metric in response.text


+@pytest.mark.asyncio
+async def test_abort_metrics_reset(server: RemoteOpenAIServer,
+                                   client: openai.AsyncClient, use_v1: bool):
+
+    running_requests, waiting_requests, kv_cache_usage = (
+        _get_running_metrics_from_api(server))
+
+    # Expect no running requests or kvcache usage
+    assert running_requests == 0
+    assert waiting_requests == 0
+    assert kv_cache_usage == 0.0
+
+    # Start some long-running requests that we can abort
+    tasks = []
+    for _ in range(3):
+        task = asyncio.create_task(
+            client.completions.create(
+                model=MODEL_NAME,
+                prompt=_TOKENIZED_PROMPT,
+                max_tokens=100,  # Long generation to give time to abort
+                temperature=0.0))
+        tasks.append(task)
+
+    # Wait a bit for requests to start processing
+    await asyncio.sleep(0.5)
+
+    # Check that we have running requests
+    running_requests, waiting_requests, kv_cache_usage = (
+        _get_running_metrics_from_api(server))
+
+    # Expect running requests and kvcache usage
+    assert running_requests > 0
+    assert kv_cache_usage > 0
+
+    # Cancel all tasks to abort the requests
+    for task in tasks:
+        task.cancel()
+
+    # Wait for cancellations to be processed
+    await asyncio.sleep(1.0)
+
+    # Check that metrics have reset to zero
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+
+    # Verify running and waiting requests counts and KV cache usage are zero
+    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
+        _get_running_metrics_from_api(server))
+
+    assert running_requests_after == 0, \
+        (f"Expected 0 running requests after abort, got "
+         f"{running_requests_after}")
+    assert waiting_requests_after == 0, \
+        (f"Expected 0 waiting requests after abort, got "
+         f"{waiting_requests_after}")
+    assert kv_cache_usage_after == 0, \
+        (f"Expected 0% KV cache usage after abort, got "
+         f"{kv_cache_usage_after}")
+
+
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
+    """Return (running_count, waiting_count, kv_cache_usage)"""
+
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+
+    # Extract the running/waiting request counts and KV cache usage gauges.
+    running_requests, waiting_requests, kv_cache_usage = None, None, None
+
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:num_requests_running":
+            for sample in family.samples:
+                if sample.name == "vllm:num_requests_running":
+                    running_requests = sample.value
+                    break
+        elif family.name == "vllm:num_requests_waiting":
+            for sample in family.samples:
+                if sample.name == "vllm:num_requests_waiting":
+                    waiting_requests = sample.value
+                    break
+        elif family.name == "vllm:gpu_cache_usage_perc":
+            for sample in family.samples:
+                if sample.name == "vllm:gpu_cache_usage_perc":
+                    kv_cache_usage = sample.value
+                    break
+
+    assert running_requests is not None
+    assert waiting_requests is not None
+    assert kv_cache_usage is not None
+
+    return running_requests, waiting_requests, kv_cache_usage
+
+
 def test_metrics_exist_run_batch(use_v1: bool):
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
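
To exercise just the new test locally, an invocation along these lines should work (the exact options and required hardware/fixtures may vary with your environment):

    pytest tests/entrypoints/openai/test_metrics.py -k test_abort_metrics_reset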

vllm/v1/core/block_pool.py

Lines changed: 6 additions & 1 deletion
@@ -298,7 +298,12 @@ def get_usage(self) -> float:
         Returns:
             The KV cache usage (between 0.0 and 1.0).
         """
-        return 1.0 - (self.get_num_free_blocks() / self.num_gpu_blocks)
+
+        # Subtract 1 to account for null block.
+        total_gpu_blocks = self.num_gpu_blocks - 1
+        if not total_gpu_blocks:
+            return 0
+        return 1.0 - (self.get_num_free_blocks() / total_gpu_blocks)

     def take_events(self) -> list[KVCacheEvent]:
         """Atomically takes all events and clears the queue.

vllm/v1/core/sched/scheduler.py

Lines changed: 6 additions & 3 deletions
@@ -902,10 +902,13 @@ def update_from_output(
                         finished_requests=finished_set)
             finished_req_ids.clear()

-        if engine_core_outputs:
+        if (stats := self.make_stats(spec_decoding_stats)) is not None:
             # Return stats to only one of the front-ends.
-            next(iter(engine_core_outputs.values())).scheduler_stats = (
-                self.make_stats(spec_decoding_stats))
+            if (eco := next(iter(engine_core_outputs.values()), None)) is None:
+                # We must return the stats even if there are no request
+                # outputs this step.
+                engine_core_outputs[0] = eco = EngineCoreOutputs()
+            eco.scheduler_stats = stats

         return engine_core_outputs
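
The scheduler change is the core of the fix: stats used to be attached only when the step produced at least one EngineCoreOutputs entry, so a step that merely aborted requests returned nothing and the frontend never saw the reset gauges. The new code builds the stats first and, when there is no output object to hang them on, creates an empty one for client index 0. Below is a minimal stand-in for that pattern; the Outputs dataclass is a simplified placeholder, not vLLM's real EngineCoreOutputs.

    from dataclasses import dataclass, field
    from typing import Any, Optional


    @dataclass
    class Outputs:
        """Simplified placeholder for EngineCoreOutputs (illustration only)."""
        outputs: list = field(default_factory=list)
        scheduler_stats: Optional[Any] = None


    def attach_stats(engine_core_outputs: dict, stats: Any) -> None:
        if stats is None:
            return
        # Attach stats to one of the per-client output objects. If this step
        # produced no request outputs at all (e.g. everything was aborted),
        # create an empty Outputs so the stats are still delivered.
        if (eco := next(iter(engine_core_outputs.values()), None)) is None:
            engine_core_outputs[0] = eco = Outputs()
        eco.scheduler_stats = stats


    outs: dict = {}
    attach_stats(outs, stats={"num_running": 0, "kv_cache_usage": 0.0})
    print(outs[0].scheduler_stats)  # {'num_running': 0, 'kv_cache_usage': 0.0}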
