|
1 | 1 | # SPDX-License-Identifier: Apache-2.0 |
2 | 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
3 | | - |
| 3 | +import asyncio |
4 | 4 | import subprocess |
5 | 5 | import sys |
6 | 6 | import tempfile |
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, |
294 | 294 | assert metric in response.text |
295 | 295 |
|
296 | 296 |
|
@pytest.mark.asyncio
async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                   client: openai.AsyncClient, use_v1: bool):
    """Verify scheduler/KV-cache gauges return to zero after in-flight
    requests are aborted via client-side cancellation.

    Flow: assert a clean baseline, launch several long generations,
    confirm the running/KV-cache gauges rise, cancel everything, then
    confirm all gauges drop back to zero.
    """
    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))

    # Baseline: no running or waiting requests, no KV cache usage.
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0

    # Start some long-running requests that we can abort.
    tasks = [
        asyncio.create_task(
            client.completions.create(
                model=MODEL_NAME,
                prompt=_TOKENIZED_PROMPT,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0)) for _ in range(3)
    ]

    # Give the server a moment to begin scheduling the requests.
    await asyncio.sleep(0.5)

    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))

    # With requests in flight, the gauges must be non-zero.
    assert running_requests > 0
    assert kv_cache_usage > 0

    # Cancel all tasks to abort the requests.
    for task in tasks:
        task.cancel()
    # Await the cancelled tasks so their CancelledError is consumed;
    # otherwise asyncio warns about never-retrieved task exceptions.
    await asyncio.gather(*tasks, return_exceptions=True)

    # Wait for the server to process the aborts.
    await asyncio.sleep(1.0)

    # Verify running/waiting request counts and KV cache usage are zero.
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server))

    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got "
        f"{running_requests_after}")
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got "
        f"{waiting_requests_after}")
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got "
        f"{kv_cache_usage_after}")
| 356 | + |
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Scrape the server's /metrics endpoint and return the tuple
    (running_count, waiting_count, kv_cache_usage).

    Asserts the HTTP scrape succeeds and that all three gauges are
    present in the exposition output.
    """
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    # Gauge name -> extracted value; filled in while walking the scrape.
    wanted = {
        "vllm:num_requests_running": None,
        "vllm:num_requests_waiting": None,
        "vllm:gpu_cache_usage_perc": None,
    }

    for family in text_string_to_metric_families(response.text):
        if family.name in wanted:
            for sample in family.samples:
                # Match the plain gauge sample (skip _created etc.).
                if sample.name == family.name:
                    wanted[family.name] = sample.value
                    break

    for name, value in wanted.items():
        assert value is not None, f"Metric {name} not found in scrape"

    return (wanted["vllm:num_requests_running"],
            wanted["vllm:num_requests_waiting"],
            wanted["vllm:gpu_cache_usage_perc"])
| 388 | + |
| 389 | + |
297 | 390 | def test_metrics_exist_run_batch(use_v1: bool): |
298 | 391 | input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 |
299 | 392 |
|
|
0 commit comments