
Commit bbaf8e9

dumb0002 authored and ilmarkov committed
[Core] Exposing engine sleep & wake_up state as prometheus metrics (vllm-project#24176)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
1 parent 23823ef commit bbaf8e9
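As a quick illustration of the behaviour this commit adds, here is a hedged sketch (not part of the diff) of reading the new vllm:engine_sleep_state gauge from a running server. The base URL and the assumption that the server was launched with VLLM_SERVER_DEV_MODE=1 (the gauge is only registered in dev mode, per the loggers.py hunk below) are mine, not from the commit.

import requests
from prometheus_client.parser import text_string_to_metric_families

BASE = "http://localhost:8000"  # assumed address of a local vLLM OpenAI server


def sleep_state() -> dict[str, float]:
    """Return the current value of each vllm:engine_sleep_state series."""
    text = requests.get(f"{BASE}/metrics").text
    state: dict[str, float] = {}
    for family in text_string_to_metric_families(text):
        if family.name == "vllm:engine_sleep_state":
            for sample in family.samples:
                # each series is distinguished by its sleep_state label
                state[sample.labels.get("sleep_state", "")] = sample.value
    return state


requests.post(f"{BASE}/sleep", params={"level": "1"})
print(sleep_state())  # expected: awake=0.0, weights_offloaded=1.0, discard_all=0.0
requests.post(f"{BASE}/wake_up")
print(sleep_state())  # expected: awake=1.0, weights_offloaded=0.0, discard_all=0.0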

File tree

tests/entrypoints/openai/test_sleep.py
vllm/v1/engine/async_llm.py
vllm/v1/engine/llm_engine.py
vllm/v1/metrics/loggers.py

4 files changed, +114 -1 lines changed

tests/entrypoints/openai/test_sleep.py

Lines changed: 49 additions & 0 deletions

@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import requests
+from prometheus_client.parser import text_string_to_metric_families

 from ...utils import RemoteOpenAIServer

@@ -31,12 +32,28 @@ def test_sleep_mode():
     assert response.status_code == 200
     assert response.json().get("is_sleeping") is True

+    # check sleep metrics
+    response = requests.get(remote_server.url_for("metrics"))
+    assert response.status_code == 200
+    awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+    assert awake == 0
+    assert weights_offloaded == 1
+    assert discard_all == 0
+
     response = requests.post(remote_server.url_for("wake_up"))
     assert response.status_code == 200
     response = requests.get(remote_server.url_for("is_sleeping"))
     assert response.status_code == 200
     assert response.json().get("is_sleeping") is False

+    # check sleep metrics
+    response = requests.get(remote_server.url_for("metrics"))
+    assert response.status_code == 200
+    awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+    assert awake == 1
+    assert weights_offloaded == 0
+    assert discard_all == 0
+
     # test wake up with tags
     response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
     assert response.status_code == 200
@@ -59,3 +76,35 @@ def test_sleep_mode():
     response = requests.get(remote_server.url_for("is_sleeping"))
     assert response.status_code == 200
     assert response.json().get("is_sleeping") is False
+
+    # check sleep metrics
+    response = requests.get(remote_server.url_for("metrics"))
+    assert response.status_code == 200
+    awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+    assert awake == 1
+    assert weights_offloaded == 0
+    assert discard_all == 0
+
+
+def _get_sleep_metrics_from_api(response: requests.Response):
+    """Return (awake, weights_offloaded, discard_all)"""
+
+    awake, weights_offloaded, discard_all = None, None, None
+
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:engine_sleep_state":
+            for sample in family.samples:
+                if sample.name == "vllm:engine_sleep_state":
+                    for label_name, label_value in sample.labels.items():
+                        if label_value == "awake":
+                            awake = sample.value
+                        elif label_value == "weights_offloaded":
+                            weights_offloaded = sample.value
+                        elif label_value == "discard_all":
+                            discard_all = sample.value
+
+    assert awake is not None
+    assert weights_offloaded is not None
+    assert discard_all is not None
+
+    return awake, weights_offloaded, discard_all
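For reference, the helper above walks the Prometheus text exposition that the /metrics endpoint returns. The following standalone snippet (illustrative only; the engine index and model_name label values are assumptions, not taken from the commit) feeds a hand-written exposition fragment through the same parser to show the shape of the samples being matched.

from prometheus_client.parser import text_string_to_metric_families

# Hand-written exposition fragment in the shape the helper parses.
SAMPLE = """\
# TYPE vllm:engine_sleep_state gauge
vllm:engine_sleep_state{engine="0",model_name="my-model",sleep_state="awake"} 0.0
vllm:engine_sleep_state{engine="0",model_name="my-model",sleep_state="weights_offloaded"} 1.0
vllm:engine_sleep_state{engine="0",model_name="my-model",sleep_state="discard_all"} 0.0
"""

for family in text_string_to_metric_families(SAMPLE):
    for sample in family.samples:
        print(sample.labels["sleep_state"], sample.value)
# awake 0.0
# weights_offloaded 1.0
# discard_all 0.0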

vllm/v1/engine/async_llm.py

Lines changed: 6 additions & 0 deletions

@@ -689,9 +689,15 @@ async def sleep(self, level: int = 1) -> None:
         await self.reset_prefix_cache()
         await self.engine_core.sleep_async(level)

+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
     async def wake_up(self, tags: list[str] | None = None) -> None:
         await self.engine_core.wake_up_async(tags)

+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     async def is_sleeping(self) -> bool:
         return await self.engine_core.is_sleeping_async()

vllm/v1/engine/llm_engine.py

Lines changed: 6 additions & 0 deletions

@@ -332,9 +332,15 @@ def reset_prefix_cache(self, device: Device | None = None):
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)

+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
     def wake_up(self, tags: list[str] | None = None):
         self.engine_core.wake_up(tags)

+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     def is_sleeping(self) -> bool:
         return self.engine_core.is_sleeping()
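The same hooks also sit behind the offline API: LLM.sleep() and LLM.wake_up() reach the LLMEngine methods patched above. A hedged usage sketch follows; the model choice is arbitrary, and note that the Prometheus gauge itself is only registered when VLLM_SERVER_DEV_MODE is set.

from vllm import LLM

# Sleep mode must be enabled at construction time.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)

llm.sleep(level=1)  # offloads weights; the engine records sleep=1, level=1
llm.wake_up()       # restores weights; the engine records sleep=0, level=0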

vllm/v1/metrics/loggers.py

Lines changed: 53 additions & 1 deletion

@@ -9,6 +9,7 @@

 from prometheus_client import Counter, Gauge, Histogram

+import vllm.envs as envs
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging
 from vllm.logger import init_logger
@@ -56,6 +57,9 @@ def log_engine_initialized(self): ...
     def log(self):  # noqa
         pass

+    def record_sleep_state(self, is_awake: int, level: int):  # noqa
+        pass
+

 def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
     factories: list[StatLoggerFactory] = []
@@ -384,8 +388,33 @@ def __init__(
         self.gauge_scheduler_waiting = make_per_engine(
             gauge_scheduler_waiting, engine_indexes, model_name
         )
+        if envs.VLLM_SERVER_DEV_MODE:
+            gauge_engine_sleep_state = self._gauge_cls(
+                name="vllm:engine_sleep_state",
+                documentation=(
+                    "Engine sleep state; awake = 0 means engine is sleeping; "
+                    "awake = 1 means engine is awake; "
+                    "weights_offloaded = 1 means sleep level 1; "
+                    "discard_all = 1 means sleep level 2."
+                ),
+                labelnames=labelnames + ["sleep_state"],
+                multiprocess_mode="mostrecent",
+            )
+
+            self.gauge_engine_sleep_state = {}
+            sleep_state = ["awake", "weights_offloaded", "discard_all"]
+
+            for s in sleep_state:
+                self.gauge_engine_sleep_state[s] = {
+                    idx: gauge_engine_sleep_state.labels(
+                        engine=idx, model_name=model_name, sleep_state=s
+                    )
+                    for idx in engine_indexes
+                }
+
+            # Setting default values
+            self.record_sleep_state()

-        #
         # GPU cache
         #
         # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
@@ -1010,6 +1039,25 @@ def record(
         }
         self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()

+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        awake = 1
+        discard_all = 0
+        weights_offloaded = 0
+
+        if sleep == 1:
+            awake = 0
+            if level == 1:
+                weights_offloaded = 1
+            elif level == 2:
+                discard_all = 1
+
+        for engine_idx in self.engine_indexes:
+            self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all)
+            self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set(
+                weights_offloaded
+            )
+            self.gauge_engine_sleep_state["awake"][engine_idx].set(awake)
+
     def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)

@@ -1131,6 +1179,10 @@ def record(
                 engine_idx=engine_idx,
             )

+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        for logger in self.stat_loggers:
+            logger.record_sleep_state(sleep, level)
+
     def log(self):
         for logger in self.stat_loggers:
             logger.log()
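To make the state-to-gauge mapping of record_sleep_state() easy to eyeball, here is a minimal standalone restatement of its logic (a sketch mirroring the hunk above, not the vLLM code itself):

def expected_sleep_gauges(sleep: int = 0, level: int = 0) -> dict[str, int]:
    """Values the three vllm:engine_sleep_state series take for a given call."""
    awake, weights_offloaded, discard_all = 1, 0, 0
    if sleep == 1:
        awake = 0
        if level == 1:
            weights_offloaded = 1
        elif level == 2:
            discard_all = 1
    return {
        "awake": awake,
        "weights_offloaded": weights_offloaded,
        "discard_all": discard_all,
    }

assert expected_sleep_gauges() == {"awake": 1, "weights_offloaded": 0, "discard_all": 0}
assert expected_sleep_gauges(1, 1) == {"awake": 0, "weights_offloaded": 1, "discard_all": 0}
assert expected_sleep_gauges(1, 2) == {"awake": 0, "weights_offloaded": 0, "discard_all": 1}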
