
Commit 9bb10a7

coolkp and Kunjan Patel authored
[MISC] Add lora requests to metrics (#9477)
Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>
1 parent 3921a2f · commit 9bb10a7

File tree

3 files changed: +54 −2 lines changed
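For context before the diffs: the new vllm:lora_requests_info gauge carries its payload in label values, and the sample value is always 1. After this change, a scrape of the server's /metrics endpoint would include a line shaped like the following (adapter names here are illustrative, not taken from the commit):

    vllm:lora_requests_info{max_lora="4",running_lora_adapters="sql-lora,chat-lora",waiting_lora_adapters=""} 1.0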

vllm/engine/llm_engine.py

Lines changed: 23 additions & 1 deletion
@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -1617,6 +1618,25 @@ def _get_stats(self,
         n_requests: List[int] = []
         finished_reason_requests: List[str] = []
 
+        # Lora requests
+        running_lora_adapters = dict(
+            collectionsCounter([
+                running_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for running_request in scheduler.running
+                if running_request.lora_request
+            ]))
+        waiting_lora_adapters = dict(
+            collectionsCounter([
+                waiting_request.lora_request.lora_name
+                for scheduler in self.scheduler
+                for waiting_request in scheduler.waiting
+                if waiting_request.lora_request
+            ]))
+        max_lora_stat = "0"
+        if self.lora_config:
+            max_lora_stat = str(self.lora_config.max_loras)
+
         # NOTE: This loop assumes prefill seq_groups are before
         # decode seq_groups in scheduled_seq_groups.
         if scheduler_outputs is not None:
@@ -1738,7 +1758,9 @@ def _get_stats(self,
             num_generation_tokens_requests=num_generation_tokens_requests,
             n_requests=n_requests,
             finished_reason_requests=finished_reason_requests,
-        )
+            max_lora=str(max_lora_stat),
+            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
+            running_lora_adapters=list(running_lora_adapters.keys()))
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
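The aggregation added above counts running and waiting requests per adapter name across all schedulers, though only the keys survive: the counts are discarded by list(...keys()) when building the Stats object. A minimal stand-alone sketch of the same pattern, using hypothetical stand-ins for vLLM's internal request and scheduler types:

from collections import Counter
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class LoRARequest:  # hypothetical stand-in for vLLM's LoRARequest
    lora_name: str

@dataclass
class Request:  # hypothetical stand-in for a scheduled request
    lora_request: Optional[LoRARequest] = None

@dataclass
class Scheduler:  # hypothetical stand-in exposing a .running queue
    running: List[Request] = field(default_factory=list)

def count_running_adapters(schedulers: List[Scheduler]) -> Dict[str, int]:
    # Count running requests per adapter name across all schedulers;
    # requests without a LoRA adapter are skipped by the `if` guard.
    return dict(Counter(
        req.lora_request.lora_name
        for s in schedulers
        for req in s.running
        if req.lora_request
    ))

scheds = [Scheduler(running=[Request(LoRARequest("sql-lora")),
                             Request(LoRARequest("sql-lora")),
                             Request()])]
print(count_running_adapters(scheds))  # {'sql-lora': 2}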

vllm/engine/metrics.py

Lines changed: 28 additions & 1 deletion
@@ -34,7 +34,11 @@ class Metrics:
     See https://prometheus.github.io/client_python/multiprocess/ for more
     details on limitations.
     """
+
     labelname_finish_reason = "finished_reason"
+    labelname_waiting_lora_adapters = "waiting_lora_adapters"
+    labelname_running_lora_adapters = "running_lora_adapters"
+    labelname_max_lora = "max_lora"
     _gauge_cls = prometheus_client.Gauge
     _counter_cls = prometheus_client.Counter
     _histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_info = self._gauge_cls(
+            name="vllm:lora_requests_info",
+            documentation="Running stats on lora requests.",
+            labelnames=[
+                self.labelname_running_lora_adapters,
+                self.labelname_max_lora,
+                self.labelname_waiting_lora_adapters,
+            ],
+            multiprocess_mode="livemostrecent",
+        )
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
             documentation="Number of requests swapped to CPU.",
@@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
 
+    def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
+        gauge.labels(**data).set(1)
+
     def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
-
+        # Including max-lora in metric, in future this property of lora
+        # config maybe extended to be dynamic.
+        lora_info = {
+            self.metrics.labelname_running_lora_adapters:
+            ",".join(stats.running_lora_adapters),
+            self.metrics.labelname_waiting_lora_adapters:
+            ",".join(stats.waiting_lora_adapters),
+            self.metrics.labelname_max_lora:
+            stats.max_lora,
+        }
+        self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
         # Iteration level data
         self._log_counter(self.metrics.counter_num_preemption,
                           stats.num_preemption_iter)
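The _log_gauge_string helper implements what Prometheus calls an info-style metric: the gauge's value is pinned to 1 and the data travels in the label values. A minimal self-contained sketch of the same pattern follows; the metric name and adapter values are illustrative, and multiprocess_mode is omitted since it only takes effect under multiprocess collection:

import prometheus_client

# Illustrative info-style gauge; the commit's real metric is
# "vllm:lora_requests_info" with these same three labels.
gauge = prometheus_client.Gauge(
    "lora_requests_info_demo",
    "Running stats on lora requests.",
    labelnames=["running_lora_adapters", "max_lora",
                "waiting_lora_adapters"],
)

def log_gauge_string(g: prometheus_client.Gauge, data: dict) -> None:
    # Each distinct label combination becomes its own time series,
    # always set to 1; consumers read the labels, not the value.
    g.labels(**data).set(1)

log_gauge_string(gauge, {
    "running_lora_adapters": "sql-lora,chat-lora",  # comma-joined names
    "max_lora": "4",
    "waiting_lora_adapters": "",
})
print(prometheus_client.generate_latest(gauge).decode())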

vllm/engine/metrics_types.py

Lines changed: 3 additions & 0 deletions
@@ -51,6 +51,9 @@ class Stats:
     num_generation_tokens_requests: List[int]
     n_requests: List[int]
     finished_reason_requests: List[str]
+    waiting_lora_adapters: List[str]
+    running_lora_adapters: List[str]
+    max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None