24 changes: 23 additions & 1 deletion vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
import time
from collections import Counter as collectionsCounter
from collections import deque
from contextlib import contextmanager
from dataclasses import dataclass
@@ -1617,6 +1618,25 @@ def _get_stats(self,
n_requests: List[int] = []
finished_reason_requests: List[str] = []

# Lora requests
running_lora_adapters = dict(
collectionsCounter([
running_request.lora_request.lora_name
for scheduler in self.scheduler
for running_request in scheduler.running
if running_request.lora_request
]))
waiting_lora_adapters = dict(
collectionsCounter([
waiting_request.lora_request.lora_name
for scheduler in self.scheduler
for waiting_request in scheduler.waiting
if waiting_request.lora_request
]))
max_lora_stat = "0"
if self.lora_config:
max_lora_stat = str(self.lora_config.max_loras)
Comment on lines +1636 to +1638
Collaborator

This seems to always be fixed? In that case, can we avoid dumping this value?

Contributor Author

Across multiple deployments it's hard to get this value, and it helps determine how many LoRAs can be fitted on the server. You are right, it's definitely static right now: initialised at runtime and that's it. I considered moving it to a separate info metric like the cache config info, but I think in future there may be value in enabling dynamic adjustment of max lora, like base_model which is static right now.


# NOTE: This loop assumes prefill seq_groups are before
# decode seq_groups in scheduled_seq_groups.
if scheduler_outputs is not None:
@@ -1738,7 +1758,9 @@ def _get_stats(self,
num_generation_tokens_requests=num_generation_tokens_requests,
n_requests=n_requests,
finished_reason_requests=finished_reason_requests,
)
max_lora=str(max_lora_stat),
waiting_lora_adapters=list(waiting_lora_adapters.keys()),
running_lora_adapters=list(running_lora_adapters.keys()))

def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_executor.add_lora(lora_request)
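
As a standalone illustration of the Counter-based aggregation in the hunk above, here is a minimal sketch; the SimpleNamespace request/scheduler stand-ins are hypothetical placeholders, not vLLM classes:

```python
from collections import Counter
from types import SimpleNamespace

# Hypothetical stand-ins for scheduler queues; in the engine these come from
# scheduler.running / scheduler.waiting, and each request may carry a LoRARequest.
running_queue = [
    SimpleNamespace(lora_request=SimpleNamespace(lora_name="sql-lora")),
    SimpleNamespace(lora_request=SimpleNamespace(lora_name="sql-lora")),
    SimpleNamespace(lora_request=None),  # base-model request, no adapter
]
schedulers = [SimpleNamespace(running=running_queue, waiting=[])]

# Same shape as the diff: count requests per adapter name across all schedulers,
# skipping requests that do not use a LoRA adapter.
running_lora_adapters = dict(
    Counter(request.lora_request.lora_name
            for scheduler in schedulers
            for request in scheduler.running
            if request.lora_request))

print(running_lora_adapters)        # {'sql-lora': 2}
print(list(running_lora_adapters))  # adapter names only, as passed on to Stats
```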
29 changes: 28 additions & 1 deletion vllm/engine/metrics.py
@@ -34,7 +34,11 @@ class Metrics:
See https://prometheus.github.io/client_python/multiprocess/ for more
details on limitations.
"""

labelname_finish_reason = "finished_reason"
labelname_waiting_lora_adapters = "waiting_lora_adapters"
labelname_running_lora_adapters = "running_lora_adapters"
labelname_max_lora = "max_lora"
_gauge_cls = prometheus_client.Gauge
_counter_cls = prometheus_client.Counter
_histogram_cls = prometheus_client.Histogram
@@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int):
documentation="Number of requests waiting to be processed.",
labelnames=labelnames,
multiprocess_mode="sum")
self.gauge_lora_info = self._gauge_cls(
name="vllm:lora_requests_info",
documentation="Running stats on lora requests.",
labelnames=[
self.labelname_running_lora_adapters,
self.labelname_max_lora,
self.labelname_waiting_lora_adapters,
],
multiprocess_mode="livemostrecent",
)
self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.",
@@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int],
for datum in data:
histogram.labels(**self.labels).observe(datum)

def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
gauge.labels(**data).set(1)

def _log_prometheus(self, stats: Stats) -> None:
# System state data
self._log_gauge(self.metrics.gauge_scheduler_running,
@@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None:
stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)

# Including max-lora in the metric; in future this property of the lora
# config may be extended to be dynamic.
lora_info = {
self.metrics.labelname_running_lora_adapters:
",".join(stats.running_lora_adapters),
self.metrics.labelname_waiting_lora_adapters:
",".join(stats.waiting_lora_adapters),
self.metrics.labelname_max_lora:
stats.max_lora,
}
self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
# Iteration level data
self._log_counter(self.metrics.counter_num_preemption,
stats.num_preemption_iter)
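
The new gauge follows the Prometheus info-metric pattern: the value is pinned to 1 and the data travels in the labels. A minimal sketch with prometheus_client, using the metric and label names from the diff (the adapter values themselves are invented):

```python
import prometheus_client

registry = prometheus_client.CollectorRegistry()
gauge_lora_info = prometheus_client.Gauge(
    name="vllm:lora_requests_info",
    documentation="Running stats on lora requests.",
    labelnames=[
        "running_lora_adapters",
        "max_lora",
        "waiting_lora_adapters",
    ],
    registry=registry,
)

# Mirrors _log_gauge_string: the labels carry the data, the value is fixed at 1.
gauge_lora_info.labels(
    running_lora_adapters="sql-lora,math-lora",
    waiting_lora_adapters="",
    max_lora="8",
).set(1)

# The exposition output then contains a line roughly like:
# vllm:lora_requests_info{running_lora_adapters="sql-lora,math-lora",max_lora="8",waiting_lora_adapters=""} 1.0
print(prometheus_client.generate_latest(registry).decode())
```

Note that on the Prometheus side each distinct combination of label values becomes its own time series, which is a general trade-off of packing data into labels this way.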
3 changes: 3 additions & 0 deletions vllm/engine/metrics_types.py
@@ -51,6 +51,9 @@ class Stats:
num_generation_tokens_requests: List[int]
n_requests: List[int]
finished_reason_requests: List[str]
waiting_lora_adapters: List[str]
running_lora_adapters: List[str]
max_lora: str

spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None

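
Since the adapter lists in Stats end up as comma-joined label strings on the metric, a consumer would split them back apart; a small sketch under that assumption (the sample label values are invented):

```python
from typing import List

# Hypothetical label values as they would appear on vllm:lora_requests_info;
# an empty string means no adapters in that state.
running_label = "sql-lora,math-lora"
waiting_label = ""

def split_adapter_label(label: str) -> List[str]:
    """Split a comma-joined adapter label back into a list of adapter names."""
    return label.split(",") if label else []

print(split_adapter_label(running_label))  # ['sql-lora', 'math-lora']
print(split_adapter_label(waiting_label))  # []
```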