From afd51dcb68ae2cdb119c756877acd8a311d4dea7 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Fri, 14 Feb 2025 13:27:13 -0500
Subject: [PATCH] [WIP][Metrics] Re-work approach to LoRA metrics

The current `vllm:lora_requests_info` Gauge is somewhat similar to an
Info metric (like cache_config_info), except the value is the current
wall-clock time and it is updated every iteration.

The label names used are:

- running_lora_adapters: a list of adapters with running requests,
  formatted as a comma-separated string.
- waiting_lora_adapters: similar, except listing adapters with requests
  waiting to be scheduled.
- max_lora: the static "max number of LoRAs in a single batch"
  configuration.

It looks like this:

```
vllm:lora_requests_info{max_lora="1",running_lora_adapters="",waiting_lora_adapters=""} 1.7395575657589855e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters=""} 1.7395575723949368e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters="test-lora"} 1.7395575717647147e+09
```

I can't really make much sense of this. Encoding the running/waiting
status of multiple adapters in a comma-separated string seems quite
misguided - we should use labels to distinguish per-adapter counts
instead:

```
vllm:num_lora_requests_running{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0
vllm:num_lora_requests_waiting{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
```

This was added in #9477 and there is at least one known user. If we
revisit this design and deprecate the old metric, we should reduce the
need for a significant deprecation period by making the change in v0
as well and asking this project to move to the new metric.
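For illustration, the per-adapter labelling amounts to something like
the following standalone sketch. It is not part of this patch and does
not use vLLM's gauge wrapper classes - it talks to prometheus_client
directly, and the model name, adapter name, and counts are made up:

```
# Standalone sketch of per-adapter gauges using prometheus_client.
# The metric and label names match the proposal above; the values
# are illustrative only.
from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
gauge_running = Gauge(
    "vllm:num_lora_requests_running",
    "Number of requests currently running, per LoRA.",
    labelnames=["model_name", "lora_name"],
    registry=registry)
gauge_waiting = Gauge(
    "vllm:num_lora_requests_waiting",
    "Number of requests waiting, per LoRA.",
    labelnames=["model_name", "lora_name"],
    registry=registry)

# Per-adapter counts as they might be collected in one iteration.
running = {"test-lora": 8}
waiting = {"test-lora": 7}
model = "meta-llama/Llama-3.1-8B-Instruct"
for name, count in running.items():
    gauge_running.labels(model_name=model, lora_name=name).set(count)
for name, count in waiting.items():
    gauge_waiting.labels(model_name=model, lora_name=name).set(count)

# Each adapter gets its own time series rather than being encoded
# into a comma-separated label value.
print(generate_latest(registry).decode())
```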
Signed-off-by: Mark McLoughlin
---
 vllm/engine/llm_engine.py    |  4 ++--
 vllm/engine/metrics.py       | 27 +++++++++++++++++++++++++--
 vllm/engine/metrics_types.py |  4 ++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2e5bc75c6db3..6cb88459df4c 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1812,8 +1812,8 @@ def _get_stats(self,
             max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
             max_lora=str(max_lora_stat),
-            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
-            running_lora_adapters=list(running_lora_adapters.keys()))
+            waiting_lora_adapters=waiting_lora_adapters,
+            running_lora_adapters=running_lora_adapters)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 7c55d66e5077..7c28c583829b 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -53,6 +53,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
         max_model_len = vllm_config.model_config.max_model_len
 
+        lora_labelnames = labelnames + ["lora_name"]
+
         # System stats
         # Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -65,6 +67,17 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_requests_running = self._gauge_cls(
+            name="vllm:num_lora_requests_running",
+            documentation="Number of requests currently running, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        self.gauge_lora_requests_waiting = self._gauge_cls(
+            name="vllm:num_lora_requests_waiting",
+            documentation="Number of requests waiting, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        # Deprecated
         self.gauge_lora_info = self._gauge_cls(
             name="vllm:lora_requests_info",
             documentation="Running stats on lora requests.",
@@ -517,9 +530,11 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
-    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+    def _log_gauge(self, gauge, data: Union[int, float],
+                   **extra_labels) -> None:
         # Convenience function for logging to gauge.
-        gauge.labels(**self.labels).set(data)
+        combined_labels = {**self.labels, **extra_labels}
+        gauge.labels(**combined_labels).set(data)
 
     def _log_counter(self, counter, data: Union[int, float]) -> None:
         # Convenience function for logging to counter.
@@ -561,6 +576,14 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
+        for lora_name, lora_running in stats.running_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_running,
+                            lora_running,
+                            lora_name=lora_name)
+        for lora_name, lora_waiting in stats.waiting_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_waiting,
+                            lora_waiting,
+                            lora_name=lora_name)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 7f0c2fa70c3f..1ff9c58dd21d 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -63,8 +63,8 @@ class Stats:
     max_num_generation_tokens_requests: List[int]
     max_tokens_requests: List[int]
     finished_reason_requests: List[str]
-    waiting_lora_adapters: List[str]
-    running_lora_adapters: List[str]
+    waiting_lora_adapters: Dict[str, int]
+    running_lora_adapters: Dict[str, int]
     max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None