From afd51dcb68ae2cdb119c756877acd8a311d4dea7 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Fri, 14 Feb 2025 13:27:13 -0500
Subject: [PATCH] [WIP][Metrics] Re-work approach to LoRA metrics

The current `vllm:lora_requests_info` Gauge is somewhat similar to an
Info metric (like cache_config_info), except the value is the current
wall-clock time and it is updated every iteration.

The label names used are:

- running_lora_adapters: a list of adapters with running requests,
  formatted as a comma-separated string.
- waiting_lora_adapters: similar, except listing adapters with requests
  waiting to be scheduled.
- max_lora: the static "max number of LoRAs in a single batch"
  configuration.

It looks like this:

```
vllm:lora_requests_info{max_lora="1",running_lora_adapters="",waiting_lora_adapters=""} 1.7395575657589855e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters=""} 1.7395575723949368e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters="test-lora"} 1.7395575717647147e+09
```

I can't really make much sense of this. Encoding the running/waiting
status of multiple adapters in a comma-separated string seems quite
misguided - we should use labels to distinguish per-adapter counts
instead:

```
vllm:num_lora_requests_running{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0
vllm:num_lora_requests_waiting{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
```

This was added in #9477 and there is at least one known user. If we
revisit this design and deprecate the old metric, we should reduce the
need for a significant deprecation period by making the change in v0
as well and asking this project to move to the new metric.
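For illustration, the per-adapter labelling amounts to something like
the following standalone sketch. It is not part of this patch and does
not use vLLM's gauge wrapper classes - it talks to prometheus_client
directly, and the model name, adapter name, and counts are made up:

```
# Standalone sketch of per-adapter gauges using prometheus_client.
# The metric and label names match the proposal above; the values
# are illustrative only.
from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
gauge_running = Gauge(
    "vllm:num_lora_requests_running",
    "Number of requests currently running, per LoRA.",
    labelnames=["model_name", "lora_name"],
    registry=registry)
gauge_waiting = Gauge(
    "vllm:num_lora_requests_waiting",
    "Number of requests waiting, per LoRA.",
    labelnames=["model_name", "lora_name"],
    registry=registry)

# Per-adapter counts as they might be collected in one iteration.
running = {"test-lora": 8}
waiting = {"test-lora": 7}
model = "meta-llama/Llama-3.1-8B-Instruct"
for name, count in running.items():
    gauge_running.labels(model_name=model, lora_name=name).set(count)
for name, count in waiting.items():
    gauge_waiting.labels(model_name=model, lora_name=name).set(count)

# Each adapter gets its own time series rather than being encoded
# into a comma-separated label value.
print(generate_latest(registry).decode())
```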
Signed-off-by: Mark McLoughlin
---
 vllm/engine/llm_engine.py    |  4 ++--
 vllm/engine/metrics.py       | 27 +++++++++++++++++++++++++--
 vllm/engine/metrics_types.py |  4 ++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2e5bc75c6db3..6cb88459df4c 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1812,8 +1812,8 @@ def _get_stats(self,
             max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
             max_lora=str(max_lora_stat),
-            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
-            running_lora_adapters=list(running_lora_adapters.keys()))
+            waiting_lora_adapters=waiting_lora_adapters,
+            running_lora_adapters=running_lora_adapters)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 7c55d66e5077..7c28c583829b 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -53,6 +53,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
         max_model_len = vllm_config.model_config.max_model_len
 
+        lora_labelnames = labelnames + ["lora_name"]
+
         # System stats
         # Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -65,6 +67,17 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_requests_running = self._gauge_cls(
+            name="vllm:num_lora_requests_running",
+            documentation="Number of requests currently running, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        self.gauge_lora_requests_waiting = self._gauge_cls(
+            name="vllm:num_lora_requests_waiting",
+            documentation="Number of requests waiting, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        # Deprecated
         self.gauge_lora_info = self._gauge_cls(
             name="vllm:lora_requests_info",
             documentation="Running stats on lora requests.",
@@ -517,9 +530,11 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
-    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+    def _log_gauge(self, gauge, data: Union[int, float],
+                   **extra_labels) -> None:
         # Convenience function for logging to gauge.
-        gauge.labels(**self.labels).set(data)
+        combined_labels = {**self.labels, **extra_labels}
+        gauge.labels(**combined_labels).set(data)
 
     def _log_counter(self, counter, data: Union[int, float]) -> None:
         # Convenience function for logging to counter.
@@ -561,6 +576,14 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
+        for lora_name, lora_running in stats.running_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_running,
+                            lora_running,
+                            lora_name=lora_name)
+        for lora_name, lora_waiting in stats.waiting_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_waiting,
+                            lora_waiting,
+                            lora_name=lora_name)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 7f0c2fa70c3f..1ff9c58dd21d 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -63,8 +63,8 @@ class Stats:
     max_num_generation_tokens_requests: List[int]
     max_tokens_requests: List[int]
     finished_reason_requests: List[str]
-    waiting_lora_adapters: List[str]
-    running_lora_adapters: List[str]
+    waiting_lora_adapters: Dict[str, int]
+    running_lora_adapters: Dict[str, int]
     max_lora: str
 
     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None