From 3b106c82ef9fecf73d92877fef6e19f3004ff46a Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Wed, 13 Nov 2024 03:52:24 +0200
Subject: [PATCH] Replace HTTP "inprogress" gauge with megaservice "request_pending" one (#864)

* Add "megaservice_request_pending" metric

Unlike the other megaservice ServiceOrchestrator metrics, this also
covers (can cover) non-streaming requests, as suggested in PR review.

This does not have the issues the prometheus-fastapi-instrumentator
"inprogress" metric did:

* Extra instances which have to be differentiated e.g. for CI
* Reliance on name -> suffix coming through obscure kwargs calls

Signed-off-by: Eero Tamminen

* Remove HTTP "inprogress" gauge as redundant

Now that ServiceOrchestrator provides a pending-requests metric.

Reverts the "inprogress" metric part of commit a6998a1dbdc9f88a5c4.

Signed-off-by: Eero Tamminen

* Document megaservice metrics

Signed-off-by: Eero Tamminen

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Eero Tamminen
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/cores/mega/http_service.py | 13 +-----------
 comps/cores/mega/orchestrator.py | 17 ++++++++++++++--
 comps/cores/telemetry/README.md  | 35 ++++++++++++++++++++++++--------
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/comps/cores/mega/http_service.py b/comps/cores/mega/http_service.py
index 0f1bd1f9a..283540f49 100644
--- a/comps/cores/mega/http_service.py
+++ b/comps/cores/mega/http_service.py
@@ -35,18 +35,7 @@ def __init__(
         self.uvicorn_kwargs = uvicorn_kwargs or {}
         self.cors = cors
         self._app = self._create_app()
-
-        # remove part before '@', used by register_microservice() callers, and
-        # part after '/', added by MicroService(), to get real service name, and
-        # convert invalid characters to '_'
-        suffix = re.sub(r"[^a-zA-Z0-9]", "_", self.title.split("/")[0].split("@")[-1].lower())
-
-        instrumentator = Instrumentator(
-            inprogress_name=f"http_requests_inprogress_{suffix}",
-            should_instrument_requests_inprogress=True,
-            inprogress_labels=True,
-        )
-        instrumentator.instrument(self._app).expose(self._app)
+        Instrumentator().instrument(self._app).expose(self._app)
 
     @property
     def app(self):
diff --git a/comps/cores/mega/orchestrator.py b/comps/cores/mega/orchestrator.py
index 72a5aa66f..a73104562 100644
--- a/comps/cores/mega/orchestrator.py
+++ b/comps/cores/mega/orchestrator.py
@@ -12,7 +12,7 @@
 import aiohttp
 import requests
 from fastapi.responses import StreamingResponse
-from prometheus_client import Histogram
+from prometheus_client import Gauge, Histogram
 from pydantic import BaseModel
 
 from ..proto.docarray import LLMParams
@@ -33,6 +33,7 @@ class OrchestratorMetrics:
     first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)")
     inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)")
     request_latency = Histogram("megaservice_request_latency", "Whole request/reply latency (histogram)")
+    request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)")
 
     def __init__(self) -> None:
         pass
@@ -48,6 +49,12 @@ def token_update(self, token_start: float, is_first: bool) -> float:
     def request_update(self, req_start: float) -> None:
         self.request_latency.observe(time.time() - req_start)
 
+    def pending_update(self, increase: bool) -> None:
+        if increase:
+            self.request_pending.inc()
+        else:
+            self.request_pending.dec()
+
 
 class ServiceOrchestrator(DAG):
     """Manage 1 or N micro services in a DAG through Python API."""
 
@@ -74,13 +81,15 @@ def flow_to(self, from_service, to_service):
         return False
 
     async def schedule(self, initial_inputs: Dict | BaseModel, llm_parameters: LLMParams = LLMParams(), **kwargs):
+        req_start = time.time()
+        self.metrics.pending_update(True)
+
         result_dict = {}
         runtime_graph = DAG()
         runtime_graph.graph = copy.deepcopy(self.graph)
         if LOGFLAG:
             logger.info(initial_inputs)
 
-        req_start = time.time()
         timeout = aiohttp.ClientTimeout(total=1000)
         async with aiohttp.ClientSession(trust_env=True, timeout=timeout) as session:
             pending = {
@@ -146,6 +155,9 @@ def fake_stream(text):
             if node not in nodes_to_keep:
                 runtime_graph.delete_node_if_exists(node)
 
+        if not llm_parameters.streaming:
+            self.metrics.pending_update(False)
+
         return result_dict, runtime_graph
 
     def process_outputs(self, prev_nodes: List, result_dict: Dict) -> Dict:
@@ -234,6 +246,7 @@ def generate():
                             token_start = self.metrics.token_update(token_start, is_first)
                             is_first = False
                     self.metrics.request_update(req_start)
+                    self.metrics.pending_update(False)
 
                 return (
                     StreamingResponse(self.align_generator(generate(), **kwargs), media_type="text/event-stream"),
diff --git a/comps/cores/telemetry/README.md b/comps/cores/telemetry/README.md
index ca9b0b9da..35a371074 100644
--- a/comps/cores/telemetry/README.md
+++ b/comps/cores/telemetry/README.md
@@ -6,21 +6,23 @@ OPEA Comps currently provides telemetry functionalities for metrics and tracing
 
 ## Metrics
 
-OPEA microservice metrics are exported in Prometheus format and are divided into two categories: general metrics and specific metrics.
+OPEA microservice metrics are exported in Prometheus format under the `/metrics` endpoint.
 
-General metrics, such as `http_requests_total`, `http_request_size_bytes`, are exposed by every microservice endpoint using the [prometheus-fastapi-instrumentator](https://github.com/trallnag/prometheus-fastapi-instrumentator).
+They come in several categories:
 
-Specific metrics are the built-in metrics exposed under `/metrics` by each specific microservices such as TGI, vLLM, TEI and others. Both types of the metrics adhere to the Prometheus format.
+- HTTP request metrics are exposed by every OPEA microservice using the [prometheus-fastapi-instrumentator](https://github.com/trallnag/prometheus-fastapi-instrumentator)
+- Megaservices export additional metrics for application end-to-end load / performance
+- Inferencing microservices such as TGI, vLLM and TEI provide their own metrics
 
-### General Metrics
-
-To access the general metrics of each microservice, you can use `curl` as follows:
+They can be fetched e.g. with `curl`:
 
 ```bash
 curl localhost:{port of your service}/metrics
 ```
 
-Then you will see Prometheus format metrics printed out as follows:
+### HTTP Metrics
+
+Metrics output looks like the following:
 
 ```yaml
 HELP http_requests_total Total number of requests by method, status and handler.
@@ -37,9 +39,22 @@ http_request_size_bytes_sum{handler="/v1/chatqna"} 128.0
 ...
 ```
 
-### Specific Metrics
+Most of them are histogram metrics.
+
+### Megaservice E2E Metrics
+
+Applications' megaservice `ServiceOrchestrator` provides the following metrics:
 
-To access the metrics exposed by each specific microservice, ensure that you check the specific port and your port mapping to reach the `/metrics` endpoint correctly.
+- `megaservice_first_token_latency`: time to first token (TTFT)
+- `megaservice_inter_token_latency`: inter-token latency (ITL ~ TPOT)
+- `megaservice_request_latency`: whole request E2E latency = TTFT + ITL \* tokens
+- `megaservice_request_pending`: how many LLM requests are still in progress
+
+The latency metrics are histograms, i.e. each of them includes a count, a total value and a set of value buckets.
+
+They are available only for _streaming_ requests that use an LLM. The pending count accounts for all requests.
+
+### Inferencing Metrics
 
 For example, you can `curl localhost:6006/metrics` to retrieve the TEI embedding metrics, and the output should look like follows:
 
@@ -66,6 +81,8 @@ te_request_inference_duration_bucket{le="0.000022500000000000005"} 0
 te_request_inference_duration_bucket{le="0.00003375000000000001"} 0
 ```
 
+### Metrics Collection
+
 These metrics can be scraped by the Prometheus server into a time-series database and further visualized using Grafana.
 
 Below are some default metrics endpoints for specific microservices:
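For reference, the pending-request accounting introduced in `orchestrator.py` boils down to incrementing a Prometheus gauge when a request enters `schedule()` and decrementing it once the reply is complete. Below is a minimal standalone sketch of that pattern with `prometheus_client`; the metric name `demo_request_pending`, the `handle_request()` helper and the port are illustrative only and not part of the patch.

```python
import time

from prometheus_client import Gauge, start_http_server

# Illustrative gauge; the patch registers "megaservice_request_pending" in OrchestratorMetrics.
demo_request_pending = Gauge("demo_request_pending", "Count of currently pending requests (gauge)")


def handle_request() -> None:
    """Simulate one request: the gauge rises on entry and falls once the reply is done."""
    demo_request_pending.inc()
    try:
        time.sleep(0.1)  # stand-in for the real work (LLM call, streaming the reply, ...)
    finally:
        demo_request_pending.dec()


if __name__ == "__main__":
    start_http_server(8000)  # gauge is then visible on http://localhost:8000/metrics
    for _ in range(5):
        handle_request()
```

`prometheus_client` also offers `Gauge.track_inprogress()` as a context manager / decorator for the same purpose; the patch uses explicit `inc()`/`dec()` calls because the decrement happens in different places for streaming requests (end of the response generator) and non-streaming ones (end of `schedule()`).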