From 3b106c82ef9fecf73d92877fef6e19f3004ff46a Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Wed, 13 Nov 2024 03:52:24 +0200
Subject: [PATCH] Replace HTTP "inprogress" gauge with megaservice "request_pending" one (#864)

* Add "megaservice_request_pending" metric

Unlike the other megaservice ServiceOrchestrator metrics, this also
covers (can cover) non-streaming requests, as suggested in PR review.

This does not have the issues the prometheus-fastapi-instrumentator
"inprogress" metric did:

* Extra instances which have to be differentiated e.g. for CI
* Reliance on name -> suffix coming through obscure kwargs calls

Signed-off-by: Eero Tamminen

* Remove HTTP "inprogress" gauge as redundant

Now that ServiceOrchestrator provides a pending-requests metric.

Reverts the "inprogress" metric part of commit a6998a1dbdc9f88a5c4.

Signed-off-by: Eero Tamminen

* Document megaservice metrics

Signed-off-by: Eero Tamminen

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Eero Tamminen
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 comps/cores/mega/http_service.py | 13 +-----------
 comps/cores/mega/orchestrator.py | 17 ++++++++++++++--
 comps/cores/telemetry/README.md  | 35 ++++++++++++++++++++++++--------
 3 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/comps/cores/mega/http_service.py b/comps/cores/mega/http_service.py
index 0f1bd1f9a..283540f49 100644
--- a/comps/cores/mega/http_service.py
+++ b/comps/cores/mega/http_service.py
@@ -35,18 +35,7 @@ def __init__(
         self.uvicorn_kwargs = uvicorn_kwargs or {}
         self.cors = cors
         self._app = self._create_app()
-
-        # remove part before '@', used by register_microservice() callers, and
-        # part after '/', added by MicroService(), to get real service name, and
-        # convert invalid characters to '_'
-        suffix = re.sub(r"[^a-zA-Z0-9]", "_", self.title.split("/")[0].split("@")[-1].lower())
-
-        instrumentator = Instrumentator(
-            inprogress_name=f"http_requests_inprogress_{suffix}",
-            should_instrument_requests_inprogress=True,
-            inprogress_labels=True,
-        )
-        instrumentator.instrument(self._app).expose(self._app)
+        Instrumentator().instrument(self._app).expose(self._app)
 
     @property
     def app(self):
diff --git a/comps/cores/mega/orchestrator.py b/comps/cores/mega/orchestrator.py
index 72a5aa66f..a73104562 100644
--- a/comps/cores/mega/orchestrator.py
+++ b/comps/cores/mega/orchestrator.py
@@ -12,7 +12,7 @@
 import aiohttp
 import requests
 from fastapi.responses import StreamingResponse
-from prometheus_client import Histogram
+from prometheus_client import Gauge, Histogram
 from pydantic import BaseModel
 
 from ..proto.docarray import LLMParams
@@ -33,6 +33,7 @@ class OrchestratorMetrics:
     first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)")
     inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)")
     request_latency = Histogram("megaservice_request_latency", "Whole request/reply latency (histogram)")
+    request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)")
 
     def __init__(self) -> None:
         pass
@@ -48,6 +49,12 @@ def token_update(self, token_start: float, is_first: bool) -> float:
     def request_update(self, req_start: float) -> None:
         self.request_latency.observe(time.time() - req_start)
 
+    def pending_update(self, increase: bool) -> None:
+        if increase:
+            self.request_pending.inc()
+        else:
+            self.request_pending.dec()
+
 
 class ServiceOrchestrator(DAG):
     """Manage 1 or N micro services in a DAG through Python API."""
 
@@ -74,13 +81,15 @@ def flow_to(self, from_service, to_service):
         return False
 
     async def schedule(self, initial_inputs: Dict | BaseModel, llm_parameters: LLMParams = LLMParams(), **kwargs):
+        req_start = time.time()
+        self.metrics.pending_update(True)
+
         result_dict = {}
         runtime_graph = DAG()
         runtime_graph.graph = copy.deepcopy(self.graph)
         if LOGFLAG:
             logger.info(initial_inputs)
 
-        req_start = time.time()
         timeout = aiohttp.ClientTimeout(total=1000)
         async with aiohttp.ClientSession(trust_env=True, timeout=timeout) as session:
             pending = {
@@ -146,6 +155,9 @@ def fake_stream(text):
             if node not in nodes_to_keep:
                 runtime_graph.delete_node_if_exists(node)
 
+        if not llm_parameters.streaming:
+            self.metrics.pending_update(False)
+
         return result_dict, runtime_graph
 
     def process_outputs(self, prev_nodes: List, result_dict: Dict) -> Dict:
@@ -234,6 +246,7 @@ def generate():
                             token_start = self.metrics.token_update(token_start, is_first)
                             is_first = False
                     self.metrics.request_update(req_start)
+                    self.metrics.pending_update(False)
 
                 return (
                     StreamingResponse(self.align_generator(generate(), **kwargs), media_type="text/event-stream"),
diff --git a/comps/cores/telemetry/README.md b/comps/cores/telemetry/README.md
index ca9b0b9da..35a371074 100644
--- a/comps/cores/telemetry/README.md
+++ b/comps/cores/telemetry/README.md
@@ -6,21 +6,23 @@ OPEA Comps currently provides telemetry functionalities for metrics and tracing
 
 ## Metrics
 
-OPEA microservice metrics are exported in Prometheus format and are divided into two categories: general metrics and specific metrics.
+OPEA microservice metrics are exported in Prometheus format under the `/metrics` endpoint.
 
-General metrics, such as `http_requests_total`, `http_request_size_bytes`, are exposed by every microservice endpoint using the [prometheus-fastapi-instrumentator](https://github.com/trallnag/prometheus-fastapi-instrumentator).
+They come in several categories:
 
-Specific metrics are the built-in metrics exposed under `/metrics` by each specific microservices such as TGI, vLLM, TEI and others. Both types of the metrics adhere to the Prometheus format.
+- HTTP request metrics are exposed by every OPEA microservice using the [prometheus-fastapi-instrumentator](https://github.com/trallnag/prometheus-fastapi-instrumentator)
+- Megaservices export additional metrics for application end-to-end load / performance
+- Inferencing microservices such as TGI, vLLM and TEI provide their own metrics
 
-### General Metrics
-
-To access the general metrics of each microservice, you can use `curl` as follows:
+They can be fetched e.g. with `curl`:
 
 ```bash
 curl localhost:{port of your service}/metrics
 ```
 
-Then you will see Prometheus format metrics printed out as follows:
+### HTTP Metrics
+
+Metrics output looks like the following:
 
 ```yaml
 HELP http_requests_total Total number of requests by method, status and handler.
@@ -37,9 +39,22 @@ http_request_size_bytes_sum{handler="/v1/chatqna"} 128.0
 ...
 ```
 
-### Specific Metrics
+Most of them are histogram metrics.
+
+### Megaservice E2E Metrics
+
+Applications' megaservice `ServiceOrchestrator` provides the following metrics:
 
-To access the metrics exposed by each specific microservice, ensure that you check the specific port and your port mapping to reach the `/metrics` endpoint correctly.
+- `megaservice_first_token_latency`: time to first token (TTFT)
+- `megaservice_inter_token_latency`: inter-token latency (ITL ~ TPOT)
+- `megaservice_request_latency`: whole request E2E latency = TTFT + ITL \* tokens
+- `megaservice_request_pending`: how many LLM requests are still in progress
+
+The latency metrics are histograms, i.e. each of them includes a count, a total value and a set of value buckets.
+
+They are available only for _streaming_ requests that use an LLM. The pending count accounts for all requests.
+
+### Inferencing Metrics
 
 For example, you can `curl localhost:6006/metrics` to retrieve the TEI embedding metrics, and the output should look like follows:
 
@@ -66,6 +81,8 @@ te_request_inference_duration_bucket{le="0.000022500000000000005"} 0
 te_request_inference_duration_bucket{le="0.00003375000000000001"} 0
 ```
 
+### Metrics Collection
+
 These metrics can be scraped by the Prometheus server into a time-series database and further visualized using Grafana.
 
 Below are some default metrics endpoints for specific microservices:
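For reference, the pending-request accounting introduced in `orchestrator.py` boils down to incrementing a Prometheus gauge when a request enters `schedule()` and decrementing it once the reply is complete. Below is a minimal standalone sketch of that pattern with `prometheus_client`; the metric name `demo_request_pending`, the `handle_request()` helper and the port are illustrative only and not part of the patch.

```python
import time

from prometheus_client import Gauge, start_http_server

# Illustrative gauge; the patch registers "megaservice_request_pending" in OrchestratorMetrics.
demo_request_pending = Gauge("demo_request_pending", "Count of currently pending requests (gauge)")


def handle_request() -> None:
    """Simulate one request: the gauge rises on entry and falls once the reply is done."""
    demo_request_pending.inc()
    try:
        time.sleep(0.1)  # stand-in for the real work (LLM call, streaming the reply, ...)
    finally:
        demo_request_pending.dec()


if __name__ == "__main__":
    start_http_server(8000)  # gauge is then visible on http://localhost:8000/metrics
    for _ in range(5):
        handle_request()
```

`prometheus_client` also offers `Gauge.track_inprogress()` as a context manager / decorator for the same purpose; the patch uses explicit `inc()`/`dec()` calls because the decrement happens in different places for streaming requests (end of the response generator) and non-streaming ones (end of `schedule()`).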