From 05fba48cec3e18d0d6a070a8dc629061cc6dd17d Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 09:14:38 -0700 Subject: [PATCH 1/7] feat - refactor prometheus metrics --- litellm/integrations/prometheus.py | 238 +++++++++++++++-------------- 1 file changed, 126 insertions(+), 112 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 6160e4d33ef1..e8808307e97a 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -15,9 +15,10 @@ import litellm from litellm._logging import print_verbose, verbose_logger +from litellm.integrations.custom_logger import CustomLogger -class PrometheusLogger: +class PrometheusLogger(CustomLogger): # Class variables or attributes def __init__( self, @@ -147,83 +148,137 @@ def __init__( print_verbose(f"Got exception on init prometheus client {str(e)}") raise e - async def _async_log_event( - self, kwargs, response_obj, start_time, end_time, print_verbose, user_id - ): - self.log_event( - kwargs, response_obj, start_time, end_time, user_id, print_verbose - ) + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + # Define prometheus client + from litellm.proxy.proxy_server import premium_user - def log_event( - self, kwargs, response_obj, start_time, end_time, user_id, print_verbose - ): - try: - # Define prometheus client - from litellm.proxy.proxy_server import premium_user - - verbose_logger.debug( - f"prometheus Logging - Enters logging function for model {kwargs}" - ) + verbose_logger.debug( + f"prometheus Logging - Enters success logging function for kwargs {kwargs}" + ) - # unpack kwargs - model = kwargs.get("model", "") - response_cost = kwargs.get("response_cost", 0.0) or 0 - litellm_params = kwargs.get("litellm_params", {}) or {} - proxy_server_request = litellm_params.get("proxy_server_request") or {} - end_user_id = proxy_server_request.get("body", {}).get("user", None) - user_id = litellm_params.get("metadata", {}).get( - "user_api_key_user_id", None - ) - user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None) - user_api_key_alias = litellm_params.get("metadata", {}).get( - "user_api_key_alias", None - ) - user_api_team = litellm_params.get("metadata", {}).get( - "user_api_key_team_id", None - ) - user_api_team_alias = litellm_params.get("metadata", {}).get( - "user_api_key_team_alias", None - ) + # unpack kwargs + model = kwargs.get("model", "") + response_cost = kwargs.get("response_cost", 0.0) or 0 + litellm_params = kwargs.get("litellm_params", {}) or {} + proxy_server_request = litellm_params.get("proxy_server_request") or {} + end_user_id = proxy_server_request.get("body", {}).get("user", None) + user_id = litellm_params.get("metadata", {}).get("user_api_key_user_id", None) + user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None) + user_api_key_alias = litellm_params.get("metadata", {}).get( + "user_api_key_alias", None + ) + user_api_team = litellm_params.get("metadata", {}).get( + "user_api_key_team_id", None + ) + user_api_team_alias = litellm_params.get("metadata", {}).get( + "user_api_key_team_alias", None + ) - _team_spend = litellm_params.get("metadata", {}).get( - "user_api_key_team_spend", None - ) - _team_max_budget = litellm_params.get("metadata", {}).get( - "user_api_key_team_max_budget", None - ) - _remaining_team_budget = safe_get_remaining_budget( - max_budget=_team_max_budget, spend=_team_spend - ) + _team_spend = litellm_params.get("metadata", {}).get( + 
"user_api_key_team_spend", None + ) + _team_max_budget = litellm_params.get("metadata", {}).get( + "user_api_key_team_max_budget", None + ) + _remaining_team_budget = safe_get_remaining_budget( + max_budget=_team_max_budget, spend=_team_spend + ) - _api_key_spend = litellm_params.get("metadata", {}).get( - "user_api_key_spend", None - ) - _api_key_max_budget = litellm_params.get("metadata", {}).get( - "user_api_key_max_budget", None - ) - _remaining_api_key_budget = safe_get_remaining_budget( - max_budget=_api_key_max_budget, spend=_api_key_spend - ) + _api_key_spend = litellm_params.get("metadata", {}).get( + "user_api_key_spend", None + ) + _api_key_max_budget = litellm_params.get("metadata", {}).get( + "user_api_key_max_budget", None + ) + _remaining_api_key_budget = safe_get_remaining_budget( + max_budget=_api_key_max_budget, spend=_api_key_spend + ) - if response_obj is not None: - tokens_used = response_obj.get("usage", {}).get("total_tokens", 0) - else: - tokens_used = 0 + if response_obj is not None: + tokens_used = response_obj.get("usage", {}).get("total_tokens", 0) + else: + tokens_used = 0 - print_verbose( - f"inside track_prometheus_metrics, model {model}, response_cost {response_cost}, tokens_used {tokens_used}, end_user_id {end_user_id}, user_api_key {user_api_key}" - ) + print_verbose( + f"inside track_prometheus_metrics, model {model}, response_cost {response_cost}, tokens_used {tokens_used}, end_user_id {end_user_id}, user_api_key {user_api_key}" + ) - if ( - user_api_key is not None - and isinstance(user_api_key, str) - and user_api_key.startswith("sk-") - ): - from litellm.proxy.utils import hash_token + if ( + user_api_key is not None + and isinstance(user_api_key, str) + and user_api_key.startswith("sk-") + ): + from litellm.proxy.utils import hash_token + + user_api_key = hash_token(user_api_key) + + self.litellm_requests_metric.labels( + end_user_id, + user_api_key, + user_api_key_alias, + model, + user_api_team, + user_api_team_alias, + user_id, + ).inc() + self.litellm_spend_metric.labels( + end_user_id, + user_api_key, + user_api_key_alias, + model, + user_api_team, + user_api_team_alias, + user_id, + ).inc(response_cost) + self.litellm_tokens_metric.labels( + end_user_id, + user_api_key, + user_api_key_alias, + model, + user_api_team, + user_api_team_alias, + user_id, + ).inc(tokens_used) + + self.litellm_remaining_team_budget_metric.labels( + user_api_team, user_api_team_alias + ).set(_remaining_team_budget) + + self.litellm_remaining_api_key_budget_metric.labels( + user_api_key, user_api_key_alias + ).set(_remaining_api_key_budget) + + # set x-ratelimit headers + if premium_user is True: + self.set_llm_deployment_success_metrics(kwargs) + pass + + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): + from litellm.proxy.proxy_server import premium_user + + verbose_logger.debug( + f"prometheus Logging - Enters success logging function for kwargs {kwargs}" + ) - user_api_key = hash_token(user_api_key) + # unpack kwargs + model = kwargs.get("model", "") + litellm_params = kwargs.get("litellm_params", {}) or {} + proxy_server_request = litellm_params.get("proxy_server_request") or {} + end_user_id = proxy_server_request.get("body", {}).get("user", None) + user_id = litellm_params.get("metadata", {}).get("user_api_key_user_id", None) + user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None) + user_api_key_alias = litellm_params.get("metadata", {}).get( + "user_api_key_alias", None + ) + user_api_team = 
litellm_params.get("metadata", {}).get( + "user_api_key_team_id", None + ) + user_api_team_alias = litellm_params.get("metadata", {}).get( + "user_api_key_team_alias", None + ) - self.litellm_requests_metric.labels( + try: + self.litellm_llm_api_failed_requests_metric.labels( end_user_id, user_api_key, user_api_key_alias, @@ -232,56 +287,15 @@ def log_event( user_api_team_alias, user_id, ).inc() - self.litellm_spend_metric.labels( - end_user_id, - user_api_key, - user_api_key_alias, - model, - user_api_team, - user_api_team_alias, - user_id, - ).inc(response_cost) - self.litellm_tokens_metric.labels( - end_user_id, - user_api_key, - user_api_key_alias, - model, - user_api_team, - user_api_team_alias, - user_id, - ).inc(tokens_used) - self.litellm_remaining_team_budget_metric.labels( - user_api_team, user_api_team_alias - ).set(_remaining_team_budget) - - self.litellm_remaining_api_key_budget_metric.labels( - user_api_key, user_api_key_alias - ).set(_remaining_api_key_budget) - - # set x-ratelimit headers - if premium_user is True: - self.set_llm_deployment_success_metrics(kwargs) - - ### FAILURE INCREMENT ### - if "exception" in kwargs: - self.litellm_llm_api_failed_requests_metric.labels( - end_user_id, - user_api_key, - user_api_key_alias, - model, - user_api_team, - user_api_team_alias, - user_id, - ).inc() - - self.set_llm_deployment_failure_metrics(kwargs) + self.set_llm_deployment_failure_metrics(kwargs) except Exception as e: verbose_logger.error( "prometheus Layer Error(): Exception occured - {}".format(str(e)) ) verbose_logger.debug(traceback.format_exc()) pass + pass def set_llm_deployment_failure_metrics(self, request_kwargs: dict): try: From 20ef72194cae3561ac4b1c58fd5730368fd6be6e Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 09:15:23 -0700 Subject: [PATCH 2/7] use customLogger for prometheus logger --- litellm/litellm_core_utils/litellm_logging.py | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 36d4f6aa1c46..cef7973128d1 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -949,34 +949,6 @@ def success_handler( user_id=kwargs.get("user", None), print_verbose=print_verbose, ) - if callback == "prometheus": - verbose_logger.debug("reaches prometheus for success logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - # this only logs streaming once, complete_streaming_response exists i.e when stream ends - if self.stream: - verbose_logger.debug( - f"prometheus: is complete_streaming_response in kwargs: {kwargs.get('complete_streaming_response', None)}" - ) - if complete_streaming_response is None: - continue - else: - print_verbose( - "reaches prometheus for streaming logging!" 
- ) - result = kwargs["complete_streaming_response"] - prometheusLogger.log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) if callback == "generic": global genericAPILogger verbose_logger.debug("reaches langfuse for success logging!") @@ -1763,25 +1735,6 @@ def failure_handler( level="ERROR", kwargs=self.model_call_details, ) - if callback == "prometheus": - global prometheusLogger - verbose_logger.debug("reaches prometheus for success logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - kwargs["exception"] = str(exception) - prometheusLogger.log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) - if callback == "logfire": verbose_logger.debug("reaches logfire for failure logging!") kwargs = {} From ce9af1e1dfcc11c763b2d20a109e7188cb263bf8 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 09:28:46 -0700 Subject: [PATCH 3/7] refactor prometheus to be a customLogger class --- litellm/__init__.py | 1 + litellm/litellm_core_utils/litellm_logging.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 7e23e92a4ac9..fdfe3bd9ea15 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -43,6 +43,7 @@ "logfire", "dynamic_rate_limiter", "langsmith", + "prometheus", "galileo", "braintrust", "arize", diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index cef7973128d1..9f84b26d6676 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -1904,9 +1904,6 @@ def set_callbacks(callback_list, function_id=None): openMeterLogger = OpenMeterLogger() elif callback == "datadog": dataDogLogger = DataDogLogger() - elif callback == "prometheus": - if prometheusLogger is None: - prometheusLogger = PrometheusLogger() elif callback == "dynamodb": dynamoLogger = DyanmoDBLogger() elif callback == "s3": @@ -1980,6 +1977,14 @@ def _init_custom_logger_compatible_class( _langsmith_logger = LangsmithLogger() _in_memory_loggers.append(_langsmith_logger) return _langsmith_logger # type: ignore + elif logging_integration == "prometheus": + for callback in _in_memory_loggers: + if isinstance(callback, PrometheusLogger): + return callback # type: ignore + + _prometheus_logger = PrometheusLogger() + _in_memory_loggers.append(_prometheus_logger) + return _prometheus_logger # type: ignore elif logging_integration == "gcs_bucket": for callback in _in_memory_loggers: if isinstance(callback, GCSBucketLogger): @@ -2102,6 +2107,10 @@ def get_custom_logger_compatible_class( for callback in _in_memory_loggers: if isinstance(callback, LangsmithLogger): return callback + elif logging_integration == "prometheus": + for callback in _in_memory_loggers: + if isinstance(callback, PrometheusLogger): + return callback elif logging_integration == "gcs_bucket": for callback in _in_memory_loggers: if isinstance(callback, GCSBucketLogger): From 2174801cb05afd44e9cd9895e4d1158dae3ded64 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 09:45:12 -0700 Subject: [PATCH 4/7] prometheus add basic testing for success --- litellm/tests/test_prometheus.py | 79 
++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 litellm/tests/test_prometheus.py diff --git a/litellm/tests/test_prometheus.py b/litellm/tests/test_prometheus.py new file mode 100644 index 000000000000..73281da728de --- /dev/null +++ b/litellm/tests/test_prometheus.py @@ -0,0 +1,79 @@ +import io +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + +import asyncio +import logging +import uuid + +import pytest +from prometheus_client import REGISTRY + +import litellm +from litellm import completion +from litellm._logging import verbose_logger +from litellm.integrations.prometheus import PrometheusLogger +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler + +verbose_logger.setLevel(logging.DEBUG) + +litellm.set_verbose = True +import time + + +@pytest.mark.asyncio() +async def test_async_prometheus_success_logging(): + run_id = str(uuid.uuid4()) + litellm.set_verbose = True + litellm.success_callback = ["prometheus"] + litellm.failure_callback = ["prometheus"] + + response = await litellm.acompletion( + model="claude-instant-1.2", + messages=[{"role": "user", "content": "what llm are u"}], + max_tokens=10, + mock_response="hi", + temperature=0.2, + metadata={ + "id": run_id, + "tags": ["tag1", "tag2"], + "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c", + "user_api_key_alias": "ishaans-prometheus-key", + "user_api_end_user_max_budget": None, + "litellm_api_version": "1.40.19", + "global_max_parallel_requests": None, + "user_api_key_user_id": "admin", + "user_api_key_org_id": None, + "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709", + "user_api_key_team_alias": "testing-team", + }, + ) + print(response) + await asyncio.sleep(3) + + # get prometheus logger + from litellm.litellm_core_utils.litellm_logging import _in_memory_loggers + + for callback in _in_memory_loggers: + if isinstance(callback, PrometheusLogger): + test_prometheus_logger = callback + + print("done with success request") + + print( + "vars of test_prometheus_logger", + vars(test_prometheus_logger.litellm_requests_metric), + ) + + # Get the metrics + metrics = {} + for metric in REGISTRY.collect(): + for sample in metric.samples: + metrics[sample.name] = sample.value + + print("metrics from prometheus", metrics) + + assert metrics["litellm_requests_metric_total"] == 1.0 + assert metrics["litellm_total_tokens_total"] == 30.0 From 83446926afb6e1cae5cf1516bf944136cbf75093 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 10:05:33 -0700 Subject: [PATCH 5/7] track llm_deployment_success_responses --- litellm/integrations/prometheus.py | 57 +++++++++++++++++++++++++++--- litellm/tests/test_prometheus.py | 3 +- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index e8808307e97a..2cdad8a181b7 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -35,7 +35,7 @@ def __init__( self.litellm_llm_api_failed_requests_metric = Counter( name="litellm_llm_api_failed_requests_metric", - documentation="Total number of failed LLM API calls via litellm", + documentation="Total number of failed LLM API calls via litellm - track fails per API Key, team, user", labelnames=[ "end_user", "hashed_api_key", @@ -49,7 +49,7 @@ def __init__( self.litellm_requests_metric = Counter( name="litellm_requests_metric", - documentation="Total number of LLM calls to litellm", + documentation="Total number of LLM 
calls to litellm - track total per API Key, team, user", labelnames=[ "end_user", "hashed_api_key", @@ -105,12 +105,16 @@ def __init__( labelnames=["hashed_api_key", "api_key_alias"], ) + ######################################## + # LLM API Deployment Metrics / analytics + ######################################## + # Litellm-Enterprise Metrics if premium_user is True: # Remaining Rate Limit for model self.litellm_remaining_requests_metric = Gauge( "litellm_remaining_requests", - "remaining requests for model, returned from LLM API Provider", + "LLM Deployment Analytics - remaining requests for model, returned from LLM API Provider", labelnames=[ "model_group", "api_provider", @@ -140,7 +144,23 @@ def __init__( # Metric for deployment state self.deployment_state = Gauge( "deployment_state", - "The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage", + "LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage", + labelnames=_logged_llm_labels, + ) + + self.llm_deployment_success_responses = Counter( + name="llm_deployment_success_responses", + documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm", + labelnames=_logged_llm_labels, + ) + self.llm_deployment_failure_responses = Counter( + name="llm_deployment_failure_responses", + documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm", + labelnames=_logged_llm_labels, + ) + self.llm_deployment_total_requests = Counter( + name="llm_deployment_total_requests", + documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure", labelnames=_logged_llm_labels, ) @@ -287,7 +307,6 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti user_api_team_alias, user_id, ).inc() - self.set_llm_deployment_failure_metrics(kwargs) except Exception as e: verbose_logger.error( @@ -319,6 +338,20 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): api_provider=llm_provider, ) + self.llm_deployment_failure_responses.labels( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ).inc() + + self.llm_deployment_total_requests.labels( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ).inc() + pass except: pass @@ -378,6 +411,20 @@ def set_llm_deployment_success_metrics(self, request_kwargs: dict): api_base=api_base, api_provider=llm_provider, ) + + self.llm_deployment_success_responses.labels( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ).inc() + + self.llm_deployment_total_requests.labels( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ).inc() except Exception as e: verbose_logger.error( "Prometheus Error: set_llm_deployment_success_metrics. 
Exception occured - {}".format( diff --git a/litellm/tests/test_prometheus.py b/litellm/tests/test_prometheus.py index 73281da728de..74c48117b2d5 100644 --- a/litellm/tests/test_prometheus.py +++ b/litellm/tests/test_prometheus.py @@ -74,6 +74,7 @@ async def test_async_prometheus_success_logging(): metrics[sample.name] = sample.value print("metrics from prometheus", metrics) - assert metrics["litellm_requests_metric_total"] == 1.0 assert metrics["litellm_total_tokens_total"] == 30.0 + assert metrics["llm_deployment_success_responses_total"] == 1.0 + assert metrics["llm_deployment_total_requests_total"] == 1.0 From 01f5456e3d2aca510080596b617e3807109366e1 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 12:53:56 -0700 Subject: [PATCH 6/7] feat - track latency per llm deployment --- litellm/integrations/prometheus.py | 58 ++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 2cdad8a181b7..1425b101c098 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -1,13 +1,12 @@ # used for /metrics endpoint on LiteLLM Proxy #### What this does #### # On success, log events to Prometheus - -import datetime import os import subprocess import sys import traceback import uuid +from datetime import datetime, timedelta from typing import Optional, TypedDict, Union import dotenv @@ -25,7 +24,7 @@ def __init__( **kwargs, ): try: - from prometheus_client import Counter, Gauge + from prometheus_client import Counter, Gauge, Histogram from litellm.proxy.proxy_server import premium_user @@ -164,6 +163,13 @@ def __init__( labelnames=_logged_llm_labels, ) + # Deployment Latency tracking + self.llm_deployment_latency_per_output_token = Histogram( + name="llm_deployment_latency_per_output_token", + documentation="LLM Deployment Analytics - Latency per output token", + labelnames=_logged_llm_labels, + ) + except Exception as e: print_verbose(f"Got exception on init prometheus client {str(e)}") raise e @@ -213,9 +219,10 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti _remaining_api_key_budget = safe_get_remaining_budget( max_budget=_api_key_max_budget, spend=_api_key_spend ) - + output_tokens = 1.0 if response_obj is not None: tokens_used = response_obj.get("usage", {}).get("total_tokens", 0) + output_tokens = response_obj.get("usage", {}).get("completion_tokens", 0) else: tokens_used = 0 @@ -270,7 +277,9 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti # set x-ratelimit headers if premium_user is True: - self.set_llm_deployment_success_metrics(kwargs) + self.set_llm_deployment_success_metrics( + kwargs, start_time, end_time, output_tokens + ) pass async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): @@ -356,7 +365,13 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): except: pass - def set_llm_deployment_success_metrics(self, request_kwargs: dict): + def set_llm_deployment_success_metrics( + self, + request_kwargs: dict, + start_time, + end_time, + output_tokens: float = 1.0, + ): try: verbose_logger.debug("setting remaining tokens requests metric") _response_headers = request_kwargs.get("response_headers") @@ -425,6 +440,37 @@ def set_llm_deployment_success_metrics(self, request_kwargs: dict): api_base=api_base, api_provider=llm_provider, ).inc() + + # Track deployment Latency + response_ms: timedelta = end_time - start_time + 
time_to_first_token_response_time: Optional[timedelta] = None + + if ( + request_kwargs.get("stream", None) is not None + and request_kwargs["stream"] == True + ): + # only log ttft for streaming request + time_to_first_token_response_time = ( + request_kwargs.get("completion_start_time", end_time) - start_time + ) + + # use the metric that is not None + # if streaming - use time_to_first_token_response + # if not streaming - use response_ms + _latency: timedelta = time_to_first_token_response_time or response_ms + _latency_seconds = _latency.total_seconds() + + # latency per output token + latency_per_token = None + if output_tokens is not None and output_tokens > 0: + latency_per_token = _latency_seconds / output_tokens + self.llm_deployment_latency_per_output_token.labels( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ).observe(latency_per_token) + except Exception as e: verbose_logger.error( "Prometheus Error: set_llm_deployment_success_metrics. Exception occured - {}".format( From 51ce8ba30140852ff3edffd1e5e37b5ce6b489e3 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 10 Aug 2024 12:56:23 -0700 Subject: [PATCH 7/7] test for prom metrics --- litellm/tests/test_prometheus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_prometheus.py b/litellm/tests/test_prometheus.py index 74c48117b2d5..64e824e6db0e 100644 --- a/litellm/tests/test_prometheus.py +++ b/litellm/tests/test_prometheus.py @@ -78,3 +78,4 @@ async def test_async_prometheus_success_logging(): assert metrics["litellm_total_tokens_total"] == 30.0 assert metrics["llm_deployment_success_responses_total"] == 1.0 assert metrics["llm_deployment_total_requests_total"] == 1.0 + assert metrics["llm_deployment_latency_per_output_token_bucket"] == 1.0
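For reference, below is a minimal standalone sketch of the latency-per-output-token calculation introduced in PATCH 6/7 — the value observed by the new llm_deployment_latency_per_output_token histogram inside set_llm_deployment_success_metrics. The helper name latency_per_output_token_seconds and the example timestamps are illustrative only and are not part of the patches:

from datetime import datetime, timedelta
from typing import Optional


def latency_per_output_token_seconds(
    start_time: datetime,
    end_time: datetime,
    completion_start_time: Optional[datetime],
    stream: bool,
    output_tokens: float,
) -> Optional[float]:
    # total request latency
    response_ms: timedelta = end_time - start_time

    # for streaming requests, prefer time-to-first-token
    time_to_first_token: Optional[timedelta] = None
    if stream and completion_start_time is not None:
        time_to_first_token = completion_start_time - start_time

    # streaming -> time-to-first-token; non-streaming -> total response time
    _latency: timedelta = time_to_first_token or response_ms

    if output_tokens and output_tokens > 0:
        # this is the value passed to .observe() on the histogram
        return _latency.total_seconds() / output_tokens
    return None


# example: a 2-second non-streaming response that produced 10 completion tokens -> 0.2 s/token
start = datetime(2024, 8, 10, 12, 0, 0)
end = start + timedelta(seconds=2)
print(latency_per_output_token_seconds(start, end, None, stream=False, output_tokens=10))

Because the histogram is labeled with litellm_model_name, model_id, api_base, and api_provider (the _logged_llm_labels used by the other deployment metrics), the per-token latency can be compared across deployments of the same model group.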