feat(proxy_cli.py): add new 'log_config' cli param (#6352)
* feat(proxy_cli.py): add new 'log_config' cli param

Allows passing logging.conf to uvicorn on startup

* docs(cli.md): add logging conf to uvicorn cli docs

* fix(get_llm_provider_logic.py): fix default api base for litellm_proxy

Fixes #6332

* feat(openai_like/embedding): Add support for jina ai embeddings

Closes #6337

* docs(deploy.md): update entrypoint.sh filepath post-refactor

Fixes outdated docs

* feat(prometheus.py): emit time_to_first_token metric on prometheus

Closes #6334

* fix(prometheus.py): only emit time to first token metric if stream is True

Enables more accurate TTFT measurement

* test: handle vertex api instability

* fix(get_llm_provider_logic.py): fix import

* fix(openai.py): fix deepinfra default api base

* fix(anthropic/transformation.py): remove anthropic beta header (#6361)
krrishdholakia committed Oct 22, 2024
1 parent 95a1069 commit dbbd0f2
Showing 22 changed files with 839 additions and 260 deletions.
8 changes: 8 additions & 0 deletions docs/my-website/docs/proxy/cli.md
@@ -176,3 +176,11 @@ Cli arguments, --host, --port, --num_workers
```
+## --log_config
+- **Default:** `None`
+- **Type:** `str`
+- Specify a log configuration file for uvicorn.
+- **Usage:**
+```shell
+litellm --log_config path/to/log_config.conf
+```
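For context, here is a minimal, hypothetical sketch of how a `--log_config` path can be handed to uvicorn. It is not the actual `proxy_cli.py` implementation; the ASGI app import string is assumed for illustration, but `uvicorn.run()` does accept a `log_config` argument (a path to an INI/JSON/YAML logging config, or a dict).

```python
# Hypothetical wiring, for illustration only; the real proxy_cli.py may differ.
from typing import Optional

import uvicorn


def run_proxy(host: str = "0.0.0.0", port: int = 4000, log_config: Optional[str] = None) -> None:
    uvicorn.run(
        "litellm.proxy.proxy_server:app",  # assumed ASGI app path, for illustration
        host=host,
        port=port,
        log_config=log_config,  # None keeps uvicorn's default logging setup
    )


if __name__ == "__main__":
    run_proxy(log_config="path/to/log_config.conf")
```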
4 changes: 2 additions & 2 deletions docs/my-website/docs/proxy/deploy.md
@@ -125,7 +125,7 @@ WORKDIR /app
COPY config.yaml .
# Make sure your docker/entrypoint.sh is executable
-RUN chmod +x entrypoint.sh
+RUN chmod +x ./docker/entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
@@ -632,7 +632,7 @@ RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
WORKDIR /app
# Make sure your entrypoint.sh is executable
-RUN chmod +x entrypoint.sh
+RUN chmod +x ./docker/entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
5 changes: 3 additions & 2 deletions docs/my-website/docs/proxy/prometheus.md
@@ -134,8 +134,9 @@ Use this for LLM API Error monitoring and tracking remaining rate limits and tok

| Metric Name | Description |
|----------------------|--------------------------------------|
-| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `litellm_call_id`, `model`, `user_api_key`, `user_api_key_alias`, `user_api_team`, `user_api_team_alias` |
-| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `litellm_call_id`, `model`, `user_api_key`, `user_api_key_alias`, `user_api_team`, `user_api_team_alias` |
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` |
+| `litellm_llm_api_time_to_first_token_metric` | Time to first token for LLM API call - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` [Note: only emitted for streaming requests] |
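For intuition about what the new time-to-first-token metric measures, here is a hedged client-side sketch: TTFT is the delay between sending a streaming request and receiving the first chunk. The model name is illustrative, and this is not the proxy's internal instrumentation (that lives in `litellm/integrations/prometheus.py`, shown further below).

```python
# Client-side illustration of "time to first token" for a streaming call.
# Assumes credentials for the chosen model are configured (e.g. via env vars).
import time

import litellm

start = time.perf_counter()
stream = litellm.completion(
    model="gpt-3.5-turbo",  # illustrative model
    messages=[{"role": "user", "content": "hi"}],
    stream=True,  # the proxy only emits the TTFT metric for streaming requests
)
for _first_chunk in stream:
    ttft_seconds = time.perf_counter() - start
    print(f"time to first token: {ttft_seconds:.3f}s")
    break  # only the first chunk matters for TTFT
```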

## Virtual Key - Budget, Rate Limit Metrics

5 changes: 5 additions & 0 deletions litellm/__init__.py
@@ -98,6 +98,7 @@
openai_key: Optional[str] = None
groq_key: Optional[str] = None
databricks_key: Optional[str] = None
+openai_like_key: Optional[str] = None
azure_key: Optional[str] = None
anthropic_key: Optional[str] = None
replicate_key: Optional[str] = None
@@ -710,6 +711,8 @@ def add_known_models():

class LlmProviders(str, Enum):
    OPENAI = "openai"
+    OPENAI_LIKE = "openai_like"  # embedding only
+    JINA_AI = "jina_ai"
    CUSTOM_OPENAI = "custom_openai"
    TEXT_COMPLETION_OPENAI = "text-completion-openai"
    COHERE = "cohere"
@@ -1013,6 +1016,7 @@ class LlmProviders(str, Enum):
from .llms.fireworks_ai.embed.fireworks_ai_transformation import (
    FireworksAIEmbeddingConfig,
)
+from .llms.jina_ai.embedding.transformation import JinaAIEmbeddingConfig
from .llms.volcengine import VolcEngineConfig
from .llms.text_completion_codestral import MistralTextCompletionConfig
from .llms.AzureOpenAI.azure import (
@@ -1022,6 +1026,7 @@ class LlmProviders(str, Enum):

from .llms.AzureOpenAI.chat.gpt_transformation import AzureOpenAIConfig
from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig
+from .llms.perplexity.chat.transformation import PerplexityChatConfig
from .llms.AzureOpenAI.chat.o1_transformation import AzureOpenAIO1Config
from .llms.watsonx import IBMWatsonXAIConfig
from .main import * # type: ignore
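As a usage note for the new `jina_ai` provider (embedding support, per the commit above), here is a hedged sketch of calling Jina AI embeddings through litellm. The model name is illustrative, the `provider/model` prefix follows litellm's usual convention, and credentials are assumed to be configured (e.g. an API key env var, or `api_key=` passed explicitly).

```python
# Hedged usage sketch for the new jina_ai embedding provider.
import litellm

response = litellm.embedding(
    model="jina_ai/jina-embeddings-v3",  # illustrative model; "jina_ai/" prefix assumed
    input=["hello world", "good morning"],
)
print(len(response.data), "embedding vectors returned")  # OpenAI-compatible response shape
```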
43 changes: 41 additions & 2 deletions litellm/integrations/prometheus.py
@@ -97,6 +97,19 @@ def __init__(
            buckets=LATENCY_BUCKETS,
        )

+        self.litellm_llm_api_time_to_first_token_metric = Histogram(
+            "litellm_llm_api_time_to_first_token_metric",
+            "Time to first token for a models LLM API call",
+            labelnames=[
+                "model",
+                "hashed_api_key",
+                "api_key_alias",
+                "team",
+                "team_alias",
+            ],
+            buckets=LATENCY_BUCKETS,
+        )
+
        # Counter for spend
        self.litellm_spend_metric = Counter(
            "litellm_spend_metric",
@@ -335,14 +348,17 @@ async def async_log_success_event( # noqa: PLR0915
        )

        # unpack kwargs
-        standard_logging_payload: StandardLoggingPayload = kwargs.get(
-            "standard_logging_object", {}
+        standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
+            "standard_logging_object"
        )
+        if standard_logging_payload is None:
+            raise ValueError("standard_logging_object is required")
        model = kwargs.get("model", "")
        litellm_params = kwargs.get("litellm_params", {}) or {}
        _metadata = litellm_params.get("metadata", {})
        proxy_server_request = litellm_params.get("proxy_server_request") or {}
        end_user_id = proxy_server_request.get("body", {}).get("user", None)
+        model_parameters: dict = standard_logging_payload["model_parameters"]
        user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
        user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
        user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@@ -468,6 +484,28 @@ async def async_log_success_event( # noqa: PLR0915
        total_time_seconds = total_time.total_seconds()
        api_call_start_time = kwargs.get("api_call_start_time", None)

+        completion_start_time = kwargs.get("completion_start_time", None)
+
+        if (
+            completion_start_time is not None
+            and isinstance(completion_start_time, datetime)
+            and model_parameters.get("stream")
+            is True  # only emit for streaming requests
+        ):
+            time_to_first_token_seconds = (
+                completion_start_time - api_call_start_time
+            ).total_seconds()
+            self.litellm_llm_api_time_to_first_token_metric.labels(
+                model,
+                user_api_key,
+                user_api_key_alias,
+                user_api_team,
+                user_api_team_alias,
+            ).observe(time_to_first_token_seconds)
+        else:
+            verbose_logger.debug(
+                "Time to first token metric not emitted, stream option in model_parameters is not True"
+            )
        if api_call_start_time is not None and isinstance(
            api_call_start_time, datetime
        ):
@@ -512,6 +550,7 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti
            "standard_logging_object", {}
        )
        proxy_server_request = litellm_params.get("proxy_server_request") or {}
+
        end_user_id = proxy_server_request.get("body", {}).get("user", None)
        user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
        user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]