perf: remove 'always_read_redis' - adding +830ms on each llm call (#6414)

* perf: remove 'always_read_redis' - adding +830ms on each llm call
* test: cleanup codestral tests - backend api unavailable
BerriAI · Oct 25, 2024 · d59f8f9
1 parent 0f0470f
commit d59f8f9
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 57 deletions.
diff --git a/litellm/__init__.py b/litellm/__init__.py
@@ -160,9 +160,6 @@
 caching: bool = (
     False  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )
-always_read_redis: bool = (
-    True  # always use redis for rate limiting logic on litellm proxy
-)
 caching_with_models: bool = (
     False  # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 )

diff --git a/litellm/caching/dual_cache.py b/litellm/caching/dual_cache.py
@@ -32,7 +32,6 @@ def __init__(
         redis_cache: Optional[RedisCache] = None,
         default_in_memory_ttl: Optional[float] = None,
         default_redis_ttl: Optional[float] = None,
-        always_read_redis: Optional[bool] = True,
     ) -> None:
         super().__init__()
         # If in_memory_cache is not provided, use the default InMemoryCache
@@ -44,7 +43,6 @@ def __init__(
             default_in_memory_ttl or litellm.default_in_memory_ttl
         )
         self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
-        self.always_read_redis = always_read_redis
 
     def update_cache_ttl(
         self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]
@@ -102,12 +100,8 @@ def get_cache(self, key, local_only: bool = False, **kwargs):
                 if in_memory_result is not None:
                     result = in_memory_result
 
-            if (
-                (self.always_read_redis is True)
-                and self.redis_cache is not None
-                and local_only is False
-            ):
-                # If not found in in-memory cache or always_read_redis is True, try fetching from Redis
+            if result is None and self.redis_cache is not None and local_only is False:
+                # If not found in in-memory cache, try fetching from Redis
                 redis_result = self.redis_cache.get_cache(key, **kwargs)
 
                 if redis_result is not None:

diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py
@@ -171,7 +171,7 @@ async def async_service_success_hook(
                         try:
                             value = str(value)
                         except Exception:
-                            value = "litllm logging error - could_not_json_serialize"
+                            value = "litellm logging error - could_not_json_serialize"
                     self.safe_set_attribute(
                         span=service_logging_span,
                         key=key,

diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
@@ -1,10 +1,51 @@
 model_list:
   - model_name: gpt-4o
     litellm_params:
-      model: azure/gpt-4o-realtime-preview
-      api_key: os.environ/AZURE_SWEDEN_API_KEY
-      api_base: os.environ/AZURE_SWEDEN_API_BASE
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
 
 litellm_settings:
-  success_callback: ["langfuse"]
-  # logged_real_time_event_types: "*"
+  callbacks: ["prometheus", "otel"]
+
+general_settings:
+  user_api_key_cache_ttl: 3600
+
+router_settings:
+  routing_strategy: latency-based-routing
+  routing_strategy_args:
+    # only assign 40% of traffic to the fastest deployment to avoid overloading it
+    lowest_latency_buffer: 0.4
+
+    # consider last five minutes of calls for latency calculation
+    ttl: 300
+
+  # model_group_alias:
+  #   gpt-4o: gpt-4o-128k-2024-05-13
+  #   gpt-4o-mini: gpt-4o-mini-128k-2024-07-18
+
+  enable_tag_filtering: True
+
+  # retry call 3 times on each model_name (we don't use fallbacks, so this would be 3 times total)
+  num_retries: 3
+
+  # -- cooldown settings --
+  # see https://github.com/BerriAI/litellm/blob/main/litellm/router_utils/cooldown_handlers.py#L265
+
+  # cooldown model if it fails > n calls in a minute.
+  allowed_fails: 2
+
+  # (in seconds) how long to cooldown model if fails/min > allowed_fails
+  cooldown_time: 60
+
+  allowed_fails_policy:
+    InternalServerErrorAllowedFails: 1
+    RateLimitErrorAllowedFails: 2
+    TimeoutErrorAllowedFails: 3
+  # -- end cooldown settings --
+
+  # see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
+  redis_host: os.environ/REDIS_HOST
+  redis_port: os.environ/REDIS_PORT
+  redis_password: os.environ/REDIS_PASSWORD
+
diff --git a/tests/local_testing/test_caching.py b/tests/local_testing/test_caching.py
@@ -2066,46 +2066,6 @@ async def test_cache_default_off_acompletion():
     assert response3.id == response4.id
 
 
-@pytest.mark.asyncio()
-@pytest.mark.skip(reason="dual caching should first prioritze local cache")
-async def test_dual_cache_uses_redis():
-    """
-
-    - Store diff values in redis and in memory cache
-    - call get cache
-    - Assert that value from redis is used
-    """
-    litellm.set_verbose = True
-    from litellm.caching.caching import DualCache, RedisCache
-
-    current_usage = uuid.uuid4()
-
-    _cache_obj = DualCache(redis_cache=RedisCache(), always_read_redis=True)
-
-    # set cache
-    await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)
-
-    # modify value of in memory cache
-    _cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1
-
-    # get cache
-    value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
-    print("value from dual cache", value)
-    assert value == 10
-
-
-@pytest.mark.asyncio()
-async def test_proxy_logging_setup():
-    """
-    Assert always_read_redis is True when used by internal usage cache
-    """
-    from litellm.caching.caching import DualCache
-    from litellm.proxy.utils import ProxyLogging
-
-    pl_obj = ProxyLogging(user_api_key_cache=DualCache())
-    assert pl_obj.internal_usage_cache.dual_cache.always_read_redis is True
-
-
 @pytest.mark.skip(reason="local test. Requires sentinel setup.")
 @pytest.mark.asyncio
 async def test_redis_sentinel_caching():