From e35a7c32cb529bc329dffbe008dcdc3ebe66e4e0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:34:33 -0800 Subject: [PATCH 01/60] fix(proxy/utils.py): if langfuse trace id passed in, just send that as part of alert --- litellm/proxy/utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d726552..20d958bae01b 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -149,12 +149,20 @@ async def response_taking_too_long( if request_data is not None: model = request_data.get("model", "") messages = request_data.get("messages", "") - # try casting messages to str and get the first 100 characters, else mark as None - try: + trace_id = request_data.get("metadata", {}).get( + "trace_id", None + ) # get langfuse trace id + if trace_id is not None: messages = str(messages) - messages = messages[:10000] - except: - messages = None + messages = messages[:100] + messages = f"{messages}\nLangfuse Trace Id: {trace_id}" + else: + # try casting messages to str and get the first 100 characters, else mark as None + try: + messages = str(messages) + messages = messages[:10000] + except: + messages = None request_info = f"\nRequest Model: {model}\nMessages: {messages}" else: From 0609968853428f2ae14700792eba746713b21dfc Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 14:36:24 -0800 Subject: [PATCH 02/60] test(test_key_generate_dynamodb.py): fix test --- litellm/tests/test_key_generate_dynamodb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_key_generate_dynamodb.py b/litellm/tests/test_key_generate_dynamodb.py index 61d0ff6a66e6..e77dc7472315 100644 --- a/litellm/tests/test_key_generate_dynamodb.py +++ b/litellm/tests/test_key_generate_dynamodb.py @@ -490,8 +490,13 @@ def test_dynamo_db_migration(custom_db_client): try: async def test(): + request = GenerateKeyRequest(max_budget=1) + key = await generate_key_fn(request) + print(key) + + generated_key = key.key bearer_token = ( - "Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg" + "Bearer " + generated_key ) # this works with ishaan's db, it's a never expiring key request = Request(scope={"type": "http"}) From 2dab09cde17e5c7e3f114e9a16c359ad7af69940 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 5 Feb 2024 17:07:57 -0800 Subject: [PATCH 03/60] Update model_prices_and_context_window.json --- model_prices_and_context_window.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index b6ded001c93a..4c28bdbe8b35 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From f48c92e817fd7e199ed7d5e5abca389968ac43b3 Mon Sep 17 00:00:00 2001 From: John HU Date: Mon, 5 Feb 2024 17:30:39 -0800 Subject: [PATCH 04/60] Fix admin UI title and description --- ui/litellm-dashboard/src/app/layout.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/litellm-dashboard/src/app/layout.tsx b/ui/litellm-dashboard/src/app/layout.tsx index 3314e4780a0c..a04a0d66edce 100644 --- a/ui/litellm-dashboard/src/app/layout.tsx +++ 
b/ui/litellm-dashboard/src/app/layout.tsx @@ -5,8 +5,8 @@ import "./globals.css"; const inter = Inter({ subsets: ["latin"] }); export const metadata: Metadata = { - title: "Create Next App", - description: "Generated by create next app", + title: "🚅 LiteLLM", + description: "LiteLLM Proxy Admin UI", }; export default function RootLayout({ From f363f0f5bafbaf3e440d6729fbb5d66c6a637d88 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:16:15 -0800 Subject: [PATCH 05/60] fix(langfuse.py): support logging failed llm api calls to langfuse --- litellm/integrations/langfuse.py | 208 +++++++++++++++++++------------ litellm/utils.py | 58 ++++----- 2 files changed, 156 insertions(+), 110 deletions(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index e62dccdc47e2..82de33366096 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -55,8 +55,21 @@ def __init__(self, langfuse_public_key=None, langfuse_secret=None): else: self.upstream_langfuse = None + # def log_error(kwargs, response_obj, start_time, end_time): + # generation = trace.generation( + # level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR + # status_message='error' # can be any string (e.g. stringified stack trace or error body) + # ) def log_event( - self, kwargs, response_obj, start_time, end_time, user_id, print_verbose + self, + kwargs, + response_obj, + start_time, + end_time, + user_id, + print_verbose, + level="DEFAULT", + status_message=None, ): # Method definition @@ -84,37 +97,49 @@ def log_event( pass # end of processing langfuse ######################## - if kwargs.get("call_type", None) == "embedding" or isinstance( - response_obj, litellm.EmbeddingResponse + if ( + level == "ERROR" + and status_message is not None + and isinstance(status_message, str) + ): + input = prompt + output = status_message + elif response_obj is not None and ( + kwargs.get("call_type", None) == "embedding" + or isinstance(response_obj, litellm.EmbeddingResponse) ): input = prompt output = response_obj["data"] - else: + elif response_obj is not None: input = prompt output = response_obj["choices"][0]["message"].json() - print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") - self._log_langfuse_v2( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - print_verbose, - ) if self._is_langfuse_v2() else self._log_langfuse_v1( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - ) + print(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") + if self._is_langfuse_v2(): + self._log_langfuse_v2( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + level, + print_verbose, + ) + elif response_obj is not None: + self._log_langfuse_v1( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + ) self.Langfuse.flush() print_verbose( @@ -123,15 +148,15 @@ def log_event( verbose_logger.info(f"Langfuse Layer Logging - logging success") except: traceback.print_exc() - print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") + print(f"Langfuse Layer Error - {traceback.format_exc()}") pass async def _async_log_event( self, kwargs, response_obj, start_time, end_time, user_id, print_verbose ): - self.log_event( - kwargs, response_obj, start_time, end_time, user_id, print_verbose - ) + """ + TODO: 
support async calls when langfuse is truly async + """ def _is_langfuse_v2(self): import langfuse @@ -193,57 +218,78 @@ def _log_langfuse_v2( optional_params, input, response_obj, + level, print_verbose, ): import langfuse - tags = [] - supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") - supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") - - print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") - - generation_name = metadata.get("generation_name", None) - if generation_name is None: - # just log `litellm-{call_type}` as the generation name - generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" - - trace_params = { - "name": generation_name, - "input": input, - "output": output, - "user_id": metadata.get("trace_user_id", user_id), - "id": metadata.get("trace_id", None), - "session_id": metadata.get("session_id", None), - } - cost = kwargs["response_cost"] - print_verbose(f"trace: {cost}") - if supports_tags: - for key, value in metadata.items(): - tags.append(f"{key}:{value}") - if "cache_hit" in kwargs: - tags.append(f"cache_hit:{kwargs['cache_hit']}") - trace_params.update({"tags": tags}) - - trace = self.Langfuse.trace(**trace_params) - - # get generation_id - generation_id = None - if response_obj.get("id", None) is not None: - generation_id = litellm.utils.get_logging_id(start_time, response_obj) - trace.generation( - name=generation_name, - id=metadata.get("generation_id", generation_id), - startTime=start_time, - endTime=end_time, - model=kwargs["model"], - modelParameters=optional_params, - input=input, - output=output, - usage={ - "prompt_tokens": response_obj["usage"]["prompt_tokens"], - "completion_tokens": response_obj["usage"]["completion_tokens"], - "total_cost": cost if supports_costs else None, - }, - metadata=metadata, - ) + try: + tags = [] + supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") + supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") + + print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") + + generation_name = metadata.get("generation_name", None) + if generation_name is None: + # just log `litellm-{call_type}` as the generation name + generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" + + trace_params = { + "name": generation_name, + "input": input, + "user_id": metadata.get("trace_user_id", user_id), + "id": metadata.get("trace_id", None), + "session_id": metadata.get("session_id", None), + } + + if level == "ERROR": + trace_params["status_message"] = output + else: + trace_params["output"] = output + + cost = kwargs.get("response_cost", None) + print_verbose(f"trace: {cost}") + if supports_tags: + for key, value in metadata.items(): + tags.append(f"{key}:{value}") + if "cache_hit" in kwargs: + tags.append(f"cache_hit:{kwargs['cache_hit']}") + trace_params.update({"tags": tags}) + + trace = self.Langfuse.trace(**trace_params) + + if level == "ERROR": + trace.generation( + level="ERROR", # can be any of DEBUG, DEFAULT, WARNING or ERROR + status_message=output, # can be any string (e.g. 
stringified stack trace or error body) + ) + print(f"SUCCESSFULLY LOGGED ERROR") + else: + # get generation_id + generation_id = None + if ( + response_obj is not None + and response_obj.get("id", None) is not None + ): + generation_id = litellm.utils.get_logging_id( + start_time, response_obj + ) + trace.generation( + name=generation_name, + id=metadata.get("generation_id", generation_id), + startTime=start_time, + endTime=end_time, + model=kwargs["model"], + modelParameters=optional_params, + input=input, + output=output, + usage={ + "prompt_tokens": response_obj["usage"]["prompt_tokens"], + "completion_tokens": response_obj["usage"]["completion_tokens"], + "total_cost": cost if supports_costs else None, + }, + metadata=metadata, + ) + except Exception as e: + print(f"Langfuse Layer Error - {traceback.format_exc()}") diff --git a/litellm/utils.py b/litellm/utils.py index e56ba879f8ff..1e83a319f433 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1636,34 +1636,6 @@ async def async_success_handler( end_time=end_time, print_verbose=print_verbose, ) - if callback == "langfuse": - global langFuseLogger - print_verbose("reaches Async langfuse for logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - # this only logs streaming once, complete_streaming_response exists i.e when stream ends - if self.stream: - if "complete_streaming_response" not in kwargs: - return - else: - print_verbose( - "reaches Async langfuse for streaming logging!" - ) - result = kwargs["complete_streaming_response"] - if langFuseLogger is None: - langFuseLogger = LangFuseLogger() - await langFuseLogger._async_log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) except: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" @@ -1788,9 +1760,37 @@ def failure_handler( response_obj=result, kwargs=self.model_call_details, ) + elif callback == "langfuse": + global langFuseLogger + verbose_logger.debug("reaches langfuse for logging!") + kwargs = {} + for k, v in self.model_call_details.items(): + if ( + k != "original_response" + ): # copy.deepcopy raises errors as this could be a coroutine + kwargs[k] = v + # this only logs streaming once, complete_streaming_response exists i.e when stream ends + if langFuseLogger is None or ( + self.langfuse_public_key != langFuseLogger.public_key + and self.langfuse_secret != langFuseLogger.secret_key + ): + langFuseLogger = LangFuseLogger( + langfuse_public_key=self.langfuse_public_key, + langfuse_secret=self.langfuse_secret, + ) + langFuseLogger.log_event( + start_time=start_time, + end_time=end_time, + response_obj=None, + user_id=kwargs.get("user", None), + print_verbose=print_verbose, + status_message=str(exception), + level="ERROR", + kwargs=self.model_call_details, + ) except Exception as e: print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}" + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {str(e)}" ) print_verbose( f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}" From 0996ea3f364451868e377489166636ca66944a9e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:37:05 -0800 Subject: 
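Patch 05 above is what routes failed LLM calls into Langfuse. A minimal sketch of exercising that path from the SDK, assuming Langfuse credentials are set in the environment; the key values below are placeholders and the invalid API key is intentional:

```python
import os
import litellm

# Langfuse credentials are read from the environment by LangFuseLogger
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-..."  # placeholder
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-..."  # placeholder

litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]  # the path this patch adds

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        api_key="sk-invalid-on-purpose",  # force an authentication error
    )
except Exception:
    # the exception still reaches the caller; before that, the failure
    # handler logs a generation with level="ERROR" and the stringified
    # exception as status_message
    pass
```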
[PATCH 06/60] (docs) upperbound_key_generate_params --- docs/my-website/docs/proxy/virtual_keys.md | 16 ++++++++++++++++ .../model_prices_and_context_window_backup.json | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index dd5edc6da81b..c51bfc0ac90b 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -352,6 +352,22 @@ Request Params: } ``` +## Upperbound /key/generate params +Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. + +Set `litellm_settings:upperbound_key_generate_params`: +```yaml +litellm_settings: + upperbound_key_generate_params: + max_budget: 100 # upperbound of $100, for all /key/generate requests + duration: "30d" # upperbound of 30 days for all /key/generate requests +``` + +** Expected Behavior ** + +- Send a `/key/generate` request with `max_budget=200` +- Key will be created with `max_budget=100` since 100 is the upper bound + ## Default /key/generate params Use this, if you need to control the default `max_budget` or any `key/generate` param per key. diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index b6ded001c93a..4c28bdbe8b35 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 8d0c235004b0a838fecd5d2b4cbc2092fbe3e712 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:38:47 -0800 Subject: [PATCH 07/60] (feat) upperbound_key_generate_params --- litellm/__init__.py | 1 + litellm/proxy/proxy_server.py | 69 +++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 3f2a1e4b4d25..26b761c64a1b 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -146,6 +146,7 @@ dynamodb_table_name: Optional[str] = None s3_callback_params: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None +upperbound_key_generate_params: Optional[Dict] = None default_team_settings: Optional[List] = None #### RELIABILITY #### request_timeout: Optional[float] = 6000 diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 289a36cb2b0e..494c874147ba 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1391,6 +1391,26 @@ async def load_config( proxy_config = ProxyConfig() +def _duration_in_seconds(duration: str): + match = re.match(r"(\d+)([smhd]?)", duration) + if not match: + raise ValueError("Invalid duration format") + + value, unit = match.groups() + value = int(value) + + if unit == "s": + return value + elif unit == "m": + return value * 60 + elif unit == "h": + return value * 3600 + elif unit == "d": + return value * 86400 + else: + raise ValueError("Unsupported duration unit") + + async def generate_key_helper_fn( duration: Optional[str], models: list, @@ -1425,25 +1445,6 @@ async def generate_key_helper_fn( if token is None: token = f"sk-{secrets.token_urlsafe(16)}" - def _duration_in_seconds(duration: str): - match = 
re.match(r"(\d+)([smhd]?)", duration) - if not match: - raise ValueError("Invalid duration format") - - value, unit = match.groups() - value = int(value) - - if unit == "s": - return value - elif unit == "m": - return value * 60 - elif unit == "h": - return value * 3600 - elif unit == "d": - return value * 86400 - else: - raise ValueError("Unsupported duration unit") - if duration is None: # allow tokens that never expire expires = None else: @@ -2660,6 +2661,36 @@ async def generate_key_fn( elif key == "metadata" and value == {}: setattr(data, key, litellm.default_key_generate_params.get(key, {})) + # check if user set default key/generate params on config.yaml + if litellm.upperbound_key_generate_params is not None: + for elem in data: + # if key in litellm.upperbound_key_generate_params, use the min of value and litellm.upperbound_key_generate_params[key] + key, value = elem + if value is not None and key in litellm.upperbound_key_generate_params: + # if value is float/int + if key in [ + "max_budget", + "max_parallel_requests", + "tpm_limit", + "rpm_limit", + ]: + if value > litellm.upperbound_key_generate_params[key]: + # directly compare floats/ints + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + elif key == "budget_duration": + # budgets are in 1s, 1m, 1h, 1d, 1m (30s, 30m, 30h, 30d, 30m) + # compare the duration in seconds and max duration in seconds + upperbound_budget_duration = _duration_in_seconds( + duration=litellm.upperbound_key_generate_params[key] + ) + user_set_budget_duration = _duration_in_seconds(duration=value) + if user_set_budget_duration > upperbound_budget_duration: + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + data_json = data.json() # type: ignore # if we get max_budget passed to /key/generate, then use it as key_max_budget. 
Since generate_key_helper_fn is used to make new users From e21f9064638acee868a2cf796b4e5d94bbf9e8c9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:39:36 -0800 Subject: [PATCH 08/60] (test) test_upperbound_key_params --- litellm/tests/test_key_generate_prisma.py | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index de26168591c7..b4c86afb25ff 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -1279,6 +1279,40 @@ async def test_default_key_params(prisma_client): pytest.fail(f"Got exception {e}") +@pytest.mark.asyncio() +async def test_upperbound_key_params(prisma_client): + """ + - create key + - get key info + - assert key_name is not null + """ + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + litellm.upperbound_key_generate_params = { + "max_budget": 0.001, + "budget_duration": "1m", + } + await litellm.proxy.proxy_server.prisma_client.connect() + try: + request = GenerateKeyRequest( + max_budget=200000, + budget_duration="30d", + ) + key = await generate_key_fn(request) + generated_key = key.key + + result = await info_key_fn(key=generated_key) + key_info = result["info"] + # assert it used the upper bound for max_budget, and budget_duration + assert key_info["max_budget"] == 0.001 + assert key_info["budget_duration"] == "1m" + + print(result) + except Exception as e: + print("Got Exception", e) + pytest.fail(f"Got exception {e}") + + def test_get_bearer_token(): from litellm.proxy.proxy_server import _get_bearer_token From 732ac6df4948b4143f6702fac7a332c81a839739 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:40:52 -0800 Subject: [PATCH 09/60] (feat) proxy - upperbound params /key/generate --- litellm/proxy/proxy_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a752f7..bd844bd7ba3e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,6 +73,9 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None + upperbound_key_generate_params: + max_budget: 100 + duration: "30d" # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From ca029d13ee268fae5bca9c6340f3fc34067fb9ce Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:51:08 -0800 Subject: [PATCH 10/60] (fix) proxy startup test --- .../test_configs/test_config_no_auth.yaml | 95 ------------------- 1 file changed, 95 deletions(-) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index 8441018e355a..ccebe016db47 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,21 +9,11 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model -- litellm_params: - api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 - api_key: os.environ/AZURE_API_KEY - model: azure/chatgpt-v-2 - model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo 
model_name: azure-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -36,93 +26,8 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 - model_name: test_openai_models -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - mode: embedding - model_name: azure-embedding-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 55848c55-4162-40f9-a6e2-9a722b9ef404 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34339b1e-e030-4bcc-a531-c48559f10ce4 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: f6f74e14-ac64-4403-9365-319e584dcdc5 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 9b1ef341-322c-410a-8992-903987fef439 - model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings -- litellm_params: - model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 - model_info: - mode: embedding - model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) -- litellm_params: - model: dall-e-3 - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_SWEDEN_API_BASE - api_key: os.environ/AZURE_SWEDEN_API_KEY - api_version: 2023-12-01-preview - model: azure/dall-e-3-test - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-06-01-preview - model: azure/ - model_info: - mode: image_generation - model_name: dall-e-2 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - base_model: text-embedding-ada-002 - mode: embedding - model_name: text-embedding-ada-002 -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 - model_name: test_openai_models From 7ccb7c00d8b6c8ba73246362cf43b914a96e6f92 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:53:31 -0800 Subject: [PATCH 11/60] (ci/cd) print debug info for test_proxy_gunicorn_startup_config_dict --- litellm/tests/test_proxy_startup.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_proxy_startup.py index 650e2f8a7a65..a846c9f4a3c4 100644 --- a/litellm/tests/test_proxy_startup.py +++ b/litellm/tests/test_proxy_startup.py @@ -33,6 +33,11 @@ def test_proxy_gunicorn_startup_direct_config(): Test both approaches """ try: + from litellm._logging import 
verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -48,6 +53,11 @@ def test_proxy_gunicorn_startup_direct_config(): def test_proxy_gunicorn_startup_config_dict(): try: + from litellm._logging import verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" From 3b977679f8951664438b84a702982681c94329fc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 06:46:49 -0800 Subject: [PATCH 12/60] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 34dc0e3b57bf..528bb19d2a0e 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -379,6 +379,7 @@ async def test_normal_router_tpm_limit(): ) except Exception as e: + print("Exception on test_normal_router_tpm_limit", e) assert e.status_code == 429 From d189e95045f896238f596d323e9f53d697950f81 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:22:54 -0800 Subject: [PATCH 13/60] fix(ollama_chat.py): fix ollama chat completion token counting --- litellm/llms/ollama_chat.py | 8 ++++++-- litellm/utils.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 95ff8dfaa3a2..3628ae2903c2 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -320,11 +320,15 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): model_response["choices"][0]["message"] = message else: model_response["choices"][0]["message"] = response_json["message"] + model_response["created"] = int(time.time()) - model_response["model"] = "ollama/" + data["model"] + model_response["model"] = "ollama_chat/" + data["model"] prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore completion_tokens = response_json.get( - "eval_count", litellm.token_counter(text=response_json["message"]) + "eval_count", + litellm.token_counter( + text=response_json["message"]["content"], count_response_tokens=True + ), ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, diff --git a/litellm/utils.py b/litellm/utils.py index 1e83a319f433..8491a1d5e106 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -983,9 +983,6 @@ def post_call( verbose_logger.debug( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) - verbose_logger.debug( - f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}" - ) if self.logger_fn and callable(self.logger_fn): try: self.logger_fn( From 3db9830d4bfeea77e2a8e57eb3478990b96dc1d3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:26:13 -0800 Subject: [PATCH 14/60] fix(utils.py): use print_verbose for statements, so debug can be seen when running sdk --- litellm/utils.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8491a1d5e106..5ccb85ef0514 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -980,7 +980,7 @@ def post_call( self.model_call_details["log_event_type"] = "post_api_call" # User Logging -> if you pass in a custom logging function - verbose_logger.debug( + print_verbose( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) if self.logger_fn and callable(self.logger_fn): From 80eb8d0eaee97f04ce2a36128676370c694438a5 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:43:47 -0800 Subject: [PATCH 15/60] fix(ollama_chat.py): explicitly state if ollama call is streaming or not --- litellm/llms/ollama_chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 3628ae2903c2..d1a439398b23 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,7 +146,12 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "messages": messages, "options": optional_params} + data = { + "model": model, + "messages": messages, + "options": optional_params, + "stream": stream, + } ## LOGGING logging_obj.pre_call( input=None, From 7b286f38ce421bbe3b4948762a2b4356cdee9e9f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:44:04 -0800 Subject: [PATCH 16/60] =?UTF-8?q?bump:=20version=201.22.6=20=E2=86=92=201.?= =?UTF-8?q?22.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06dedbed63b9..be8c8966bec5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.6" +version = "1.22.7" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.6" +version = "1.22.7" version_files = [ "pyproject.toml:^version" ] From 944833b04ad3b144dec76c70dd840ae91254d971 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 17/60] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d74..768e8dff3f38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From b47f9dcb6dbebabe0ba2fa193840dda4f4d2a30a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 18/60] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c4b..9339deb78d7c 100644 --- a/litellm/llms/ollama.py +++ 
b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b23..0311931b1390 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From fbf95ca9c7ff02817ed97a5086924f75b1fd4005 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 19/60] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=201.?= =?UTF-8?q?22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966bec5..17d80ae8ee15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From 45ab0f01c09abc5a363c4f4b518ae5b642b35589 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 20/60] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1fb..9a29ed07ca38 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204d7..e0ee05d4f4f8 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From b3a4982eda7e33c014ed09fe4406e51643ec8bda Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 21/60] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py 
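The `format` and explicit `stream` fields added above map directly onto Ollama's REST API. A sketch of the request body the patched code ends up sending, assuming a local Ollama server on the default port, a pulled model, and the `requests` package; the model name and prompt are placeholders:

```python
import requests  # assumes a local Ollama server at the default port

payload = {
    "model": "llama2",  # any locally pulled model
    "prompt": "Return a JSON object describing the moon.",
    "options": {"temperature": 0.2},
    "stream": False,    # one JSON response instead of NDJSON chunks
    "format": "json",   # ask Ollama to emit valid JSON only
}

resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=600)
data = resp.json()

print(data["response"])  # model output (a JSON string here)
# the token counts the proxy reads for usage accounting
print(data.get("prompt_eval_count"), data.get("eval_count"))
```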
b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 233590e8c2f9f2b244cb292c65433bedd5ca858e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 22/60] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3c4..024d69b1ffef 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From 334acfb5f8b97f812748f81772089de88422e5dc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 23/60] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca38..c1224159a1fb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From 0d5f6cacc42005cb6bc7aca6fc2f9a417eede014 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 24/60] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4f8..bd0301f204d7 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # 
Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 97dbf14b321f6c80603744ff238b4c9fc392ef8e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 25/60] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a5011..48cf5b7799b0 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. From 6640690ad62c3609bb36d10b9cbf71226951c3c8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 26/60] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a0e..bfac8ddeae9a 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From 5b63827430ed3e1e6517a7d4cd3e65cd8f4430fe Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 27/60] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db47..9d7aff570227 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: 
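The parallel-request-limiter changes above revolve around per-minute cache keys of the form `{api_key}::{YYYY-MM-DD-HH-MM}::request_count`. A toy sketch of that fixed-window bucketing, using a plain dict instead of the proxy's cache and omitting the in-flight request accounting the real hook does:

```python
from datetime import datetime

cache: dict = {}  # toy stand-in for the proxy's user_api_key_cache


def record_usage(api_key: str, total_tokens: int, tpm_limit: int, rpm_limit: int) -> bool:
    """Return True if this request fits in the current one-minute window."""
    now = datetime.now()
    precise_minute = f"{now.strftime('%Y-%m-%d')}-{now.strftime('%H')}-{now.strftime('%M')}"
    key = f"{api_key}::{precise_minute}::request_count"  # same shape as the proxy's cache key

    bucket = cache.get(key, {"current_requests": 0, "current_tpm": 0, "current_rpm": 0})
    if bucket["current_tpm"] + total_tokens > tpm_limit or bucket["current_rpm"] + 1 > rpm_limit:
        return False  # the real hook raises an HTTP 429 here

    bucket["current_tpm"] += total_tokens
    bucket["current_rpm"] += 1
    bucket["current_requests"] += 1
    cache[key] = bucket  # the proxy stores this with a 60s TTL
    return True


print(record_usage("sk-test", total_tokens=42, tpm_limit=10, rpm_limit=10))  # False: 42 > 10
print(record_usage("sk-test", total_tokens=5, tpm_limit=10, rpm_limit=10))   # True
```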
azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 01701c95b8eff602d6683461a1a618b423e66f1c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 28/60] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204d7..e0ee05d4f4f8 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 
@@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 647dbb9331a4bfe8df3c66cc2f0f67509aa0cdfa Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 29/60] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4f8..bd0301f204d7 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From eee5353e7745dd40f10eac358422ad7c9daaa210 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 30/60] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204d7..de79c97afa96 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef0514..fdca57e51f2b 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def wrapper(*args, **kwargs): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 6de6da71b77c35989f2085907f382098feb16648 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 31/60] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=201.?= =?UTF-8?q?22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee15..944aad7f8bb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = 
["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From d85b1f8816d0957ec75f80d66137aefb320fba45 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 32/60] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de33366096..3c3e793dfb03 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ def _log_langfuse_v2( print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From d67a9ada4f588a062f62f7ad9a49c429405a34d9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 33/60] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a936..e1ef95dc3499 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ def flush_cache(self): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ async def async_get_cache(self, key, **kwargs): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ def delete_cache(self, key): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = 
json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ def delete_cache(self, key): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ def __init__( """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80fa9..32904ab7846b 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From a510adb1e6b811def30935c50281c1737fb0589a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 34/60] (feat) working - sync semantic caching --- litellm/caching.py | 231 ++++++++++++++++++++++++++++++--------------- 1 file changed, 154 insertions(+), 77 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc3499..0a1046f0d8a0 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ def delete_cache(self, key): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + 
password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query - - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' - - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) - - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index + + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response + + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + def set_cache(self, key, value, **kwargs): + import numpy as np - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + print_verbose(f"redis semantic-cache set_cache, kwargs: 
{kwargs}") - # create embedding response + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - stored_redis_value = self.redis_client.json().get(name=key) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - # print("Stored Redis Value: ", stored_redis_value) + # Add more data + keys = self.index.load(new_data) - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ def __init__( host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ def __init__( Initializes the cache based on the given type. 
Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ def __init__( if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ def get_cache(self, *args, **kwargs): The cached result if it exists, otherwise None. """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ def get_cache(self, *args, **kwargs): max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From a5afbf6d56b2cdf33e1c6752600a5598a2092ead Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 35/60] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab7846b..3ac812cf35ed 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 2ad8b70f50be1ca51e45f0c48daf8b82f8767b90 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 36/60] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35ed..4b47614ccacc 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] 
+ + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 553b993473e1cfc60fae5dfee0ef38fd0c052d7b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 37/60] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8a0..877f935fab05 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ def __init__( redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 08d72fd2a0806586b2def50fee8011bfb8c3b985 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 38/60] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab05..ad37f2077cb5 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ def __init__( password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ def __init__( ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ def set_cache(self, key, value, **kwargs): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query 
import VectorQuery import numpy as np @@ -360,6 +366,11 @@ def get_cache(self, key, **kwargs): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ def get_cache(self, key, **kwargs): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! 
+ return None pass @@ -612,6 +707,7 @@ def __init__( s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ def __init__( port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ async def async_get_cache(self, *args, **kwargs): Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ async def async_get_cache(self, *args, **kwargs): max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 1d151e47777c9c5ae1488fadc2ed364392c8845a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 39/60] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614ccacc..a1a42ff659d9 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From aa7580411da0f115cf1480a53f3b75660507edf2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 40/60] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077cb5..a7958d074c29 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ def __init__( if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + 
password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ def get_cache(self, key, **kwargs): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From c8d5714e59ab24aebcb0101249af31bf6926b699 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 41/60] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index fdca57e51f2b..c25572c03c90 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2533,6 +2533,14 @@ async def wrapper_async(*args, **kwargs): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From e0d5c953d660f7834fb3efb250740d193dddc463 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 42/60] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba3e..41c3b4182859 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # 
setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From fb1212ac8269f872cc81c58280065ee4c0fd8020 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 43/60] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff242..4730fc28b15c 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f38..b0a49553d1b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From e2c88ce1542e6ebf4f1c51c2734db2f2aed1e447 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 44/60] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c29..133d1db6dd5f 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ async def async_get_cache(self, key, **kwargs): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ async def async_get_cache(self, key, **kwargs): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ async def async_get_cache(self, *args, **kwargs): "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 91424b66d7101f0293996949ce799e104b9497b7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 45/60] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd5f..6bf53ea45100 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ def __init__( redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ def __init__( if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ def set_cache(self, key, value, **kwargs): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ def get_cache(self, key, **kwargs): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ def get_cache(self, key, **kwargs): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ async def async_set_cache(self, key, value, **kwargs): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ async def async_get_cache(self, key, **kwargs): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ async def async_get_cache(self, key, **kwargs): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, 
+ router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ def __init__( s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ def __init__( password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From 58f47c9e29d0704a88f9d87594fc467e8342ba95 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 46/60] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b4182859..326544f41ef2 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 04433c01fd79b5f13bc9156831bd4d390ccd5518 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 47/60] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34ec..3f2687824199 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
+ + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From f8248b2c798f18d1e0d3b108c22a9c07f855da8e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 48/60] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147ba..661e932f378a 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ async def load_config( verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ async def load_config( f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1b2..3ace5872adb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From f3de05cc54a48c6ec076e722ec913769e6fc16db Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 49/60] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659d9..cc18dda165da 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + 
random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From b49b37568a44487df5923d7fef41d357ff4a614e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 50/60] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087ca7..7b21d35b6cbf 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 102f20fc030a890db73706a1c7737edced6d9b6b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 51/60] (docs) litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6cbf..75e1db9557f9 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ 
b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f2687824199..d5b589e5c22b 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From e32c2beddd89c4db65476d876b9144b946633254 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 52/60] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165da..96fd8eb9d23a 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From 43061d612df98e99ba2694af43b3d33cf10f40fe Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 53/60] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d23a..6cb5b974a18e 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From b5db630dba4f07b14fa95a4a1c94c42e75d066ca Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 54/60] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a18e..8433941e90f2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 0d03b28a3b7548b9655a0b66b15c885ad4bbe43d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 55/60] test(test_completion.py): fix test --- 
docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c22b..2b385de8e5e4 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa96..b075e4819044 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From 325ca43946bfb507cd3e3f66cbbc7f72caa3d6d0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:35:25 -0800 Subject: [PATCH 56/60] (feat) show semantic-cache on health/readiness --- litellm/caching.py | 3 +++ litellm/proxy/proxy_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/litellm/caching.py b/litellm/caching.py index 6bf53ea45100..f996a587354b 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -531,6 +531,9 @@ async def async_get_cache(self, key, **kwargs): return None pass + async def _index_info(self): + return await self.index.ainfo() + class S3Cache(BaseCache): def __init__( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 661e932f378a..427bb88a9c46 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,8 +4051,18 @@ async def health_readiness(): cache_type = None if litellm.cache is not None: + from litellm.caching import RedisSemanticCache + cache_type = litellm.cache.type + if isinstance(litellm.cache.cache, RedisSemanticCache): + # ping the cache + try: + index_info = await litellm.cache.cache._index_info() + except Exception as e: + index_info = "index does not exist - error: " + str(e) + cache_type = {"type": cache_type, "index_info": index_info} + if prisma_client is not None: # if db passed in, check if it's connected if prisma_client.db.is_connected() == True: response_object = {"db": "connected"} From eb3b68a2f0454f22b16940c572a72d27aefca7df Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:36:35 -0800 Subject: [PATCH 57/60] (fix) dockerfile requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872adb3..f2bff2680b65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==4.6.0 # 
caching +redis==5.0.0 # caching redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db From 8ba2c8dbf7604257f712dbc3d154646a60fb9738 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:41:22 -0800 Subject: [PATCH 58/60] (fix) langfuse show semantic-similarity in tags --- litellm/integrations/langfuse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 3c3e793dfb03..3031868ec758 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -255,6 +255,7 @@ def _log_langfuse_v2( if key in [ "user_api_key", "user_api_key_user_id", + "semantic-similarity", ]: tags.append(f"{key}:{value}") if "cache_hit" in kwargs: From 78f75647da2cc7fdd2353c79c56eefd149ec843a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:52:32 -0800 Subject: [PATCH 59/60] (fix) redisvl requirements.txt issue --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f2bff2680b65..55c5f14568d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==5.0.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From be81183782e270e4fb2b88fa158c1a56a8033040 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:55:51 -0800 Subject: [PATCH 60/60] refactor(main.py): trigger deploy n --- litellm/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index 384dadc32d5b..b18221607fac 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,7 +10,6 @@ import os, openai, sys, json, inspect, uuid, datetime, threading from typing import Any, Literal, Union from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx
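
Taken together, the patches above wire a new `redis-semantic` cache type through `litellm.Cache`, the proxy config, and the docs. Below is a minimal client-side sketch of the resulting usage, not part of the patch series itself: parameter names are taken from the diffs above, the Redis credentials, OpenAI key, and prompts are placeholders, and whether the second call is a hit depends on the configured `similarity_threshold`.

```python
# Sketch only - exercises the redis-semantic cache type added in the patches above.
# Assumes REDIS_HOST / REDIS_PORT / REDIS_PASSWORD and OPENAI_API_KEY are exported,
# and that redisvl==0.0.7 is installed (the docs above install it with pip).
import os

import litellm
from litellm import completion
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    # a lookup is a hit when (1 - vector_distance) > similarity_threshold
    similarity_threshold=0.8,
    # any litellm.embedding() model is supported here (per the docs above)
    redis_semantic_cache_embedding_model="text-embedding-ada-002",
    # redis_semantic_cache_use_async=True,  # async index, used by the proxy / acompletion path
)

first = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about the sea"}],
    max_tokens=20,
)

# A semantically similar prompt should be served from the cache; matching
# response ids indicate a cache hit (same pattern as the tests above).
second = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "write a one sentence poem about the ocean"}],
    max_tokens=20,
)
print(first.id, second.id)
```

The same behaviour is available through the proxy by setting `cache_params.type: "redis-semantic"` in `proxy_config.yaml` as shown in the config and docs diffs above; in that path the async index is used and cache hits surface in Langfuse via the `semantic-similarity` tag.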