
Commit

Merge branch 'main' into litellm_ui_improvements_time_series
krrishdholakia authored Feb 10, 2024
2 parents 4678c6b + 1aa9865 commit a8ef8e1
Showing 23 changed files with 629 additions and 167 deletions.
File renamed without changes.
12 changes: 5 additions & 7 deletions README.md
@@ -28,6 +28,8 @@ LiteLLM manages:
- Translate inputs to the provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output): text responses are always available at `['choices'][0]['message']['content']` (see the sketch below)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project with the [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
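
A minimal sketch of that output contract (assuming `litellm` is installed and `OPENAI_API_KEY` is set; the key value below is a placeholder):

```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"  # placeholder

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)

# the text lives at the same path regardless of which provider served the call
print(response["choices"][0]["message"]["content"])
```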


[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
@@ -155,6 +157,9 @@ print(response)
```

## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys))
A UI is available at `/ui` on your proxy server
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)

Track spend, set budgets, and create virtual keys for the proxy:
`POST /key/generate`

@@ -174,13 +179,6 @@ curl 'http://0.0.0.0:8000/key/generate' \
}
```
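
The same call from Python, as a rough sketch (assumes the proxy is reachable at `http://0.0.0.0:8000` with master key `sk-1234`; the `models`, `duration`, and `metadata` fields are illustrative, not required values):

```python
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},  # proxy master key (placeholder)
    json={
        "models": ["gpt-3.5-turbo", "gpt-4"],    # models this virtual key may call
        "duration": "20m",                       # key expires after 20 minutes
        "metadata": {"user": "your-user-id"},    # arbitrary metadata for spend tracking
    },
)
print(resp.json())  # the response includes the generated virtual key
```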

### [Beta] Proxy UI ([Docs](https://docs.litellm.ai/docs/proxy/ui))

A UI to create keys and track spend per key

Code: https://github.com/BerriAI/litellm/tree/main/ui
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)

## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
77 changes: 13 additions & 64 deletions docs/my-website/docs/index.md
@@ -5,10 +5,14 @@ import TabItem from '@theme/TabItem';

https://github.com/BerriAI/litellm

import QuickStart from '../src/components/QuickStart.js'

## **Call 100+ LLMs using the same Input/Output Format**

- Translate inputs to the provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output): text responses are always available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) (see the sketch after this list)
- Track spend & set budgets per project with the [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
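
A rough sketch of the Router retry/fallback pattern (the deployment names, endpoints, and keys are placeholders):

```python
from litellm import Router

# two deployments registered under the same user-facing alias
model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/<your-deployment-name>",
            "api_base": "<your-azure-api-endpoint>",
            "api_key": "<your-azure-api-key>",
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "<your-openai-api-key>"},
    },
]

router = Router(model_list=model_list)

# the router load-balances across the deployments and falls back on failures
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response["choices"][0]["message"]["content"])
```

Because both entries share one `model_name`, the router treats them as interchangeable deployments of a single model.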

## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -157,9 +161,6 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -177,9 +178,6 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -199,9 +197,6 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -222,9 +217,7 @@ response = completion(
    stream=True,
)


for chunk in response:
    print(chunk)
print(response)
```

</TabItem>
@@ -246,9 +239,6 @@ response = completion(
    messages = [{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -265,9 +255,6 @@ response = completion(
    api_base="http://localhost:11434",
    stream=True,
)

for chunk in response:
    print(chunk)
```
</TabItem>
<TabItem value="or" label="Openrouter">
@@ -284,9 +271,6 @@ response = completion(
    messages = [{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```
</TabItem>

@@ -327,34 +311,8 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
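
For context, a slightly fuller sketch of that logging setup with Langfuse (the environment variable names follow the LiteLLM observability docs; all key values are placeholders):

```python
import os
import litellm
from litellm import completion

os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-..."   # placeholder
os.environ["LANGFUSE_SECRET_KEY"] = "sk-..."   # placeholder
os.environ["OPENAI_API_KEY"] = "your-api-key"  # placeholder

litellm.success_callback = ["langfuse"]  # log every successful call to Langfuse

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
```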

## Calculate Costs, Usage, Latency

Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost

```python
from litellm import completion, completion_cost
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"

response = completion(
    model="gpt-3.5-turbo",
    messages=[{ "content": "Hello, how are you?","role": "user"}]
)

cost = completion_cost(completion_response=response)
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
```

**Output**
```shell
Cost for completion call with gpt-3.5-turbo: $0.0000775000
```

### Track Costs, Usage, Latency for streaming
We use a custom callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
- We define a callback function to calculate cost `def track_cost_callback()`
- In `def track_cost_callback()` we check if the stream is complete - `if "complete_streaming_response" in kwargs`
- Use `litellm.completion_cost()` to calculate cost, once the stream is complete
## Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback

```python
import litellm
@@ -366,18 +324,8 @@ def track_cost_callback(
    start_time, end_time # start/end time
):
    try:
        # check if it has collected an entire stream response
        if "complete_streaming_response" in kwargs:
            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
            completion_response=kwargs["complete_streaming_response"]
            input_text = kwargs["messages"]
            output_text = completion_response["choices"][0]["message"]["content"]
            response_cost = litellm.completion_cost(
                model = kwargs["model"],
                messages = input_text,
                completion=output_text
            )
            print("streaming response_cost", response_cost)
        response_cost = kwargs.get("response_cost", 0)
        print("streaming response_cost", response_cost)
    except:
        pass
# set callback
@@ -400,6 +348,8 @@ response = completion(

Track spend across multiple projects/people

![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)

The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@@ -436,8 +386,7 @@ response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
print(response)
```


## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./proxy/virtual_keys.md)
73 changes: 32 additions & 41 deletions docs/my-website/docs/proxy/quick_start.md
@@ -8,16 +8,8 @@ Quick start CLI, Config, Docker
LiteLLM Server manages:

* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
* **Load Balancing**: between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model) - LiteLLM proxy can handle 1.5k+ requests/second during load tests.
* **Cost tracking**: Authentication & Spend Tracking [Virtual Keys](#managing-auth---virtual-keys)

[**See LiteLLM Proxy code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)


#### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)


View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)

```shell
$ pip install 'litellm[proxy]'
@@ -221,8 +213,38 @@ $ litellm --model command-nightly

</Tabs>

## Quick Start - LiteLLM Proxy + Config.yaml
The config lets you define a model list and set any litellm param (e.g. `api_base`, `max_tokens`). See more details about the config [here](https://docs.litellm.ai/docs/proxy/configs)

### Create a Config for LiteLLM Proxy
Example config

```yaml
model_list:
  - model_name: gpt-3.5-turbo # user-facing model alias
    litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
      model: azure/<your-deployment-name>
      api_base: <your-azure-api-endpoint>
      api_key: <your-azure-api-key>
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/gpt-turbo-small-ca
      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
      api_key: <your-azure-api-key>
  - model_name: vllm-model
    litellm_params:
      model: openai/<your-model-name>
      api_base: <your-api-base> # e.g. http://0.0.0.0:3000
```
### Run proxy with config
```shell
litellm --config your_config.yaml
```
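
To sanity-check the running proxy, send one request through the OpenAI SDK (a sketch; assumes the proxy is listening on `http://0.0.0.0:8000` and that you call the `gpt-3.5-turbo` alias defined in the config above):

```python
import openai

# any string works as api_key unless virtual keys are enforced on the proxy
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # the user-facing alias from model_list
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)
```

The proxy resolves the alias to a matching deployment in `model_list`, so client code never needs provider-specific credentials.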


### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain

<Tabs>
<TabItem value="Curl" label="Curl Request">
@@ -330,37 +352,6 @@ print(query_result[:5])
</TabItem>
</Tabs>


## Quick Start - LiteLLM Proxy + Config.yaml
The config allows you to create a model list and set `api_base`, `max_tokens` (all litellm params). See more details about the config [here](https://docs.litellm.ai/docs/proxy/configs)

### Create a Config for LiteLLM Proxy
Example config

```yaml
model_list:
  - model_name: gpt-3.5-turbo # user-facing model alias
    litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
      model: azure/<your-deployment-name>
      api_base: <your-azure-api-endpoint>
      api_key: <your-azure-api-key>
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/gpt-turbo-small-ca
      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
      api_key: <your-azure-api-key>
  - model_name: vllm-model
    litellm_params:
      model: openai/<your-model-name>
      api_base: <your-api-base> # e.g. http://0.0.0.0:3000
```
### Run proxy with config
```shell
litellm --config your_config.yaml
```

[**More Info**](./configs.md)


2 changes: 1 addition & 1 deletion docs/my-website/docs/proxy/ui.md
@@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# [BETA] Admin UI
# 🔑 [BETA] Admin UI
### **Create + delete keys through a UI**

:::info
38 changes: 37 additions & 1 deletion docs/my-website/docs/proxy/user_keys.md
@@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Use with Langchain, OpenAI SDK, Curl
# Use with Langchain, OpenAI SDK, LlamaIndex, Curl

:::info

@@ -51,6 +51,42 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="LlamaIndex" label="LlamaIndex">

```python
import os, dotenv

from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

llm = AzureOpenAI(
    engine="azure-gpt-3.5", # model_name on litellm proxy
    temperature=0.0,
    azure_endpoint="http://0.0.0.0:4000", # litellm proxy endpoint
    api_key="sk-1234", # litellm proxy API Key
    api_version="2023-07-01-preview",
)

embed_model = AzureOpenAIEmbedding(
    deployment_name="azure-embedding-model",
    azure_endpoint="http://0.0.0.0:4000",
    api_key="sk-1234",
    api_version="2023-07-01-preview",
)


documents = SimpleDirectoryReader("llama_index_data").load_data()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)

```
</TabItem>

<TabItem value="Curl" label="Curl Request">

Pass `metadata` as part of the request body
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/virtual_keys.md
@@ -6,6 +6,7 @@ Grant others temporary access to your proxy, with keys that expire after a set

:::info

- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)

2 changes: 1 addition & 1 deletion docs/my-website/sidebars.js
@@ -98,7 +98,7 @@ const sidebars = {
link: {
type: 'generated-index',
title: '💥 OpenAI Proxy Server',
description: `Proxy Server to call 100+ LLMs in a unified interface, load balance deployments, track costs per user`,
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: '/simple_proxy',
},
items: [