feat: LiteLLM model support for evals (#1675)
* LiteLLM implementation for evals

* LiteLLM completion retry

Support for LiteLLM
Encoding interface return type
Tokenizers added as dep

* type checking

Removed encoder implementation, unnecessary.

* Done

* better init validation, mock pytest, docs

Added litellm to dev & hatch envs dependencies

* dear mypy

* have to ignore

"litellm": module is installed, but missing library stubs or py.typed marker

---------

Co-authored-by: Mikyo King <mikyo@arize.com>
pbadhe and mikeldking authored Nov 17, 2023
1 parent 45eb5f2 commit 5f2a999
Showing 8 changed files with 215 additions and 6 deletions.
39 changes: 38 additions & 1 deletion docs/api/evaluation-models.md
@@ -72,7 +72,6 @@ Here is an example of how to initialize `OpenAIModel` for Azure:

```python
model = OpenAIModel(
model_name="gpt-4-32k",
azure_endpoint="https://YOUR_SUBDOMAIN.openai.azure.com/",
api_version="2023-03-15-preview"
@@ -210,6 +209,44 @@ model = BedrockModel(client=client_bedrock)

```


### phoenix.experimental.evals.LiteLLMModel
You will need to install the extra dependency `litellm>=1.0.3` (for example, `pip install "litellm>=1.0.3"`).
```python
class LiteLLMModel(BaseEvalModel):
model_name: str = "gpt-3.5-turbo"
"""The model name to use."""
temperature: float = 0.0
"""What sampling temperature to use."""
max_tokens: int = 256
"""The maximum number of tokens to generate in the completion."""
top_p: float = 1
"""Total probability mass of tokens to consider at each step."""
num_retries: int = 6
"""Maximum number to retry a model if an RateLimitError, OpenAIError, or
ServiceUnavailableError occurs."""
request_timeout: int = 60
"""Maximum number of seconds to wait when retrying."""
model_kwargs: Dict[str, Any] = field(default_factory=dict)
"""Model specific params"""

# non-LiteLLM params
retry_min_seconds: int = 10
"""Minimum number of seconds to wait when retrying."""
max_content_size: Optional[int] = None
"""If you're using a fine-tuned model, set this to the maximum content size"""
```
You can choose among the [many models](https://docs.litellm.ai/docs/providers) supported by LiteLLM. Make sure you have set the right environment variables before initializing the model. For additional information about the environment variables required by specific model providers, see [LiteLLM provider-specific params](https://docs.litellm.ai/docs/completion/input#provider-specific-params).

Here is an example of how to initialize `LiteLLMModel` for the model `gpt-3.5-turbo`:

```python
model = LiteLLMModel(model_name="gpt-3.5-turbo", temperature=0.0)
model("Hello world, this is a test if you are working?")
# Output: 'Hello! Yes, I am here and ready to assist you. How can I help you today?'
```
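
Because the model reads credentials from environment variables, a typical workflow is to export the provider-specific variable before constructing the model. The sketch below is illustrative only: it assumes an OpenAI-hosted model and uses a placeholder key; any other LiteLLM provider works the same way with its own variables.

```python
import os

# Placeholder credential for illustration only; set your real provider key
# (e.g. OPENAI_API_KEY for OpenAI-hosted models) before initializing the model.
os.environ["OPENAI_API_KEY"] = "sk-..."

model = LiteLLMModel(
    model_name="gpt-4",
    temperature=0.0,
    max_tokens=256,
)
print(model("What is the capital of France?"))
```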


## **Usage**

In this section, we will showcase the methods and properties that our `EvalModels` have. First, instantiate your model from the [#supported-llm-providers](evaluation-models.md#supported-llm-providers "mention"). Once you've instantiated your `model`, you can get responses from the LLM by simply calling the model and passing a text string.
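
A minimal sketch of that pattern, assuming the `LiteLLMModel` shown above (any supported model class is called the same way):

```python
model = LiteLLMModel(model_name="gpt-3.5-turbo", temperature=0.0)
response = model("Explain in one sentence what an LLM eval is.")
print(response)
```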
2 changes: 1 addition & 1 deletion docs/llm-evals/running-pre-tested-evals/README.md
@@ -19,7 +19,7 @@ model("What is the largest coastal city in France?")

We currently support a growing set of models for LLM Evals; please check out the [API section for usage](../../api/evaluation-models.md).

<table data-full-width="false"><thead><tr><th width="357">Model</th><th>Support </th></tr></thead><tbody><tr><td>GPT-4 </td><td>✔</td></tr><tr><td>GPT-3.5 Turbo</td><td>✔</td></tr><tr><td>GPT-3.5 Instruct</td><td>✔</td></tr><tr><td>Azure Hosted Open AI </td><td>✔</td></tr><tr><td>Palm 2 Vertex</td><td>✔</td></tr><tr><td>AWS Bedrock</td><td>✔</td></tr><tr><td>Litellm</td><td>(coming soon)</td></tr><tr><td>Huggingface Llama7B</td><td>(coming soon)</td></tr><tr><td>Anthropic</td><td>(coming soon)</td></tr><tr><td>Cohere</td><td>(coming soon)</td></tr></tbody></table>
<table data-full-width="false"><thead><tr><th width="357">Model</th><th>Support </th></tr></thead><tbody><tr><td>GPT-4 </td><td>✔</td></tr><tr><td>GPT-3.5 Turbo</td><td>✔</td></tr><tr><td>GPT-3.5 Instruct</td><td>✔</td></tr><tr><td>Azure Hosted Open AI </td><td>✔</td></tr><tr><td>Palm 2 Vertex</td><td>✔</td></tr><tr><td>AWS Bedrock</td><td>✔</td></tr><tr><td>Litellm</td><td></td></tr><tr><td>Huggingface Llama7B</td><td>(coming soon)</td></tr><tr><td>Anthropic</td><td>(coming soon)</td></tr><tr><td>Cohere</td><td>(coming soon)</td></tr></tbody></table>

## How we benchmark pre-tested evals

4 changes: 4 additions & 0 deletions pyproject.toml
@@ -57,6 +57,7 @@ dev = [
"arize[AutoEmbeddings, LLM_Evaluation]",
"llama-index>=0.9.0",
"langchain>=0.0.334",
"litellm>=1.0.3"
]
experimental = [
"tenacity",
@@ -95,6 +96,7 @@ dependencies = [
"pytest-lazy-fixture",
"arize",
"langchain>=0.0.334",
"litellm>=1.0.3",
"llama-index>=0.9.0",
"openai>=1.0.0",
"tenacity",
@@ -120,6 +122,7 @@ dependencies = [
"types-requests",
"types-protobuf",
"openai>=1.0.0",
"litellm>=1.0.3"
]

[tool.hatch.envs.style]
@@ -273,6 +276,7 @@ module = [
"wrapt",
"sortedcontainers",
"langchain.*",
"litellm"
]
ignore_missing_imports = true

4 changes: 3 additions & 1 deletion src/phoenix/experimental/evals/__init__.py
@@ -1,5 +1,5 @@
from .functions import llm_classify, llm_generate, run_relevance_eval
from .models import OpenAIModel, VertexAIModel
from .models import BedrockModel, LiteLLMModel, OpenAIModel, VertexAIModel
from .retrievals import compute_precisions_at_k
from .templates import (
CODE_READABILITY_PROMPT_RAILS_MAP,
@@ -23,6 +23,8 @@
"llm_generate",
"OpenAIModel",
"VertexAIModel",
"BedrockModel",
"LiteLLMModel",
"PromptTemplate",
"ClassificationTemplate",
"CODE_READABILITY_PROMPT_RAILS_MAP",
10 changes: 9 additions & 1 deletion src/phoenix/experimental/evals/models/__init__.py
@@ -1,6 +1,14 @@
from .base import BaseEvalModel, set_verbosity
from .bedrock import BedrockModel
from .litellm import LiteLLMModel
from .openai import OpenAIModel
from .vertexai import VertexAIModel

__all__ = ["BedrockModel", "BaseEvalModel", "OpenAIModel", "VertexAIModel", "set_verbosity"]
__all__ = [
"BedrockModel",
"BaseEvalModel",
"LiteLLMModel",
"OpenAIModel",
"VertexAIModel",
"set_verbosity",
]
3 changes: 1 addition & 2 deletions src/phoenix/experimental/evals/models/base.py
@@ -2,12 +2,11 @@
from abc import ABC, abstractmethod, abstractproperty
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Type
from typing import TYPE_CHECKING, Any, Callable, Generator, List, Optional, Sequence, Type

if TYPE_CHECKING:
from tiktoken import Encoding

from typing import Sequence

from tenacity import (
RetryCallState,
122 changes: 122 additions & 0 deletions src/phoenix/experimental/evals/models/litellm.py
@@ -0,0 +1,122 @@
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from phoenix.experimental.evals.models.base import BaseEvalModel

if TYPE_CHECKING:
from tiktoken import Encoding

logger = logging.getLogger(__name__)


@dataclass
class LiteLLMModel(BaseEvalModel):
model_name: str = "gpt-3.5-turbo"
"""The model name to use."""
temperature: float = 0.0
"""What sampling temperature to use."""
max_tokens: int = 256
"""The maximum number of tokens to generate in the completion."""
top_p: float = 1
"""Total probability mass of tokens to consider at each step."""
    num_retries: int = 6  # Will be superseded by max_retries
    """Maximum number of times to retry the model if a RateLimitError, OpenAIError,
    or ServiceUnavailableError occurs."""
    request_timeout: int = 60
    """Maximum number of seconds to wait for a request to complete before timing out."""
model_kwargs: Dict[str, Any] = field(default_factory=dict)
"""Model specific params"""

# non-LiteLLM params
retry_min_seconds: int = 10
"""Minimum number of seconds to wait when retrying."""
max_content_size: Optional[int] = None
"""If you're using a fine-tuned model, set this to the maximum content size"""

def __post_init__(self) -> None:
self._init_environment()
self._init_model_encoding()

def _init_environment(self) -> None:
try:
import litellm
from litellm import validate_environment

self._litellm = litellm
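            # validate_environment reports whether the API key(s) this model's
            # provider requires are present in the environment.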
env_info = validate_environment(self._litellm.utils.get_llm_provider(self.model_name))

if not env_info["keys_in_environment"]:
raise RuntimeError(
f"Missing environment variable(s): '{str(env_info['missing_keys'])}', for "
f"model: {self.model_name}. \nFor additional information about the right "
"environment variables for specific model providers:\n"
"https://docs.litellm.ai/docs/completion/input#provider-specific-params."
)
except ImportError:
self._raise_import_error(
package_display_name="LiteLLM",
package_name="litellm",
)

def _init_model_encoding(self) -> None:
from litellm import decode, encode
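        # litellm.encode/decode pick a tokenizer for the given model
        # (e.g. tiktoken for OpenAI models).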

if self.model_name in self._litellm.model_list:
self._encoding = encode
self._decoding = decode
else:
raise ValueError(
f"Model name not found in the LiteLLM's models list: \n{self._litellm.model_list}"
)

@property
def max_context_size(self) -> int:
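        # Prefer an explicit max_content_size override; otherwise fall back to the
        # maximum context window LiteLLM reports for this model.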
context_size = self.max_content_size or self._litellm.get_max_tokens(self.model_name).get(
"max_tokens", None
)

if context_size is None:
raise ValueError(
"Can't determine maximum context size. An unknown model name was "
+ f"used: {self.model_name}."
)

return context_size

@property
def encoder(self) -> "Encoding":
raise NotImplementedError

def get_tokens_from_text(self, text: str) -> List[int]:
result: List[int] = self._encoding(model=self.model_name, text=text)
return result

def get_text_from_tokens(self, tokens: List[int]) -> str:
return str(self._decoding(model=self.model_name, tokens=tokens))

def _generate(self, prompt: str, **kwargs: Dict[str, Any]) -> str:
messages = self._get_messages_from_prompt(prompt)
return str(
self._generate_with_retry(
model=self.model_name,
messages=messages,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
num_retries=self.num_retries,
request_timeout=self.request_timeout,
**self.model_kwargs,
)
)

def _generate_with_retry(self, **kwargs: Any) -> Any:
# Using default LiteLLM completion with retries = self.num_retries.

response = self._litellm.completion(**kwargs)
return response.choices[0].message.content

def _get_messages_from_prompt(self, prompt: str) -> List[Dict[str, str]]:
# LiteLLM requires prompts in the format of messages
# messages=[{"content": "ABC?","role": "user"}]
return [{"content": prompt, "role": "user"}]
37 changes: 37 additions & 0 deletions tests/experimental/evals/functions/test_generate.py
@@ -8,6 +8,7 @@
import pytest
import respx
from phoenix.experimental.evals import OpenAIModel, llm_generate
from phoenix.experimental.evals.models.litellm import LiteLLMModel
from phoenix.experimental.evals.models.openai import OPENAI_API_KEY_ENVVAR_NAME
from respx.patterns import M

@@ -179,3 +180,39 @@ def test_classify_tolerance_to_exceptions(
# Make sure there is a logger.error output
captured = capfd.readouterr()
assert "Process was interrupted" in captured.out


def test_litellm_model_llm_generate(monkeypatch: pytest.MonkeyPatch):
"""LiteLLM can return a `mock_response` from completion, we set it in model_kwargs to True"""

monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
dataframe = pd.DataFrame(
[
{
"query": "What is Python?",
"reference": "Python is a programming language.",
},
{
"query": "What is Python?",
"reference": "Ruby is a programming language.",
},
{
"query": "What is C++?",
"reference": "C++ is a programming language.",
},
{
"query": "What is C++?",
"reference": "irrelevant",
},
]
)
responses = ["True", "True", "True", "True"]

template = (
"Given {query} and a golden answer {reference}, generate an answer that returns True."
)

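    # mock_response=True makes litellm.completion skip the provider call and return
    # a canned response ("True" here), so no real OpenAI request is made.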
model = LiteLLMModel(model_name="gpt-3.5-turbo", model_kwargs={"mock_response": True})

generated = llm_generate(dataframe=dataframe, template=template, model=model)
assert generated.iloc[:, 0].tolist() == responses
