feat: Verbose evals #1558

Merged: 33 commits into main from dustin/verbose-evals on Oct 6, 2023

Changes from 3 commits
Commits (33)
6f002d2  Add `_verbose` flag to `BaseEvalModel` (anticorrelator, Oct 3, 2023)
b410128  Start adding basic verbose-mode logging (anticorrelator, Oct 3, 2023)
73c93a0  Add verbose mode to retries (anticorrelator, Oct 4, 2023)
a3e6a1b  Only print when verbose flag is set (anticorrelator, Oct 4, 2023)
b2ecf6d  Refactor verbose mode (anticorrelator, Oct 4, 2023)
931ce4a  Continue refining verbose mode output (anticorrelator, Oct 4, 2023)
5bcc0d9  Prefer absolute imports (anticorrelator, Oct 4, 2023)
778c459  Fix type hint (anticorrelator, Oct 4, 2023)
9dee5b5  Try to clean up abstractions (anticorrelator, Oct 4, 2023)
feeeda9  Add `printif` utility (anticorrelator, Oct 4, 2023)
e51ab9e  Prefer absolute imports (anticorrelator, Oct 4, 2023)
6eaf28f  Add `verbose` test for `llm_eval_binary` (anticorrelator, Oct 4, 2023)
d73a211  Test retrying with verbose mode (anticorrelator, Oct 5, 2023)
398ad79  Test that the "verbose" state does not get persisted (anticorrelator, Oct 5, 2023)
8d15d6d  Implement verbose flag as a context manager (see the sketch after this list) (anticorrelator, Oct 5, 2023)
50e28bc  Add docstrings (anticorrelator, Oct 5, 2023)
459e7fb  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 5, 2023)
5c6ec9d  Improve verbosity statefulness test (anticorrelator, Oct 5, 2023)
44e2038  Lint imports (anticorrelator, Oct 5, 2023)
dadcc9b  Add blankline (anticorrelator, Oct 5, 2023)
bf93cbc  Shorten docstrings (anticorrelator, Oct 5, 2023)
1b1bdbf  Enforce formatter settings (anticorrelator, Oct 5, 2023)
0918108  Appease mypy (anticorrelator, Oct 5, 2023)
d21ca4c  Add verbose flag test for `generate` (anticorrelator, Oct 5, 2023)
d4015b7  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 5, 2023)
9f863e6  Use better dummy variable name (anticorrelator, Oct 5, 2023)
1d56240  Update src/phoenix/experimental/evals/models/base.py (anticorrelator, Oct 5, 2023)
89566b1  Add more details to docstrings (anticorrelator, Oct 6, 2023)
dda49ae  Merge remote-tracking branch 'origin' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
6d2ceda  Update bedrock model (anticorrelator, Oct 6, 2023)
61ad169  Restore missing import (anticorrelator, Oct 6, 2023)
6cb1c95  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
1960d1c  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
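
Commits 8d15d6d and 398ad79 introduce and test the verbose flag as a context manager, so that enabling verbose output for one call does not stick to the model afterwards. The helper below is only a sketch of that pattern; the name `set_verbosity` and the attribute handling are assumptions for illustration, not necessarily the PR's actual implementation.

```python
from contextlib import contextmanager
from typing import Any, Iterator


@contextmanager
def set_verbosity(model: Any, verbose: bool) -> Iterator[Any]:
    """Temporarily set `model._verbose`, restoring the previous value on exit."""
    previous = getattr(model, "_verbose", False)
    model._verbose = verbose
    try:
        yield model
    finally:
        # Restore the prior flag so the "verbose" state is not persisted on the model.
        model._verbose = previous
```

With a helper like this, an eval function could wrap its model calls in `with set_verbosity(model, verbose) as verbose_model:` and leave the model's state unchanged afterwards.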
15 changes: 7 additions & 8 deletions src/phoenix/experimental/evals/functions/binary.py
@@ -3,10 +3,12 @@

 import pandas as pd

-from phoenix.utilities.logging import printif
 from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
+from phoenix.utilities.logging import printif

-from ..models import BaseEvalModel
-from ..templates import (
+from phoenix.experimental.evals.models import BaseEvalModel
+from phoenix.experimental.evals.templates import (
     NOT_PARSABLE,
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
@@ -60,8 +62,7 @@ def llm_eval_binary(
     prompts = map_template(dataframe, eval_template)
     responses = model.generate(prompts.to_list(), instruction=system_instruction)
     rails_set = set(rails)
-    if verbose:
-        print(f"Snapping {len(responses)} responses to rails: {rails_set}")
+    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails_set}")
     return [_snap_to_rail(response, rails_set, verbose=verbose) for response in responses]


@@ -200,16 +201,14 @@ def _snap_to_rail(string: str, rails: Set[str], verbose: bool = False) -> str:
     rails_list = list(rails)
     rail = _extract_rail(processed_string, rails_list[0], rails_list[1])
     if not rail:
-        if verbose:
-            print(f"- Cannot snap {repr(string)} to rails: {rails}")
+        printif(verbose, f"- Cannot snap {repr(string)} to rails: {rails}")
         logger.warning(
             f"LLM output cannot be snapped to rails {list(rails)}, returning {NOT_PARSABLE}. "
             f'Output: "{string}"'
         )
         return NOT_PARSABLE
     else:
-        if verbose:
-            print(f"- Snapped {repr(string)} to rail: {rail}")
+        printif(verbose, f"- Snapped {repr(string)} to rail: {rail}")
         return rail


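For context, the `verbose` flag threads from `llm_eval_binary` down into `_snap_to_rail` and the model. A minimal usage sketch, assuming an OpenAI key is configured and using the module paths shown in this diff (exact public import locations may differ; the two-row dataframe is made up for illustration):

```python
import pandas as pd

from phoenix.experimental.evals.functions.binary import llm_eval_binary
from phoenix.experimental.evals.models import OpenAIModel
from phoenix.experimental.evals.templates import RAG_RELEVANCY_PROMPT_TEMPLATE_STR

# Hypothetical example data; any dataframe with the template's variables works.
dataframe = pd.DataFrame(
    [
        {"query": "What is Python?", "reference": "Python is a programming language."},
        {"query": "What is Python?", "reference": "Ruby is a programming language."},
    ]
)

labels = llm_eval_binary(
    dataframe=dataframe,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    model=OpenAIModel(),
    rails=["relevant", "irrelevant"],
    verbose=True,  # prints snapping decisions and retry/model details to stdout
)
```
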
21 changes: 10 additions & 11 deletions src/phoenix/experimental/evals/models/base.py
@@ -17,8 +17,9 @@
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

-from ..utils.threads import to_thread
-from ..utils.types import is_list_of
+from phoenix.experimental.evals.utils.threads import to_thread
+from phoenix.experimental.evals.utils.types import is_list_of
+from phoenix.utilities.logging import printif

 logger = logging.getLogger(__name__)
@@ -46,11 +47,10 @@ def retry(

        def log_retry(retry_state: RetryCallState) -> None:
            exc = retry_state.outcome.exception()
-           if self._verbose:
-               if exc:
-                   print(f"Failed attempt {retry_state.attempt_number}: raised {repr(exc)}")
-               else:
-                   print(f"Failed attempt {retry_state.attempt_number}")
+           if exc:
+               printif(self._verbose, f"Failed attempt {retry_state.attempt_number}: raised {repr(exc)}")
+           else:
+               printif(self._verbose, f"Failed attempt {retry_state.attempt_number}")
            return None

        retry_instance: retry_base = retry_if_exception_type(error_types[0])
retry_instance: retry_base = retry_if_exception_type(error_types[0])
Expand Down Expand Up @@ -96,10 +96,9 @@ async def async_call(self, prompt: str, instruction: Optional[str] = None) -> st
return response[0]

def generate(self, prompts: List[str], instruction: Optional[str] = None) -> List[str]:
if self._verbose:
print(f"Generating responses for {len(prompts)} prompts...")
if extra_info := self._verbose_generation_info():
print(extra_info)
printif(self._verbose, f"Generating responses for {len(prompts)} prompts...")
if extra_info := self._verbose_generation_info():
printif(self._verbose, extra_info)
if not is_list_of(prompts, str):
raise TypeError(
"Invalid type for argument `prompts`. Expected a list of strings "
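The `extra_info := self._verbose_generation_info()` hook above lets each model subclass surface provider-specific details when verbose mode is on; the new test further down expects the OpenAI model to report its invocation parameters this way. A hedged sketch of such an override (class and attribute names here are illustrative stand-ins, not the PR's actual OpenAI wrapper):

```python
from typing import Any, Dict


class OpenAIModelSketch:
    """Illustrative stand-in for a BaseEvalModel subclass."""

    model_name: str = "gpt-4"
    temperature: float = 0.0
    max_tokens: int = 256

    @property
    def invocation_params(self) -> Dict[str, Any]:
        return {
            "model": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }

    def _verbose_generation_info(self) -> str:
        # BaseEvalModel.generate prints this string when the verbose flag is set.
        return f"OpenAI invocation parameters: {self.invocation_params}"
```
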
Empty file.
5 changes: 5 additions & 0 deletions src/phoenix/utilities/logging.py
@@ -0,0 +1,5 @@
+# A collection of printing and logging utilities
+
+def printif(condition: bool, *args, **kwargs):
+    if condition:
+        print(*args, **kwargs)
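
The helper simply forwards its arguments to `print` when the condition is truthy, so verbose call sites stay one-liners. For example:

```python
from phoenix.utilities.logging import printif

verbose = True
printif(verbose, "Generating responses for 4 prompts...")  # printed
printif(False, "this line is suppressed")  # prints nothing
```
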
Contributor:

@mikeldking What are your thoughts on the eventual packaging strategy for evals and other Phoenix sub-modules such as our tracers? Are we going to deploy them as distinct packages, e.g., arize-evals or phoenix-evals? If so, we should be careful about introducing dependencies between the sub-modules and the rest of the codebase.

@anticorrelator This is a non-blocking comment. We can always move things if needed.

Contributor:

Yeah, ideally it doesn't sit in Phoenix long term, so treating it more as a sub-module could be a benefit. The verbose-logging ask could be evals-specific, so it might make more sense sitting under evals, but this is a trivial change if we do split things out, so I'm not concerned either way.

62 changes: 62 additions & 0 deletions tests/experimental/evals/functions/test_binary.py
@@ -64,10 +64,72 @@ def test_llm_eval_binary(monkeypatch: pytest.MonkeyPatch):
         template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
         model=model,
         rails=["relevant", "irrelevant"],
+        verbose=True,
     )
     assert relevance_classifications == ["relevant", "irrelevant", "relevant", NOT_PARSABLE]


+@responses.activate
+def test_llm_eval_binary_prints_to_stdout_with_verbose_flag(monkeypatch: pytest.MonkeyPatch, capfd):
+    monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
+    dataframe = pd.DataFrame(
+        [
+            {
+                "query": "What is Python?",
+                "reference": "Python is a programming language.",
+            },
+            {
+                "query": "What is Python?",
+                "reference": "Ruby is a programming language.",
+            },
+            {
+                "query": "What is C++?",
+                "reference": "C++ is a programming language.",
+            },
+            {
+                "query": "What is C++?",
+                "reference": "irrelevant",
+            },
+        ]
+    )
+    for message_content in [
+        "relevant",
+        "irrelevant",
+        "\nrelevant ",
+        "unparsable",
+    ]:
+        responses.post(
+            "https://api.openai.com/v1/chat/completions",
+            json={
+                "choices": [
+                    {
+                        "message": {
+                            "content": message_content,
+                        },
+                    }
+                ],
+            },
+            status=200,
+        )
+    with patch.object(OpenAIModel, "_init_tiktoken", return_value=None):
+        model = OpenAIModel()
+    relevance_classifications = llm_eval_binary(
+        dataframe=dataframe,
+        template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
+        model=model,
+        rails=["relevant", "irrelevant"],
+        verbose=True,
+    )
+    out, err = capfd.readouterr()
+    assert "Snapped 'relevant' to rail: relevant" in out, "Snapping events should be printed"
+    assert "Snapped 'irrelevant' to rail: irrelevant" in out, "Snapping events should be printed"
+    assert "Snapped '\\nrelevant ' to rail: relevant" in out, "Snapping events should be printed"
+    assert "Cannot snap 'unparsable' to rails: {'relevant', 'irrelevant'}" in out, "Snapping events should be printed"
+    assert "OpenAI invocation parameters" in out, "Model-specific information should be printed"
+    assert "'model': 'gpt-4', 'temperature': 0.0, 'max_tokens': 256" in out, "Model-specific information should be printed"
+    assert "sk-0123456789" not in out, "Credentials should not be printed out in cleartext"


 @responses.activate
 @pytest.mark.parametrize(
     "dataframe",