feat: Verbose evals #1558

Merged: 33 commits into main from dustin/verbose-evals on Oct 6, 2023

Changes from 3 commits
Commits (33)
6f002d2  Add `_verbose` flag to `BaseEvalModel` (anticorrelator, Oct 3, 2023)
b410128  Start adding basic verbose-mode logging (anticorrelator, Oct 3, 2023)
73c93a0  Add verbose mode to retries (anticorrelator, Oct 4, 2023)
a3e6a1b  Only print when verbose flag is set (anticorrelator, Oct 4, 2023)
b2ecf6d  Refactor verbose mode (anticorrelator, Oct 4, 2023)
931ce4a  Continue refining verbose mode output (anticorrelator, Oct 4, 2023)
5bcc0d9  Prefer absolute imports (anticorrelator, Oct 4, 2023)
778c459  Fix type hint (anticorrelator, Oct 4, 2023)
9dee5b5  Try to clean up abstractions (anticorrelator, Oct 4, 2023)
feeeda9  Add `printif` utility (anticorrelator, Oct 4, 2023)
e51ab9e  Prefer absolute imports (anticorrelator, Oct 4, 2023)
6eaf28f  Add `verbose` test for `llm_eval_binary` (anticorrelator, Oct 4, 2023)
d73a211  Test retrying with verbose mode (anticorrelator, Oct 5, 2023)
398ad79  Test that the "verbose" state does not get persisted (anticorrelator, Oct 5, 2023)
8d15d6d  Implement verbose flag as a context manager (see the sketch after this list) (anticorrelator, Oct 5, 2023)
50e28bc  Add docstrings (anticorrelator, Oct 5, 2023)
459e7fb  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 5, 2023)
5c6ec9d  Improve verbosity statefulness test (anticorrelator, Oct 5, 2023)
44e2038  Lint imports (anticorrelator, Oct 5, 2023)
dadcc9b  Add blankline (anticorrelator, Oct 5, 2023)
bf93cbc  Shorten docstrings (anticorrelator, Oct 5, 2023)
1b1bdbf  Enforce formatter settings (anticorrelator, Oct 5, 2023)
0918108  Appease mypy (anticorrelator, Oct 5, 2023)
d21ca4c  Add verbose flag test for `generate` (anticorrelator, Oct 5, 2023)
d4015b7  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 5, 2023)
9f863e6  Use better dummy variable name (anticorrelator, Oct 5, 2023)
1d56240  Update src/phoenix/experimental/evals/models/base.py (anticorrelator, Oct 5, 2023)
89566b1  Add more details to docstrings (anticorrelator, Oct 6, 2023)
dda49ae  Merge remote-tracking branch 'origin' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
6d2ceda  Update bedrock model (anticorrelator, Oct 6, 2023)
61ad169  Restore missing import (anticorrelator, Oct 6, 2023)
6cb1c95  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
1960d1c  Merge branch 'main' into dustin/verbose-evals (anticorrelator, Oct 6, 2023)
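
Commits 8d15d6d and 398ad79 introduce and test the verbose flag as a context manager, so that enabling verbose output for one call does not stick to the model afterwards. The helper below is only a sketch of that pattern; the name `set_verbosity` and the attribute handling are assumptions for illustration, not necessarily the PR's actual implementation.

```python
from contextlib import contextmanager
from typing import Any, Iterator


@contextmanager
def set_verbosity(model: Any, verbose: bool) -> Iterator[Any]:
    """Temporarily set `model._verbose`, restoring the previous value on exit."""
    previous = getattr(model, "_verbose", False)
    model._verbose = verbose
    try:
        yield model
    finally:
        # Restore the prior flag so the "verbose" state is not persisted on the model.
        model._verbose = previous
```

With a helper like this, an eval function could wrap its model calls in `with set_verbosity(model, verbose) as verbose_model:` and leave the model's state unchanged afterwards.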
15 changes: 7 additions & 8 deletions src/phoenix/experimental/evals/functions/binary.py
@@ -3,10 +3,12 @@

 import pandas as pd

-from phoenix.utilities.logging import printif
 from phoenix.trace.semantic_conventions import DOCUMENT_CONTENT, INPUT_VALUE, RETRIEVAL_DOCUMENTS
+from phoenix.utilities.logging import printif

-from ..models import BaseEvalModel
-from ..templates import (
+from phoenix.experimental.evals.models import BaseEvalModel
+from phoenix.experimental.evals.templates import (
     NOT_PARSABLE,
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
@@ -60,8 +62,7 @@ def llm_eval_binary(
     prompts = map_template(dataframe, eval_template)
     responses = model.generate(prompts.to_list(), instruction=system_instruction)
     rails_set = set(rails)
-    if verbose:
-        print(f"Snapping {len(responses)} responses to rails: {rails_set}")
+    printif(verbose, f"Snapping {len(responses)} responses to rails: {rails_set}")
     return [_snap_to_rail(response, rails_set, verbose=verbose) for response in responses]


@@ -200,16 +201,14 @@ def _snap_to_rail(string: str, rails: Set[str], verbose: bool = False) -> str:
     rails_list = list(rails)
     rail = _extract_rail(processed_string, rails_list[0], rails_list[1])
     if not rail:
-        if verbose:
-            print(f"- Cannot snap {repr(string)} to rails: {rails}")
+        printif(verbose, f"- Cannot snap {repr(string)} to rails: {rails}")
         logger.warning(
             f"LLM output cannot be snapped to rails {list(rails)}, returning {NOT_PARSABLE}. "
             f'Output: "{string}"'
         )
         return NOT_PARSABLE
     else:
-        if verbose:
-            print(f"- Snapped {repr(string)} to rail: {rail}")
+        printif(verbose, f"- Snapped {repr(string)} to rail: {rail}")
         return rail


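For context, the `verbose` flag threads from `llm_eval_binary` down into `_snap_to_rail` and the model. A minimal usage sketch, assuming an OpenAI key is configured and using the module paths shown in this diff (exact public import locations may differ; the two-row dataframe is made up for illustration):

```python
import pandas as pd

from phoenix.experimental.evals.functions.binary import llm_eval_binary
from phoenix.experimental.evals.models import OpenAIModel
from phoenix.experimental.evals.templates import RAG_RELEVANCY_PROMPT_TEMPLATE_STR

# Hypothetical example data; any dataframe with the template's variables works.
dataframe = pd.DataFrame(
    [
        {"query": "What is Python?", "reference": "Python is a programming language."},
        {"query": "What is Python?", "reference": "Ruby is a programming language."},
    ]
)

labels = llm_eval_binary(
    dataframe=dataframe,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
    model=OpenAIModel(),
    rails=["relevant", "irrelevant"],
    verbose=True,  # prints snapping decisions and retry/model details to stdout
)
```
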
21 changes: 10 additions & 11 deletions src/phoenix/experimental/evals/models/base.py
@@ -17,8 +17,9 @@
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

-from ..utils.threads import to_thread
-from ..utils.types import is_list_of
+from phoenix.experimental.evals.utils.threads import to_thread
+from phoenix.experimental.evals.utils.types import is_list_of
+from phoenix.utilities.logging import printif

 logger = logging.getLogger(__name__)
@@ -46,11 +47,10 @@ def retry(

        def log_retry(retry_state: RetryCallState) -> None:
            exc = retry_state.outcome.exception()
-           if self._verbose:
-               if exc:
-                   print(f"Failed attempt {retry_state.attempt_number}: raised {repr(exc)}")
-               else:
-                   print(f"Failed attempt {retry_state.attempt_number}")
+           if exc:
+               printif(self._verbose, f"Failed attempt {retry_state.attempt_number}: raised {repr(exc)}")
+           else:
+               printif(self._verbose, f"Failed attempt {retry_state.attempt_number}")
            return None

        retry_instance: retry_base = retry_if_exception_type(error_types[0])
retry_instance: retry_base = retry_if_exception_type(error_types[0])
Expand Down Expand Up @@ -96,10 +96,9 @@ async def async_call(self, prompt: str, instruction: Optional[str] = None) -> st
return response[0]

def generate(self, prompts: List[str], instruction: Optional[str] = None) -> List[str]:
if self._verbose:
print(f"Generating responses for {len(prompts)} prompts...")
if extra_info := self._verbose_generation_info():
print(extra_info)
printif(self._verbose, f"Generating responses for {len(prompts)} prompts...")
if extra_info := self._verbose_generation_info():
printif(self._verbose, extra_info)
if not is_list_of(prompts, str):
raise TypeError(
"Invalid type for argument `prompts`. Expected a list of strings "
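The `extra_info := self._verbose_generation_info()` hook above lets each model subclass surface provider-specific details when verbose mode is on; the new test further down expects the OpenAI model to report its invocation parameters this way. A hedged sketch of such an override (class and attribute names here are illustrative stand-ins, not the PR's actual OpenAI wrapper):

```python
from typing import Any, Dict


class OpenAIModelSketch:
    """Illustrative stand-in for a BaseEvalModel subclass."""

    model_name: str = "gpt-4"
    temperature: float = 0.0
    max_tokens: int = 256

    @property
    def invocation_params(self) -> Dict[str, Any]:
        return {
            "model": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }

    def _verbose_generation_info(self) -> str:
        # BaseEvalModel.generate prints this string when the verbose flag is set.
        return f"OpenAI invocation parameters: {self.invocation_params}"
```
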
Empty file.
5 changes: 5 additions & 0 deletions src/phoenix/utilities/logging.py
@@ -0,0 +1,5 @@
+# A collection of printing and logging utilities
+
+def printif(condition: bool, *args, **kwargs):
+    if condition:
+        print(*args, **kwargs)
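
The helper simply forwards its arguments to `print` when the condition is truthy, so verbose call sites stay one-liners. For example:

```python
from phoenix.utilities.logging import printif

verbose = True
printif(verbose, "Generating responses for 4 prompts...")  # printed
printif(False, "this line is suppressed")  # prints nothing
```
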
Contributor:

@mikeldking What are your thoughts on the eventual packaging strategy for evals and other Phoenix sub-modules such as our tracers? Are we going to deploy them as distinct packages, e.g., arize-evals or phoenix-evals? If so, we should be careful about introducing dependencies between the sub-modules and the rest of the codebase.

@anticorrelator This is a non-blocking comment. We can always move things if needed.

Contributor:

Yeah, ideally it doesn't sit in Phoenix long term, so treating it more as a sub-module could be a benefit. The verbose-logging ask could be evals-specific, so it might make more sense sitting under evals, but this is a trivial change if we do split things out, so I'm not concerned either way.

62 changes: 62 additions & 0 deletions tests/experimental/evals/functions/test_binary.py
@@ -64,10 +64,72 @@ def test_llm_eval_binary(monkeypatch: pytest.MonkeyPatch):
         template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
         model=model,
         rails=["relevant", "irrelevant"],
+        verbose=True,
     )
     assert relevance_classifications == ["relevant", "irrelevant", "relevant", NOT_PARSABLE]


+@responses.activate
+def test_llm_eval_binary_prints_to_stdout_with_verbose_flag(monkeypatch: pytest.MonkeyPatch, capfd):
+    monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
+    dataframe = pd.DataFrame(
+        [
+            {
+                "query": "What is Python?",
+                "reference": "Python is a programming language.",
+            },
+            {
+                "query": "What is Python?",
+                "reference": "Ruby is a programming language.",
+            },
+            {
+                "query": "What is C++?",
+                "reference": "C++ is a programming language.",
+            },
+            {
+                "query": "What is C++?",
+                "reference": "irrelevant",
+            },
+        ]
+    )
+    for message_content in [
+        "relevant",
+        "irrelevant",
+        "\nrelevant ",
+        "unparsable",
+    ]:
+        responses.post(
+            "https://api.openai.com/v1/chat/completions",
+            json={
+                "choices": [
+                    {
+                        "message": {
+                            "content": message_content,
+                        },
+                    }
+                ],
+            },
+            status=200,
+        )
+    with patch.object(OpenAIModel, "_init_tiktoken", return_value=None):
+        model = OpenAIModel()
+    relevance_classifications = llm_eval_binary(
+        dataframe=dataframe,
+        template=RAG_RELEVANCY_PROMPT_TEMPLATE_STR,
+        model=model,
+        rails=["relevant", "irrelevant"],
+        verbose=True,
+    )
+    out, err = capfd.readouterr()
+    assert "Snapped 'relevant' to rail: relevant" in out, "Snapping events should be printed"
+    assert "Snapped 'irrelevant' to rail: irrelevant" in out, "Snapping events should be printed"
+    assert "Snapped '\\nrelevant ' to rail: relevant" in out, "Snapping events should be printed"
+    assert "Cannot snap 'unparsable' to rails: {'relevant', 'irrelevant'}" in out, "Snapping events should be printed"
+    assert "OpenAI invocation parameters" in out, "Model-specific information should be printed"
+    assert "'model': 'gpt-4', 'temperature': 0.0, 'max_tokens': 256" in out, "Model-specific information should be printed"
+    assert "sk-0123456789" not in out, "Credentials should not be printed out in cleartext"


 @responses.activate
 @pytest.mark.parametrize(
     "dataframe",