
feat(evals): add an output_parser to llm_generate #1736

Merged · 2 commits · Nov 14, 2023
Changes from 1 commit
30 changes: 24 additions & 6 deletions src/phoenix/experimental/evals/functions/generate.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union
 
 import pandas as pd
 
@@ -9,13 +9,18 @@
 logger = logging.getLogger(__name__)
 
 
+def _no_op_parser(response: str) -> Dict[str, str]:
+    return {"output": response}
+
+
 def llm_generate(
     dataframe: pd.DataFrame,
     template: Union[PromptTemplate, str],
     model: BaseEvalModel,
     system_instruction: Optional[str] = None,
     verbose: bool = False,
-) -> List[str]:
+    output_parser: Optional[Callable[[str], Dict[str, Any]]] = None,
Review comment (Contributor):

Suggested change:
-    output_parser: Optional[Callable[[str], Dict[str, Any]]] = None,
+    output_parser: Optional[Callable[[str], Any]] = None,

Seems like the user might want to write all sorts of parsers.

Reply (Contributor, PR author):

@axiomofjoy but we need to know what column to map it to - this at least forces you to declare the column. Or are you thinking of something different?
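A minimal sketch of the point above (illustrative names only, not code from this PR): a parser that returns a dict declares its column names, so collecting the parsed rows into a dataframe is unambiguous, whereas an arbitrary return value would leave the column name undefined.

```python
import pandas as pd

def output_parser(response: str) -> dict:
    # Hypothetical parser: split "label: text" responses into two named columns.
    label, _, text = response.partition(":")
    return {"label": label.strip(), "text": text.strip()}

rows = [output_parser(r) for r in ["relevant: looks good", "irrelevant: off topic"]]
print(pd.DataFrame(rows))  # columns "label" and "text", one row per response
```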

+) -> pd.DataFrame:
"""
Generates a text using a template using an LLM. This function is useful
if you want to generate synthetic data, such as irrelevant responses
@@ -38,16 +43,29 @@ def llm_generate(
         verbose (bool, optional): If True, prints detailed information to stdout such as model
         invocation parameters and retry info. Default False.
 
+        output_parser (Callable[[str], Dict[str, Any]], optional): An optional function
+        that takes each generated response and parses it to a dictionary. The keys of the dictionary
+        should correspond to the column names of the output dataframe. If None, the output dataframe
+        will have a single column named "output". Default None.
+
     Returns:
-        List[Optional[str]]: A list of strings representing the output of the
-        model for each record
+        pandas.DataFrame: A dataframe where each row represents the generated output
 
     """
+    output_parser = output_parser or _no_op_parser
     with set_verbosity(model, verbose) as verbose_model:
         template = normalize_template(template)
         logger.info(f"Template: \n{template.text}\n")
         logger.info(f"Template variables: {template.variables}")
         prompts = map_template(dataframe, template)
 
-        responses = verbose_model.generate(prompts.to_list(), system_instruction)
-        return responses
+        # For each prompt, generate and parse the response
+        output = []
+        for prompt in prompts:
+            logger.info(f"Prompt: {prompt}")
+            response = verbose_model(prompt, instruction=system_instruction)
+            parsed_response = output_parser(response)
+            output.append(parsed_response)
+
+        # Return the data as a dataframe
+        return pd.DataFrame(output)
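A hedged usage sketch of the new parameter. The import path and `OpenAIModel` setup are assumptions based on this branch; only the `output_parser` behavior (dict keys become dataframe columns, with a single "output" column as the default) comes from the diff above.

```python
import json
from typing import Any, Dict

import pandas as pd

# Import path assumed from this branch's layout.
from phoenix.experimental.evals import OpenAIModel, llm_generate


def json_parser(response: str) -> Dict[str, Any]:
    # Expect the model to answer with a JSON object; capture failures in an error column.
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        return {"__error__": str(e)}


dataframe = pd.DataFrame({"query": ["What is Python?", "What is C++?"]})
template = "Given {query}, answer with a JSON object with keys category and language"

generated = llm_generate(
    dataframe=dataframe,
    template=template,
    model=OpenAIModel(),        # assumes OPENAI_API_KEY is set in the environment
    output_parser=json_parser,  # each returned dict key becomes a dataframe column
)
# generated is a pandas.DataFrame with "category" and "language" columns,
# plus "__error__" for any row whose response was not valid JSON.
```

Omitting `output_parser` falls back to `_no_op_parser`, so the result is a dataframe with a single `output` column.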
76 changes: 70 additions & 6 deletions tests/experimental/evals/functions/test_generate.py
@@ -1,6 +1,9 @@
+import json
+from typing import Dict
 from unittest.mock import patch
 
 import httpx
+import numpy as np
 import pandas as pd
 import pytest
 import respx
@@ -54,12 +57,7 @@ def test_llm_generate(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
         model = OpenAIModel()
 
     generated = llm_generate(dataframe=dataframe, template=template, model=model)
-    assert generated == [
-        "it's a dialect of french",
-        "it's a music notation",
-        "It's a crazy language",
-        "it's a programming language",
-    ]
+    assert generated.iloc[:, 0].tolist() == responses


 @pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
@@ -112,3 +110,69 @@ def test_llm_generate_prints_info_with_verbose_flag(

     out, _ = capfd.readouterr()
     assert "Generating responses for 4 prompts..." in out, "Response generation should be printed"
+
+
+@pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
+def test_llm_generate_with_output_parser(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
+    monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
+    dataframe = pd.DataFrame(
+        [
+            {
+                "query": "What is Python?",
+            },
+            {
+                "query": "What is Python?",
+            },
+            {
+                "query": "What is C++?",
+            },
+            {
+                "query": "What is C++?",
+            },
+            {
+                "query": "gobbledygook",
+            },
+        ]
+    )
+    responses = [
+        '{ "category": "programming", "language": "Python" }',
+        '{ "category": "programming", "language": "Python" }',
+        '{ "category": "programming", "language": "C++" }',
+        '{ "category": "programming", "language": "C++" }',
+        "unparsable response",
+    ]
+    queries = dataframe["query"].tolist()
+
+    for query, response in zip(queries, responses):
+        matcher = M(content__contains=query) & M(content__contains=query)
+        respx_mock.route(matcher).mock(
+            return_value=httpx.Response(200, json={"choices": [{"message": {"content": response}}]})
+        )
+
+    template = "Given {query}, generate output"
+
+    with patch.object(OpenAIModel, "_init_tiktoken", return_value=None):
+        model = OpenAIModel()
+
+    def output_parser(response: str) -> Dict[str, str]:
+        try:
+            return json.loads(response)
+        except json.JSONDecodeError as e:
+            return {"__error__": str(e)}
+
+    generated = llm_generate(
+        dataframe=dataframe, template=template, model=model, output_parser=output_parser
+    )
+    # check the output is parsed correctly
+    assert generated["category"].tolist() == [
+        "programming",
+        "programming",
+        "programming",
+        "programming",
+        np.nan,
+    ]
+
+    # check the unparsable response captures the error
+    assert generated["__error__"].tolist() == [np.nan] * 4 + [
+        "Expecting value: line 1 column 1 (char 0)"
+    ]
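A note on the `np.nan` expectations in the assertions above: `pd.DataFrame` fills missing keys with NaN when the row dicts have differing keys, so the unparsable row has NaN under `category` and the parsed rows have NaN under `__error__`. A minimal standalone sketch:

```python
import pandas as pd

rows = [
    {"category": "programming", "language": "Python"},
    {"__error__": "Expecting value: line 1 column 1 (char 0)"},
]
df = pd.DataFrame(rows)
print(df["category"].tolist())   # ['programming', nan]
print(df["__error__"].tolist())  # [nan, 'Expecting value: line 1 column 1 (char 0)']
```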