diff --git a/src/phoenix/experimental/evals/functions/generate.py b/src/phoenix/experimental/evals/functions/generate.py
index cc2ccfb129..a7f5df6db7 100644
--- a/src/phoenix/experimental/evals/functions/generate.py
+++ b/src/phoenix/experimental/evals/functions/generate.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union
 
 import pandas as pd
 
@@ -9,13 +9,18 @@
 logger = logging.getLogger(__name__)
 
 
+def _no_op_parser(response: str) -> Dict[str, str]:
+    return {"output": response}
+
+
 def llm_generate(
     dataframe: pd.DataFrame,
     template: Union[PromptTemplate, str],
     model: BaseEvalModel,
     system_instruction: Optional[str] = None,
     verbose: bool = False,
-) -> List[str]:
+    output_parser: Optional[Callable[[str], Dict[str, Any]]] = None,
+) -> pd.DataFrame:
     """
     Generates a text using a template using an LLM. This function is useful
     if you want to generate synthetic data, such as irrelevant responses
@@ -38,16 +43,29 @@ def llm_generate(
         verbose (bool, optional): If True, prints detailed information to stdout such as model
         invocation parameters and retry info. Default False.
 
+        output_parser (Callable[[str], Dict[str, Any]], optional): An optional function
+        that takes each generated response and parses it to a dictionary. The keys of the dictionary
+        should correspond to the column names of the output dataframe. If None, the output dataframe
+        will have a single column named "output". Default None.
+
     Returns:
-        List[Optional[str]]: A list of strings representing the output of the
-        model for each record
+        pandas.DataFrame: A dataframe where each row represents the generated output
 
     """
+    output_parser = output_parser or _no_op_parser
     with set_verbosity(model, verbose) as verbose_model:
         template = normalize_template(template)
         logger.info(f"Template: \n{template.text}\n")
         logger.info(f"Template variables: {template.variables}")
         prompts = map_template(dataframe, template)
-        responses = verbose_model.generate(prompts.to_list(), system_instruction)
-        return responses
+        # For each prompt, generate and parse the response
+        output = []
+        for prompt in prompts:
+            logger.info(f"Prompt: {prompt}")
+            response = verbose_model(prompt, instruction=system_instruction)
+            parsed_response = output_parser(response)
+            output.append(parsed_response)
+
+        # Return the data as a dataframe
+        return pd.DataFrame(output)
diff --git a/tests/experimental/evals/functions/test_generate.py b/tests/experimental/evals/functions/test_generate.py
index 9da3cf6549..1bdf421a62 100644
--- a/tests/experimental/evals/functions/test_generate.py
+++ b/tests/experimental/evals/functions/test_generate.py
@@ -1,6 +1,9 @@
+import json
+from typing import Dict
 from unittest.mock import patch
 
 import httpx
+import numpy as np
 import pandas as pd
 import pytest
 import respx
@@ -54,61 +57,70 @@ def test_llm_generate(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
         model = OpenAIModel()
 
     generated = llm_generate(dataframe=dataframe, template=template, model=model)
-    assert generated == [
-        "it's a dialect of french",
-        "it's a music notation",
-        "It's a crazy language",
-        "it's a programming language",
-    ]
+    assert generated.iloc[:, 0].tolist() == responses
 
 
 @pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
-def test_llm_generate_prints_info_with_verbose_flag(
-    monkeypatch: pytest.MonkeyPatch, capfd, respx_mock: respx.mock
-):
+def test_llm_generate_with_output_parser(monkeypatch: pytest.MonkeyPatch, respx_mock: respx.mock):
     monkeypatch.setenv(OPENAI_API_KEY_ENVVAR_NAME, "sk-0123456789")
     dataframe = pd.DataFrame(
         [
             {
                 "query": "What is Python?",
-                "reference": "Python is a programming language.",
             },
             {
                 "query": "What is Python?",
-                "reference": "Ruby is a programming language.",
             },
             {
                 "query": "What is C++?",
-                "reference": "C++ is a programming language.",
             },
             {
                 "query": "What is C++?",
-                "reference": "irrelevant",
+            },
+            {
+                "query": "gobbledygook",
             },
         ]
     )
     responses = [
-        "it's a dialect of french",
-        "it's a music notation",
-        "It's a crazy language",
-        "it's a programming language",
+        '{ "category": "programming", "language": "Python" }',
+        '{ "category": "programming", "language": "Python" }',
+        '{ "category": "programming", "language": "C++" }',
+        '{ "category": "programming", "language": "C++" }',
+        "unparsable response",
     ]
     queries = dataframe["query"].tolist()
-    references = dataframe["reference"].tolist()
-    for query, reference, response in zip(queries, references, responses):
-        matcher = M(content__contains=query) & M(content__contains=reference)
+
+    for query, response in zip(queries, responses):
+        matcher = M(content__contains=query) & M(content__contains=query)
         respx_mock.route(matcher).mock(
            return_value=httpx.Response(200, json={"choices": [{"message": {"content": response}}]})
        )
 
-    template = (
-        "Given {query} and a golden answer {reference}, generate an answer that is incorrect."
-    )
+    template = "Given {query}, generate output"
 
     with patch.object(OpenAIModel, "_init_tiktoken", return_value=None):
         model = OpenAIModel()
 
-    llm_generate(dataframe=dataframe, template=template, model=model, verbose=True)
+    def output_parser(response: str) -> Dict[str, str]:
+        try:
+            return json.loads(response)
+        except json.JSONDecodeError as e:
+            return {"__error__": str(e)}
+
+    generated = llm_generate(
+        dataframe=dataframe, template=template, model=model, output_parser=output_parser
+    )
+    # check the output is parsed correctly
+    assert generated["category"].tolist() == [
+        "programming",
+        "programming",
+        "programming",
+        "programming",
+        np.nan,
+    ]
 
-    out, _ = capfd.readouterr()
-    assert "Generating responses for 4 prompts..." in out, "Response generation should be printed"
+    # check the unparsable response captures the error
+    assert generated["__error__"].tolist() == [np.nan] * 4 + [
+        "Expecting value: line 1 column 1 (char 0)"
+    ]
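
Reviewer note: a minimal usage sketch of the new output_parser hook follows, not part of the diff itself. It assumes llm_generate and OpenAIModel are importable from phoenix.experimental.evals (adjust the import to your package layout); the json_parser function, the template text, and the "category"/"language" column names are illustrative only. The default behavior (no parser, a single "output" column) and the error-capturing pattern mirror what the diff and its test establish.

# Sketch only: call llm_generate with an output_parser so each parsed key
# becomes a column of the returned dataframe.
import json
from typing import Any, Dict

import pandas as pd

# Assumed import path; adjust to wherever llm_generate/OpenAIModel are exposed.
from phoenix.experimental.evals import OpenAIModel, llm_generate


def json_parser(response: str) -> Dict[str, Any]:
    # Parse each raw LLM response as JSON; capture parse failures under an
    # "__error__" key instead of raising (same pattern as the new test).
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        return {"__error__": str(e)}


dataframe = pd.DataFrame({"query": ["What is Python?", "What is C++?"]})

generated = llm_generate(
    dataframe=dataframe,
    template='Classify {query}. Respond as JSON with keys "category" and "language".',
    model=OpenAIModel(),
    output_parser=json_parser,  # omit this argument to get a single "output" column
)
# `generated` has one row per input row; with the parser above its columns are
# "category" and "language" (plus "__error__" for unparsable responses).

Design-wise, the no-op parser default keeps the old one-column-per-response behavior while letting callers fan a structured response out into multiple dataframe columns.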