Merged
24 commits
46c5e48  Adding test + BinaryContent Parsing (adtyavrdhn, Jul 10, 2025)
293c75f  Adding test for single UserContent as well as Sequence of UserContent (adtyavrdhn, Jul 11, 2025)
2065a86  Adding test for single UserContent as well as Sequence of UserContent (adtyavrdhn, Jul 11, 2025)
6397c1c  refactoring _build_prompt (adtyavrdhn, Jul 11, 2025)
34bb131  handling when inputs is a sequence of UserContent (adtyavrdhn, Jul 11, 2025)
33d79e1  handling pylance errors (adtyavrdhn, Jul 11, 2025)
1630c75  refactoring _build_prompt (adtyavrdhn, Jul 11, 2025)
c25db69  removing .strips() (adtyavrdhn, Jul 11, 2025)
fa2b533  Merge branch 'main' of https://github.com/pydantic/pydantic-ai into b… (adtyavrdhn, Jul 11, 2025)
d09873e  fix lint (adtyavrdhn, Jul 11, 2025)
75a9b8e  Merge branch 'main' into binary_content_incorrect_parsing_fix (adtyavrdhn, Jul 16, 2025)
e909876  suggested changes (adtyavrdhn, Jul 17, 2025)
97928ec  suggested changes refactoring (adtyavrdhn, Jul 17, 2025)
c4b7574  adding test for non_str data (adtyavrdhn, Jul 17, 2025)
cc1fb55  lint (adtyavrdhn, Jul 17, 2025)
1e33fbe  modifying test (adtyavrdhn, Jul 17, 2025)
612c7f4  modifying test, adding more (adtyavrdhn, Jul 17, 2025)
6002871  lint (adtyavrdhn, Jul 17, 2025)
33c7516  adding <Input> tags (adtyavrdhn, Jul 17, 2025)
72aaebc  adding extra test (adtyavrdhn, Jul 17, 2025)
7307e9f  lint (adtyavrdhn, Jul 17, 2025)
9ad4084  coverage (adtyavrdhn, Jul 21, 2025)
e40f952  Merge branch 'main' into binary_content_incorrect_parsing_fix (adtyavrdhn, Jul 21, 2025)
24d79f3  Apply suggestions from code review (DouweM, Jul 21, 2025)
97 changes: 45 additions & 52 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
+from collections.abc import Sequence
 from textwrap import dedent
 from typing import Any
 
 from pydantic import BaseModel, Field
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.messages import MultiModalContentTypes, UserContent
 from pydantic_ai.settings import ModelSettings
 
 __all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric)
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -112,19 +105,8 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)
 
     return (
         await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
     return (
         await _judge_output_expected_agent.run(
             user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
         return to_json(value).decode()
     except Exception:
         return repr(value)
+
+
+def _build_prompt(
+    output: Any,
+    rubric: str,
+    inputs: Any | None = None,
+    expected_output: Any | None = None,
+) -> str | Sequence[str | UserContent]:
+    """Build a prompt that includes input, output, and rubric."""
+    sections: list[str | UserContent] = []
+
+    if inputs is not None:
+        if isinstance(inputs, str):
+            sections.append(f'<Input>\n{inputs}\n</Input>')
+        else:
+            sections.append('<Input>\n')
+            if isinstance(inputs, Sequence):
+                for item in inputs:  # type: ignore
+                    if isinstance(item, (str, MultiModalContentTypes)):
+                        sections.append(item)
+                    else:
+                        sections.append(_stringify(item))
+            elif isinstance(inputs, MultiModalContentTypes):
+                sections.append(inputs)
+            else:
+                sections.append(_stringify(inputs))
+            sections.append('</Input>')
+
+    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+    if expected_output is not None:
+        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+    if inputs is None or isinstance(inputs, str):
+        return '\n\n'.join(sections)  # type: ignore[arg-type]
+    else:
+        return sections
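
Taken together, the refactor keeps the old behavior for text-only evaluations (the tagged sections are still joined into a single prompt string) while letting multimodal inputs such as BinaryContent flow through to the judge model as structured UserContent parts instead of being JSON-serialized by _stringify. Below is a minimal sketch of the two paths; the import path of _build_prompt is an assumption (it is a private helper and may move between releases), and the BinaryContent(data=..., media_type=...) constructor is taken from current pydantic-ai.

# Sketch only: _build_prompt is private, so this import is an assumption.
from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import _build_prompt

# Text-only inputs: the sections are joined into one tagged prompt string.
prompt = _build_prompt(inputs='What is 2 + 2?', output='4', rubric='Answer must be correct')
assert isinstance(prompt, str) and '<Input>' in prompt

# Multimodal inputs: the BinaryContent part is forwarded as-is inside the
# returned sequence rather than JSON-encoded into the prompt text.
image = BinaryContent(data=b'\x89PNG...', media_type='image/png')
parts = _build_prompt(inputs=['Describe this image', image], output='A cat', rubric='Must mention the cat')
assert isinstance(parts, list) and image in parts

Note that a mixed sequence still falls back to _stringify for items that are neither strings nor multimodal content, which is presumably what the "adding test for non_str data" commit (c4b7574) exercises.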