@@ -29,7 +29,9 @@ def evaluate_dataset():
report = dataset.evaluate_sync(infer_time_range)
print(report)

assertion_pass_rate = report.averages().assertions
averages = report.averages()
assert averages is not None
assertion_pass_rate = averages.assertions
assert assertion_pass_rate is not None, 'There should be at least one assertion'
assert assertion_pass_rate > 0.9, (
f'The assertion pass rate was {assertion_pass_rate:.1%}; it should be above 90%.'
222 changes: 140 additions & 82 deletions pydantic_evals/pydantic_evals/dataset.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pydantic_evals/pydantic_evals/evaluators/__init__.py
@@ -10,7 +10,7 @@
Python,
)
from .context import EvaluatorContext
from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec
from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec

__all__ = (
# common
@@ -27,6 +27,8 @@
'EvaluatorContext',
# evaluator
'Evaluator',
'EvaluationReason',
'EvaluatorFailure',
'EvaluatorOutput',
'EvaluatorSpec',
'EvaluationReason',
68 changes: 59 additions & 9 deletions pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py
@@ -1,25 +1,55 @@
from __future__ import annotations

import traceback
from collections.abc import Mapping
from typing import Any
from typing import TYPE_CHECKING, Any

import logfire_api
from pydantic import (
TypeAdapter,
ValidationError,
)
from typing_extensions import TypeVar

from .context import EvaluatorContext
from .evaluator import EvaluationReason, EvaluationResult, EvaluationScalar, Evaluator, EvaluatorOutput
from .evaluator import (
EvaluationReason,
EvaluationResult,
EvaluationScalar,
Evaluator,
EvaluatorFailure,
EvaluatorOutput,
)

if TYPE_CHECKING:
# TODO: pydantic_evals should not import from pydantic_ai...
# We still need a clean way to make retry behavior available to evaluators;
# the retry logic will likely be shared by both pydantic_ai and pydantic_evals.
from pydantic_ai.retries import RetryConfig

# while waiting for https://github.com/pydantic/logfire/issues/745
try:
import logfire._internal.stack_info
except ImportError:
pass
else:
from pathlib import Path

logfire._internal.stack_info.NON_USER_CODE_PREFIXES += (str(Path(__file__).parent.absolute()),) # pyright: ignore[reportPrivateImportUsage]


InputsT = TypeVar('InputsT', default=Any, contravariant=True)
OutputT = TypeVar('OutputT', default=Any, contravariant=True)
MetadataT = TypeVar('MetadataT', default=Any, contravariant=True)

_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')


async def run_evaluator(
evaluator: Evaluator[InputsT, OutputT, MetadataT], ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
) -> list[EvaluationResult]:
evaluator: Evaluator[InputsT, OutputT, MetadataT],
ctx: EvaluatorContext[InputsT, OutputT, MetadataT],
retry: RetryConfig | None = None,
) -> list[EvaluationResult] | EvaluatorFailure:
"""Run an evaluator and return the results.

This function runs an evaluator on the given context and processes the results into
@@ -28,19 +58,39 @@ async def run_evaluator(
Args:
evaluator: The evaluator to run.
ctx: The context containing the inputs, outputs, and metadata for evaluation.
retry: The retry configuration to use for running the evaluator.

Returns:
A list of evaluation results.
A list of evaluation results, or an evaluator failure if an exception is raised during its execution.

Raises:
ValueError: If the evaluator returns a value of an invalid type.
"""
raw_results = await evaluator.evaluate_async(ctx)
evaluate = evaluator.evaluate_async
if retry is not None:
# import from pydantic_ai.retries to trigger a more descriptive import error if tenacity is missing
from pydantic_ai.retries import retry as tenacity_retry

evaluate = tenacity_retry(**retry)(evaluate)

try:
results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
except ValidationError as e:
raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
with _logfire.span(
'evaluator: {evaluator_name}',
evaluator_name=evaluator.get_default_evaluation_name(),
):
raw_results = await evaluate(ctx)

try:
results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
except ValidationError as e:
raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
except Exception as e:
return EvaluatorFailure(
name=evaluator.get_default_evaluation_name(),
error_message=f'{type(e).__name__}: {e}',
error_stacktrace=traceback.format_exc(),
source=evaluator.as_spec(),
)

results = _convert_to_mapping(results, scalar_name=evaluator.get_default_evaluation_name())

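As a reading aid (not part of the diff): a minimal sketch of how the new `retry` parameter might be supplied, assuming `RetryConfig` accepts standard tenacity keyword arguments, as the `tenacity_retry(**retry)(evaluate)` call above suggests. The `evaluator` and `ctx` objects are placeholders, not defined by this PR.

```python
from tenacity import stop_after_attempt, wait_exponential

# Assumed shape of a RetryConfig: a mapping of tenacity keyword arguments
# (inferred from `tenacity_retry(**retry)(evaluate)` in the diff above).
retry_config = {
    'stop': stop_after_attempt(3),                    # give each evaluator up to three attempts
    'wait': wait_exponential(multiplier=0.5, max=5),  # exponential backoff between attempts
    'reraise': True,                                  # let the final exception reach the except block above
}

# Placeholder call; `evaluator` and `ctx` must be a real Evaluator and EvaluatorContext.
# results_or_failure = await run_evaluator(evaluator, ctx, retry=retry_config)
```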
11 changes: 11 additions & 0 deletions pydantic_evals/pydantic_evals/evaluators/evaluator.py
@@ -25,6 +25,7 @@
'EvaluationResult',
'EvaluationScalar',
'Evaluator',
'EvaluatorFailure',
'EvaluatorOutput',
'EvaluatorSpec',
)
@@ -100,6 +101,16 @@ def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
return None


@dataclass
class EvaluatorFailure:
"""Represents a failure raised during the execution of an evaluator."""

name: str
error_message: str
error_stacktrace: str
source: EvaluatorSpec


# Evaluators are contravariant in all of its parameters.
InputsT = TypeVar('InputsT', default=Any, contravariant=True)
"""Type variable for the inputs type of the task being evaluated."""
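A hedged sketch of how downstream code might branch on the new `list[EvaluationResult] | EvaluatorFailure` union. The `report_outcome` helper is hypothetical, and the import of `run_evaluator` from the internal `_run_evaluator` module is shown only for illustration.

```python
from pydantic_evals.evaluators import EvaluatorFailure
from pydantic_evals.evaluators._run_evaluator import run_evaluator  # internal module, used here only for illustration


async def report_outcome(evaluator, ctx):
    """Hypothetical helper: run one evaluator and report failures rather than propagating them."""
    outcome = await run_evaluator(evaluator, ctx)
    if isinstance(outcome, EvaluatorFailure):
        # The evaluator raised; its error message and stack trace were captured for reporting.
        print(f'{outcome.name} failed: {outcome.error_message}')
        print(outcome.error_stacktrace)
        return []
    return outcome  # list[EvaluationResult]
```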