 from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure

 if TYPE_CHECKING:
-    from tenacity import AsyncRetrying
+    from pydantic_ai.retries import RetryConfig

 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup  # pragma: lax no cover
@@ -264,7 +264,8 @@ async def evaluate(
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-        retry: AsyncRetrying | None = None,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -279,7 +280,8 @@ async def evaluate(
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
-            retry: Optional retry configuration for the task execution.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
@@ -295,7 +297,9 @@ async def evaluate(

         async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
             async with limiter:
-                result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators, retry)
+                result = await _run_task_and_evaluators(
+                    task, case, report_case_name, self.evaluators, retry_task, retry_evaluators
+                )
                 if progress_bar and task_id is not None:  # pragma: no branch
                     progress_bar.update(task_id, advance=1)
                 return result
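For reference, the call shape after this change lets task retries and evaluator retries be configured independently. A minimal usage sketch, assuming `task_retries` and `evaluator_retries` are `RetryConfig` instances built elsewhere (how `RetryConfig` is constructed is not shown in this diff, and the task name here is hypothetical):

report = await dataset.evaluate(
    classify_ticket,                     # hypothetical task under evaluation
    retry_task=task_retries,             # retries apply only to running the task
    retry_evaluators=evaluator_retries,  # retries apply only to running the evaluators
)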
@@ -828,14 +832,14 @@ def record_attribute(self, name: str, value: Any) -> None:
 async def _run_task(
     task: Callable[[InputsT], Awaitable[OutputT] | OutputT],
     case: Case[InputsT, OutputT, MetadataT],
-    retry: AsyncRetrying | None = None,
+    retry: RetryConfig | None = None,
 ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
     """Run a task on a case and return the context for evaluators.

     Args:
         task: The task to run.
         case: The case to run the task on.
-        retry: The retry strategy to use.
+        retry: The retry config to use.

     Returns:
         An EvaluatorContext containing the inputs, actual output, expected output, and metadata.
@@ -868,11 +872,10 @@ async def _run_once():

     async def _run_with_retries():
         if retry:
-            async for attempt in retry:
-                with attempt:
-                    return await _run_once()
-        # Note: the following line will be unreachable if retry is not None
-        return await _run_once()
+            return await retry.decorator(_run_once)()
+        else:
+            # Note: the following line will be unreachable if retry is not None
+            return await _run_once()

     task_run, task_output, duration, span_tree = await _run_with_retries()
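For context on the new code path: rather than iterating `AsyncRetrying` attempts inline, the zero-argument coroutine is wrapped by a decorator obtained from the retry config and then awaited. A rough, self-contained illustration of that wrap-then-await shape using tenacity's own `retry` decorator (this only shows the general pattern; it is not the `RetryConfig.decorator` implementation, which this diff does not include):

import asyncio

from tenacity import retry, stop_after_attempt, wait_fixed

attempts = 0


async def _run_once() -> str:
    """Stand-in for the task body: fails twice, then succeeds."""
    global attempts
    attempts += 1
    if attempts < 3:
        raise RuntimeError('transient failure')
    return 'ok'


async def main() -> None:
    # retry(...) builds a decorator; applying it to the coroutine function and then
    # calling the wrapped result mirrors `await retry.decorator(_run_once)()` above.
    wrapped = retry(stop=stop_after_attempt(3), wait=wait_fixed(0))(_run_once)
    print(await wrapped())  # prints 'ok' on the third attempt


asyncio.run(main())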
@@ -913,7 +916,8 @@ async def _run_task_and_evaluators(
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-    retry: AsyncRetrying | None,
+    retry_task: RetryConfig | None,
+    retry_evaluators: RetryConfig | None,
 ) -> ReportCase[InputsT, OutputT, MetadataT] | ReportCaseFailure[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.

@@ -922,7 +926,7 @@ async def _run_task_and_evaluators(
         case: The case to run the task on.
         report_case_name: The name to use for this case in the report.
         dataset_evaluators: Evaluators from the dataset to apply to this case.
-        retry: The retry strategy to use for running the task.
+        retry_task: The retry config to use for running the task.

     Returns:
         A ReportCase containing the evaluation results.
@@ -944,7 +948,7 @@ async def _run_task_and_evaluators(
             span_id = f'{context.span_id:016x}'

         t0 = time.time()
-        scoring_context = await _run_task(task, case, retry)
+        scoring_context = await _run_task(task, case, retry_task)

         case_span.set_attribute('output', scoring_context.output)
         case_span.set_attribute('task_duration', scoring_context.duration)
@@ -956,7 +960,7 @@ async def _run_task_and_evaluators(
         evaluator_failures: list[EvaluatorFailure] = []
         if evaluators:
             evaluator_outputs_by_task = await task_group_gather(
-                [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators]
+                [lambda ev=ev: run_evaluator(ev, scoring_context, retry_evaluators) for ev in evaluators]
             )
             for outputs in evaluator_outputs_by_task:
                 if isinstance(outputs, EvaluatorFailure):