BREAKING CHANGE: Remove Python evaluator for security reasons (#2808)

dmontagu · web-flow · commit 55c68140b913 · 2025-09-05T15:03:40.000Z
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -7,6 +7,12 @@ Once we release V2, in April 2026 at the earliest, we'll continue to provide sec
 
 Here's a filtered list of the breaking changes for each version to help you upgrade Pydantic AI.
 
+### v1.0.1 (2025-09-05)
+
+The following breaking change was accidentally left out of v1.0.0:
+
+- See [#2808](https://github.com/pydantic/pydantic-ai/pull/2808) - Remove `Python` evaluator from `pydantic_evals` for security reasons
+
 ### v1.0.0 (2025-09-04)
 
 - See [#2725](https://github.com/pydantic/pydantic-ai/pull/2725) - Drop support for Python 3.9
diff --git a/pydantic_evals/pydantic_evals/evaluators/__init__.py b/pydantic_evals/pydantic_evals/evaluators/__init__.py
@@ -7,7 +7,6 @@
     LLMJudge,
     MaxDuration,
     OutputConfig,
-    Python,
 )
 from .context import EvaluatorContext
 from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec
@@ -22,7 +21,6 @@
     'LLMJudge',
     'HasMatchingSpan',
     'OutputConfig',
-    'Python',
     # context
     'EvaluatorContext',
     # evaluator
@@ -34,3 +32,11 @@
     'EvaluationReason',
     'EvaluationResult',
 )
+
+
+def __getattr__(name: str):  # PEP 562 hook: runs only for names the module does not define
+    if name != 'Python':  # any other missing name keeps the standard missing-attribute error
+        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
+    raise ImportError(  # explain the removal to anyone still importing the evaluator
+        'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+    )
diff --git a/pydantic_evals/pydantic_evals/evaluators/common.py b/pydantic_evals/pydantic_evals/evaluators/common.py
@@ -21,7 +21,6 @@
     'MaxDuration',
     'LLMJudge',
     'HasMatchingSpan',
-    'Python',
     'OutputConfig',
 )
 
@@ -268,22 +267,6 @@ def evaluate(
         return ctx.span_tree.any(self.query)
 
 
-# TODO: Consider moving this to docs rather than providing it with the library, given the security implications
-@dataclass(repr=False)
-class Python(Evaluator[object, object, object]):
-    """The output of this evaluator is the result of evaluating the provided Python expression.
-
-    ***WARNING***: this evaluator runs arbitrary Python code, so you should ***NEVER*** use it with untrusted inputs.
-    """
-
-    expression: str
-    evaluation_name: str | None = field(default=None)
-
-    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
-        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
-        return eval(self.expression, {'ctx': ctx})
-
-
 DEFAULT_EVALUATORS: tuple[type[Evaluator[object, object, object]], ...] = (
     Equals,
     EqualsExpected,
@@ -292,5 +275,12 @@ def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOu
     MaxDuration,
     LLMJudge,
     HasMatchingSpan,
-    # Python,  # not included by default for security reasons
 )
+
+
+def __getattr__(name: str):  # PEP 562 hook: runs only for names the module does not define
+    if name != 'Python':  # any other missing name keeps the standard missing-attribute error
+        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
+    raise ImportError(  # explain the removal to anyone still importing the evaluator
+        'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+    )
diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py
@@ -2,7 +2,7 @@
 
 import json
 import sys
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -27,7 +27,6 @@
         EvaluatorOutput,
         EvaluatorSpec,
         LLMJudge,
-        Python,
     )
     from pydantic_evals.evaluators.context import EvaluatorContext
     from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter, ReportCaseFailure
@@ -41,6 +40,15 @@ class MockEvaluator(Evaluator[object, object, object]):
         def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
             return self.output
 
+    @dataclass(repr=False)
+    class Python(Evaluator[object, object, object]):  # test-only copy of the evaluator removed from pydantic_evals
+        expression: str  # source text handed to `eval` by `evaluate`
+        evaluation_name: str | None = field(default=None)
+
+        def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
+            # Run the expression with the evaluator context exposed as `ctx`; never feed it untrusted input.
+            return eval(self.expression, {'ctx': ctx})
+
 
 with try_import() as tenacity_import_successful:
     from tenacity import stop_after_attempt
@@ -135,6 +143,7 @@ async def test_add_evaluator(
     simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]],
 ):
     """Test adding evaluators to a dataset."""
+
     assert len(example_dataset.evaluators) == 0
 
     example_dataset.add_evaluator(simple_evaluator())
diff --git a/tests/evals/test_evaluator_common.py b/tests/evals/test_evaluator_common.py
@@ -18,7 +18,6 @@
 
     from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
     from pydantic_evals.evaluators.common import (
-        DEFAULT_EVALUATORS,
         Contains,
         Equals,
         EqualsExpected,
@@ -27,7 +26,6 @@
         LLMJudge,
         MaxDuration,
         OutputConfig,
-        Python,
     )
     from pydantic_evals.otel._context_in_memory_span_exporter import context_subtree
     from pydantic_evals.otel._errors import SpanTreeRecordingError
@@ -395,68 +393,6 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
     )
 
 
-async def test_python():
-    """Test Python evaluator."""
-    evaluator = Python(expression='ctx.output > 0')
-
-    # Test with valid expression
-    assert evaluator.evaluate(MockContext(output=42)) is True
-    assert evaluator.evaluate(MockContext(output=-1)) is False
-
-    # Test with invalid expression
-    evaluator_invalid = Python(expression='invalid syntax')
-    with pytest.raises(SyntaxError):
-        evaluator_invalid.evaluate(MockContext(output=42))
-
-
-async def test_python_evaluator():
-    """Test Python evaluator."""
-    ctx = EvaluatorContext(
-        name='test',
-        inputs={'x': 42},
-        metadata=None,
-        expected_output=None,
-        output={'y': 84},
-        duration=0.0,
-        _span_tree=SpanTreeRecordingError('did not record spans'),
-        attributes={},
-        metrics={},
-    )
-
-    # Test simple expression
-    evaluator = Python(expression='ctx.output["y"] == 84')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test accessing inputs
-    evaluator = Python(expression='ctx.inputs["x"] * 2 == ctx.output["y"]')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test complex expression
-    evaluator = Python(expression='all(k in ctx.output for k in ["y"])')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test invalid expression
-    evaluator = Python(expression='invalid syntax')
-    with pytest.raises(SyntaxError):
-        evaluator.evaluate(ctx)
-
-    # Test expression with undefined variables
-    evaluator = Python(expression='undefined_var')
-    with pytest.raises(NameError):
-        evaluator.evaluate(ctx)
-
-    # Test expression with type error
-    evaluator = Python(expression='ctx.output + 1')  # Can't add dict and int
-    with pytest.raises(TypeError):
-        evaluator.evaluate(ctx)
-
-
-def test_default_evaluators():
-    """Test DEFAULT_EVALUATORS tuple."""
-    # Verify that Python evaluator is not included for security reasons
-    assert Python not in DEFAULT_EVALUATORS
-
-
 async def test_span_query_evaluator(capfire: CaptureLogfire):
     """Test HasMatchingSpan evaluator."""
     # Create a span tree with a known structure
diff --git a/tests/evals/test_evaluators.py b/tests/evals/test_evaluators.py
@@ -27,7 +27,6 @@
         IsInstance,
         LLMJudge,
         MaxDuration,
-        Python,
     )
     from pydantic_evals.evaluators.context import EvaluatorContext
     from pydantic_evals.evaluators.evaluator import (
@@ -579,34 +578,43 @@ async def test_span_query_evaluator(
     assert result is False
 
 
-async def test_python_evaluator(test_context: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]):
-    """Test the python evaluator."""
-    # Test with a simple condition
-    evaluator = Python(expression="ctx.output.answer == '4'")
-    assert evaluator.evaluate(test_context) == snapshot(True)
-
-    # Test type sensitivity
-    evaluator = Python(expression='ctx.output.answer == 4')
-    assert evaluator.evaluate(test_context) == snapshot(False)
-
-    # Test with a named condition
-    evaluator = Python(expression="{'correct_answer': ctx.output.answer == '4'}")
-    assert evaluator.evaluate(test_context) == snapshot({'correct_answer': True})
-
-    # Test with a condition that returns false
-    evaluator = Python(expression="ctx.output.answer == '5'")
-    assert evaluator.evaluate(test_context) == snapshot(False)
-
-    # Test with a condition that accesses context properties
-    evaluator = Python(expression="ctx.output.answer == '4' and ctx.metadata.difficulty == 'easy'")
-    assert evaluator.evaluate(test_context) == snapshot(True)
-
-    # Test reason rendering for strings
-    evaluator = Python(expression='ctx.output.answer')
-    assert evaluator.evaluate(test_context) == snapshot('4')
-
-    # Test with a condition that returns a dict
-    evaluator = Python(
-        expression="{'is_correct': ctx.output.answer == '4', 'is_easy': ctx.metadata.difficulty == 'easy'}"
-    )
-    assert evaluator.evaluate(test_context) == snapshot({'is_correct': True, 'is_easy': True})
+async def test_import_errors():
+    """Importing the removed `Python` evaluator raises a descriptive ImportError; other names fail normally."""
+    # `match` is a regex applied with `re.search`, so the literal dots are escaped to keep the checks exact.
+    removed_message = (
+        r'The `Python` evaluator has been removed for security reasons\. '
+        r'See https://github\.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround\.'
+    )
+    with pytest.raises(ImportError, match=removed_message):
+        from pydantic_evals.evaluators import Python  # pyright: ignore[reportUnusedImport]
+
+    with pytest.raises(ImportError, match=removed_message):
+        from pydantic_evals.evaluators.common import Python  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+    with pytest.raises(
+        ImportError,
+        match=r"cannot import name 'Foo' from 'pydantic_evals\.evaluators'",
+    ):
+        from pydantic_evals.evaluators import Foo  # pyright: ignore[reportUnusedImport]
+
+    with pytest.raises(
+        ImportError,
+        match=r"cannot import name 'Foo' from 'pydantic_evals\.evaluators\.common'",
+    ):
+        from pydantic_evals.evaluators.common import Foo  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+    with pytest.raises(
+        AttributeError,
+        match=r"module 'pydantic_evals\.evaluators' has no attribute 'Foo'",
+    ):
+        import pydantic_evals.evaluators as _evaluators
+
+        _evaluators.Foo
+
+    with pytest.raises(
+        AttributeError,
+        match=r"module 'pydantic_evals\.evaluators\.common' has no attribute 'Foo'",
+    ):
+        import pydantic_evals.evaluators.common as _common
+
+        _common.Foo