
Commit 9fbed0b

jcpagadora737 authored and copybara-github committed
fix: Overall eval status should be NOT_EVALUATED if no invocations were evaluated
PiperOrigin-RevId: 819322513
1 parent bae2102 commit 9fbed0b

2 files changed: +68 -1 lines changed


src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 1 addition & 1 deletion
@@ -694,7 +694,7 @@ def _aggregate_invocation_results(
     if not valid_results:
       return EvaluationResult(
           overall_score=None,
-          overall_eval_status=EvalStatus.FAILED,
+          overall_eval_status=EvalStatus.NOT_EVALUATED,
           per_invocation_results=per_invocation_results,
       )

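Taken together, the change means the metric now distinguishes "nothing could be scored" from "something was scored and failed". Below is a standalone sketch of that aggregation behavior; the type definitions, the score filter, and the threshold logic are illustrative assumptions rather than the actual ADK source, and only the empty-valid-results branch mirrors the hunk above:

from __future__ import annotations

import enum
from dataclasses import dataclass, field

class EvalStatus(enum.Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3

@dataclass
class PerInvocationResult:
  score: float | None
  eval_status: EvalStatus

@dataclass
class EvaluationResult:
  overall_score: float | None
  overall_eval_status: EvalStatus
  per_invocation_results: list = field(default_factory=list)

def aggregate(per_invocation_results, threshold=0.8):
  # Assumed filter: only invocations that actually produced a score count.
  valid = [r for r in per_invocation_results if r.score is not None]
  if not valid:
    # The fixed branch: nothing was evaluated, so report NOT_EVALUATED;
    # before this commit the status here was FAILED.
    return EvaluationResult(None, EvalStatus.NOT_EVALUATED,
                            per_invocation_results)
  # Assumed aggregation: mean score compared against an assumed threshold.
  overall = sum(r.score for r in valid) / len(valid)
  status = EvalStatus.PASSED if overall >= threshold else EvalStatus.FAILED
  return EvaluationResult(overall, status, per_invocation_results)

# Mirrors the new test below: every judge call errored, nothing was scored.
results = [PerInvocationResult(None, EvalStatus.NOT_EVALUATED)] * 2
assert aggregate(results).overall_eval_status == EvalStatus.NOT_EVALUATED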
tests/unittests/evaluation/test_hallucinations_v1.py

Lines changed: 67 additions & 0 deletions
@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
   assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED


+@pytest.mark.asyncio
+async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
+  metric = hallucinations_metric
+  app_details = AppDetails(
+      agent_details={
+          "root": AgentDetails(
+              name="root",
+              instructions="Root agent instructions.",
+              tool_declarations=[],
+          ),
+      },
+  )
+  user_content = genai_types.Content(
+      parts=[genai_types.Part(text="User query.")]
+  )
+  actual_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="root",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(text="Intermediate NL response."),
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+  expected_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+
+  async def mock_evaluate_nl_response(nl_response, context):
+    return None, "Judge model error."
+
+  with patch(
+      "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
+      side_effect=mock_evaluate_nl_response,
+  ):
+    result = await metric.evaluate_invocations(
+        [actual_invocation, actual_invocation],
+        [expected_invocation, expected_invocation],
+    )
+
+  assert len(result.per_invocation_results) == 2
+  assert result.per_invocation_results[0].score is None
+  assert (
+      result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
+  )
+  assert result.per_invocation_results[1].score is None
+  assert (
+      result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
+  )
+  assert result.overall_score is None
+  assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
+
+
 @pytest.mark.asyncio
 async def test_evaluate_invocations_partial_failure(hallucinations_metric):
   metric = hallucinations_metric
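The test's key move is forcing every judge call to fail: _evaluate_nl_response is patched to return (None, "Judge model error."), so neither invocation gets a score and the aggregation's empty-results branch is exercised end to end. To run just this case locally, one option is pytest's programmatic entry point; this assumes the repo's test dependencies, including pytest-asyncio, are installed:

import pytest

# Select the single new test by its node id; -q keeps the output terse.
pytest.main([
    "tests/unittests/evaluation/test_hallucinations_v1.py"
    "::test_evaluate_all_invocations_not_evaluated",
    "-q",
])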
