@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
14451445 assert per_invocation_result .eval_status == EvalStatus .NOT_EVALUATED
14461446
14471447
@pytest.mark.asyncio
async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
  """Every invocation and the overall result are NOT_EVALUATED when the judge fails.

  The judge-model call is patched to return an error for every NL response,
  so no per-invocation score can be produced and the overall score must also
  be absent.
  """
  app_details = AppDetails(
      agent_details={
          "root": AgentDetails(
              name="root",
              instructions="Root agent instructions.",
              tool_declarations=[],
          ),
      },
  )
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="User query.")]
  )
  final_response = genai_types.Content(
      parts=[genai_types.Part(text="Final response.")]
  )
  actual = Invocation(
      app_details=app_details,
      user_content=user_content,
      intermediate_data=InvocationEvents(
          invocation_events=[
              InvocationEvent(
                  author="root",
                  content=genai_types.Content(
                      parts=[
                          genai_types.Part(text="Intermediate NL response."),
                      ]
                  ),
              ),
          ]
      ),
      final_response=final_response,
  )
  expected = Invocation(
      app_details=app_details,
      user_content=user_content,
      final_response=final_response,
  )

  async def _always_fail(nl_response, context):
    # Simulate an unrecoverable judge-model failure for every NL response.
    return None, "Judge model error."

  with patch(
      "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
      side_effect=_always_fail,
  ):
    result = await hallucinations_metric.evaluate_invocations(
        [actual, actual],
        [expected, expected],
    )

  assert len(result.per_invocation_results) == 2
  for per_invocation_result in result.per_invocation_results:
    assert per_invocation_result.score is None
    assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED
  assert result.overall_score is None
  assert result.overall_eval_status == EvalStatus.NOT_EVALUATED


1514+
14481515@pytest .mark .asyncio
14491516async def test_evaluate_invocations_partial_failure (hallucinations_metric ):
14501517 metric = hallucinations_metric
0 commit comments