
Commit 348e552

ankursharmas authored and copybara-github committed
chore: Remove deprecated run_evals from cli_eval.py
This change removes the `run_evals` function and its helper `_get_evaluator` from `cli_eval.py`, as they were marked as deprecated. Corresponding test mocks and patches in `test_fast_api.py` are also removed.

PiperOrigin-RevId: 818719422
1 parent e212ff5 commit 348e552
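
Migration note: the deprecation message on the removed `run_evals` pointed callers at `LocalEvalService`. As a rough orientation, the sketch below shows what that migration path might look like when driven through the `BaseEvalService`-style request types that `cli_eval.py` still imports (`InferenceRequest`, `EvaluateRequest`, `EvaluateConfig`). The `LocalEvalService` constructor arguments and the request field names shown here are assumptions for illustration only, not verified API; consult the evaluation module for the authoritative signatures.

# Hypothetical migration sketch away from the removed run_evals(). The
# constructor arguments and request field names below are assumptions and
# may not match the real LocalEvalService API exactly.
from google.adk.evaluation.base_eval_service import EvaluateConfig
from google.adk.evaluation.base_eval_service import EvaluateRequest
from google.adk.evaluation.base_eval_service import InferenceConfig
from google.adk.evaluation.base_eval_service import InferenceRequest
from google.adk.evaluation.local_eval_service import LocalEvalService


async def run_eval_set(root_agent, eval_sets_manager, app_name, eval_set_id, eval_metrics):
  # Assumed constructor; the real service may accept additional collaborators
  # (session/artifact services, a results manager, and so on).
  eval_service = LocalEvalService(
      root_agent=root_agent,
      eval_sets_manager=eval_sets_manager,
  )

  # Step 1: run inference for the cases in the eval set (field names assumed).
  inference_request = InferenceRequest(
      app_name=app_name,
      eval_set_id=eval_set_id,
      eval_case_ids=[],  # assumption: an empty list selects every case in the set
      inference_config=InferenceConfig(),
  )
  inference_results = [
      result async for result in eval_service.perform_inference(inference_request)
  ]

  # Step 2: score the collected inferences against the configured metrics.
  evaluate_request = EvaluateRequest(
      inference_results=inference_results,
      evaluate_config=EvaluateConfig(eval_metrics=eval_metrics),
  )
  async for eval_case_result in eval_service.evaluate(evaluate_request):
    yield eval_case_result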

File tree

src/google/adk/cli/adk_web_server.py
src/google/adk/cli/cli_eval.py
tests/unittests/cli/test_fast_api.py

3 files changed: +1, -213 lines

src/google/adk/cli/adk_web_server.py
Lines changed: 1 addition & 1 deletion

@@ -72,6 +72,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import EvalStatus
 from ..evaluation.eval_metrics import MetricInfo
 from ..evaluation.eval_result import EvalSetResult
 from ..evaluation.eval_set import EvalSet
@@ -85,7 +86,6 @@
 from ..sessions.session import Session
 from ..utils.context_utils import Aclosing
 from .cli_eval import EVAL_SESSION_ID_PREFIX
-from .cli_eval import EvalStatus
 from .utils import cleanup
 from .utils import common
 from .utils import envs

src/google/adk/cli/cli_eval.py
Lines changed: 0 additions & 182 deletions

@@ -15,42 +15,27 @@
 from __future__ import annotations

 import importlib.util
-import inspect
 import logging
 import os
 import sys
 from typing import Any
-from typing import AsyncGenerator
 from typing import Optional
-import uuid

 import click
 from google.genai import types as genai_types
-from typing_extensions import deprecated

 from ..agents.llm_agent import Agent
-from ..artifacts.base_artifact_service import BaseArtifactService
 from ..evaluation.base_eval_service import BaseEvalService
 from ..evaluation.base_eval_service import EvaluateConfig
 from ..evaluation.base_eval_service import EvaluateRequest
-from ..evaluation.base_eval_service import InferenceConfig
 from ..evaluation.base_eval_service import InferenceRequest
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
-from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_case import get_all_tool_calls
 from ..evaluation.eval_case import IntermediateDataType
-from ..evaluation.eval_config import BaseCriterion
-from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
-from ..evaluation.eval_metrics import EvalMetricResult
-from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
-from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.eval_sets_manager import EvalSetsManager
-from ..evaluation.evaluator import EvalStatus
-from ..evaluation.evaluator import Evaluator
-from ..sessions.base_session_service import BaseSessionService
 from ..utils.context_utils import Aclosing

 logger = logging.getLogger("google_adk." + __name__)
@@ -172,147 +157,6 @@ async def _collect_eval_results(
   return eval_results


-@deprecated(
-    "This method is deprecated and will be removed in fututre release. Please"
-    " use LocalEvalService to define your custom evals."
-)
-async def run_evals(
-    eval_cases_by_eval_set_id: dict[str, list[EvalCase]],
-    root_agent: Agent,
-    reset_func: Optional[Any],
-    eval_metrics: list[EvalMetric],
-    session_service: Optional[BaseSessionService] = None,
-    artifact_service: Optional[BaseArtifactService] = None,
-) -> AsyncGenerator[EvalCaseResult, None]:
-  """Returns a stream of EvalCaseResult for each eval case that was evaluated.
-
-  Args:
-    eval_cases_by_eval_set_id: Eval cases categorized by eval set id to which
-      they belong.
-    root_agent: Agent to use for inferencing.
-    reset_func: If present, this will be called before invoking the agent before
-      every inferencing step.
-    eval_metrics: A list of metrics that should be used during evaluation.
-    session_service: The session service to use during inferencing.
-    artifact_service: The artifact service to use during inferencing.
-  """
-  try:
-    from ..evaluation.evaluation_generator import EvaluationGenerator
-  except ModuleNotFoundError as e:
-    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
-
-  for eval_set_id, eval_cases in eval_cases_by_eval_set_id.items():
-    for eval_case in eval_cases:
-      eval_name = eval_case.eval_id
-      initial_session = eval_case.session_input
-      user_id = initial_session.user_id if initial_session else "test_user_id"
-
-      try:
-        print(f"Running Eval: {eval_set_id}:{eval_name}")
-        session_id = f"{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}"
-
-        inference_result = (
-            await EvaluationGenerator._generate_inferences_from_root_agent(
-                invocations=eval_case.conversation,
-                root_agent=root_agent,
-                reset_func=reset_func,
-                initial_session=initial_session,
-                session_id=session_id,
-                session_service=session_service,
-                artifact_service=artifact_service,
-            )
-        )
-
-        # Initialize the per-invocation metric results to an empty list.
-        # We will fill this as we evaluate each metric.
-        eval_metric_result_per_invocation = []
-        for actual, expected in zip(inference_result, eval_case.conversation):
-          eval_metric_result_per_invocation.append(
-              EvalMetricResultPerInvocation(
-                  actual_invocation=actual,
-                  expected_invocation=expected,
-                  eval_metric_results=[],
-              )
-          )
-
-        overall_eval_metric_results = []
-
-        for eval_metric in eval_metrics:
-          metric_evaluator = _get_evaluator(eval_metric)
-
-          if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
-            evaluation_result = await metric_evaluator.evaluate_invocations(
-                actual_invocations=inference_result,
-                expected_invocations=eval_case.conversation,
-            )
-          else:
-            evaluation_result = metric_evaluator.evaluate_invocations(
-                actual_invocations=inference_result,
-                expected_invocations=eval_case.conversation,
-            )
-
-          overall_eval_metric_results.append(
-              EvalMetricResult(
-                  metric_name=eval_metric.metric_name,
-                  threshold=eval_metric.threshold,
-                  score=evaluation_result.overall_score,
-                  eval_status=evaluation_result.overall_eval_status,
-              )
-          )
-          for index, per_invocation_result in enumerate(
-              evaluation_result.per_invocation_results
-          ):
-            eval_metric_result_per_invocation[index].eval_metric_results.append(
-                EvalMetricResult(
-                    metric_name=eval_metric.metric_name,
-                    threshold=eval_metric.threshold,
-                    score=per_invocation_result.score,
-                    eval_status=per_invocation_result.eval_status,
-                )
-            )
-
-        final_eval_status = EvalStatus.NOT_EVALUATED
-        # Go over the all the eval statuses and mark the final eval status as
-        # passed if all of them pass, otherwise mark the final eval status to
-        # failed.
-        for overall_eval_metric_result in overall_eval_metric_results:
-          overall_eval_status = overall_eval_metric_result.eval_status
-          if overall_eval_status == EvalStatus.PASSED:
-            final_eval_status = EvalStatus.PASSED
-          elif overall_eval_status == EvalStatus.NOT_EVALUATED:
-            continue
-          elif overall_eval_status == EvalStatus.FAILED:
-            final_eval_status = EvalStatus.FAILED
-            break
-          else:
-            raise ValueError("Unknown eval status.")
-
-        yield EvalCaseResult(
-            eval_set_file=eval_set_id,
-            eval_set_id=eval_set_id,
-            eval_id=eval_name,
-            final_eval_status=final_eval_status,
-            eval_metric_results=[],
-            overall_eval_metric_results=overall_eval_metric_results,
-            eval_metric_result_per_invocation=eval_metric_result_per_invocation,
-            session_id=session_id,
-            user_id=user_id,
-        )
-
-        if final_eval_status == EvalStatus.PASSED:
-          result = "✅ Passed"
-        else:
-          result = "❌ Failed"
-
-        print(f"Result: {result}\n")
-      except ModuleNotFoundError as e:
-        raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
-      except Exception:
-        # Catching the general exception, so that we don't block other eval
-        # cases.
-        logger.exception("Eval failed for `%s:%s`", eval_set_id, eval_name)
-
-
 def _convert_content_to_text(
     content: Optional[genai_types.Content],
 ) -> str:
@@ -413,32 +257,6 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
   click.echo("\n\n")  # Few empty lines for visual clarity


-def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
-  try:
-    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
-    from ..evaluation.response_evaluator import ResponseEvaluator
-    from ..evaluation.safety_evaluator import SafetyEvaluatorV1
-    from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
-  except ModuleNotFoundError as e:
-    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
-  if eval_metric.metric_name == TOOL_TRAJECTORY_SCORE_KEY:
-    return TrajectoryEvaluator(threshold=eval_metric.threshold)
-  elif (
-      eval_metric.metric_name == RESPONSE_MATCH_SCORE_KEY
-      or eval_metric.metric_name == RESPONSE_EVALUATION_SCORE_KEY
-  ):
-    return ResponseEvaluator(
-        threshold=eval_metric.threshold, metric_name=eval_metric.metric_name
-    )
-  elif eval_metric.metric_name == SAFETY_V1_KEY:
-    return SafetyEvaluatorV1(eval_metric)
-  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
-    eval_metric.judge_model_options = JudgeModelOptions()
-    return FinalResponseMatchV2Evaluator(eval_metric)
-
-  raise ValueError(f"Unsupported eval metric: {eval_metric}")
-
-
 def get_eval_sets_manager(
     eval_storage_uri: Optional[str], agents_dir: str
 ) -> EvalSetsManager:
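
For readers tracing exactly what was dropped: the core of the removed `run_evals` loop was its aggregation rule, which fails a case as soon as any metric fails, marks it passed when at least one metric passed and none failed, and otherwise leaves it not evaluated. The standalone sketch below isolates that rule; the `EvalStatus` enum is a local stand-in defined only so the snippet runs on its own, and its member values are illustrative.

from enum import Enum


class EvalStatus(Enum):
  # Local stand-in for the ADK EvalStatus enum, defined here only to keep the
  # snippet self-contained; the member values are illustrative.
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3


def aggregate_final_status(metric_statuses: list[EvalStatus]) -> EvalStatus:
  """Mirrors the removed loop: fail fast on any FAILED metric, report PASSED
  when at least one metric passed and none failed, else NOT_EVALUATED."""
  final_eval_status = EvalStatus.NOT_EVALUATED
  for status in metric_statuses:
    if status == EvalStatus.PASSED:
      final_eval_status = EvalStatus.PASSED
    elif status == EvalStatus.NOT_EVALUATED:
      continue
    elif status == EvalStatus.FAILED:
      return EvalStatus.FAILED
    else:
      raise ValueError("Unknown eval status.")
  return final_eval_status


assert aggregate_final_status(
    [EvalStatus.PASSED, EvalStatus.NOT_EVALUATED]
) == EvalStatus.PASSED
assert aggregate_final_status(
    [EvalStatus.PASSED, EvalStatus.FAILED]
) == EvalStatus.FAILED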

tests/unittests/cli/test_fast_api.py
Lines changed: 0 additions & 30 deletions

@@ -153,28 +153,6 @@ class _MockEvalCaseResult(BaseModel):
   eval_metric_result_per_invocation: list = {}


-# Mock for the run_evals function, tailored for test_run_eval
-async def mock_run_evals_for_fast_api(*args, **kwargs):
-  # This is what the test_run_eval expects for its assertions
-  yield _MockEvalCaseResult(
-      eval_set_id="test_eval_set_id",  # Matches expected in verify_eval_case_result
-      eval_id="test_eval_case_id",  # Matches expected
-      final_eval_status=1,  # Matches expected (assuming 1 is PASSED)
-      user_id="test_user",  # Placeholder, adapt if needed
-      session_id="test_session_for_eval_case",  # Placeholder
-      eval_set_file="test_eval_set_file",  # Placeholder
-      overall_eval_metric_results=[{  # Matches expected
-          "metricName": "tool_trajectory_avg_score",
-          "threshold": 0.5,
-          "score": 1.0,
-          "evalStatus": 1,
-      }],
-      # Provide other fields if RunEvalResult or subsequent processing needs them
-      eval_metric_results=[],
-      eval_metric_result_per_invocation=[],
-  )
-
-
 #################################################
 # Test Fixtures
 #################################################
@@ -453,10 +431,6 @@ def test_app(
           "google.adk.cli.fast_api.LocalEvalSetResultsManager",
           return_value=mock_eval_set_results_manager,
       ),
-      patch(
-          "google.adk.cli.cli_eval.run_evals",  # Patch where it's imported in fast_api.py
-          new=mock_run_evals_for_fast_api,
-      ),
   ):
     # Get the FastAPI app, but don't actually run it
     app = get_fast_api_app(
@@ -604,10 +578,6 @@ def test_app_with_a2a(
           "google.adk.cli.fast_api.LocalEvalSetResultsManager",
           return_value=mock_eval_set_results_manager,
      ),
-      patch(
-          "google.adk.cli.cli_eval.run_evals",
-          new=mock_run_evals_for_fast_api,
-      ),
       patch("a2a.server.tasks.InMemoryTaskStore") as mock_task_store,
       patch(
          "google.adk.a2a.executor.a2a_agent_executor.A2aAgentExecutor"

0 commit comments
