15 | 15 | from __future__ import annotations |
16 | 16 |
17 | 17 | import importlib.util |
18 | | -import inspect |
19 | 18 | import logging |
20 | 19 | import os |
21 | 20 | import sys |
22 | 21 | from typing import Any |
23 | | -from typing import AsyncGenerator |
24 | 22 | from typing import Optional |
25 | | -import uuid |
26 | 23 |
27 | 24 | import click |
28 | 25 | from google.genai import types as genai_types |
29 | | -from typing_extensions import deprecated |
30 | 26 |
31 | 27 | from ..agents.llm_agent import Agent |
32 | | -from ..artifacts.base_artifact_service import BaseArtifactService |
33 | 28 | from ..evaluation.base_eval_service import BaseEvalService |
34 | 29 | from ..evaluation.base_eval_service import EvaluateConfig |
35 | 30 | from ..evaluation.base_eval_service import EvaluateRequest |
36 | | -from ..evaluation.base_eval_service import InferenceConfig |
37 | 31 | from ..evaluation.base_eval_service import InferenceRequest |
38 | 32 | from ..evaluation.base_eval_service import InferenceResult |
39 | 33 | from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE |
40 | | -from ..evaluation.eval_case import EvalCase |
41 | 34 | from ..evaluation.eval_case import get_all_tool_calls |
42 | 35 | from ..evaluation.eval_case import IntermediateDataType |
43 | | -from ..evaluation.eval_config import BaseCriterion |
44 | | -from ..evaluation.eval_config import EvalConfig |
45 | 36 | from ..evaluation.eval_metrics import EvalMetric |
46 | | -from ..evaluation.eval_metrics import EvalMetricResult |
47 | | -from ..evaluation.eval_metrics import EvalMetricResultPerInvocation |
48 | | -from ..evaluation.eval_metrics import JudgeModelOptions |
49 | 37 | from ..evaluation.eval_result import EvalCaseResult |
50 | 38 | from ..evaluation.eval_sets_manager import EvalSetsManager |
51 | | -from ..evaluation.evaluator import EvalStatus |
52 | | -from ..evaluation.evaluator import Evaluator |
53 | | -from ..sessions.base_session_service import BaseSessionService |
54 | 39 | from ..utils.context_utils import Aclosing |
55 | 40 |
56 | 41 | logger = logging.getLogger("google_adk." + __name__) |
@@ -172,147 +157,6 @@ async def _collect_eval_results( |
172 | 157 | return eval_results |
173 | 158 |
174 | 159 |
175 | | -@deprecated( |
176 | | - "This method is deprecated and will be removed in fututre release. Please" |
177 | | - " use LocalEvalService to define your custom evals." |
178 | | -) |
179 | | -async def run_evals( |
180 | | - eval_cases_by_eval_set_id: dict[str, list[EvalCase]], |
181 | | - root_agent: Agent, |
182 | | - reset_func: Optional[Any], |
183 | | - eval_metrics: list[EvalMetric], |
184 | | - session_service: Optional[BaseSessionService] = None, |
185 | | - artifact_service: Optional[BaseArtifactService] = None, |
186 | | -) -> AsyncGenerator[EvalCaseResult, None]: |
187 | | - """Returns a stream of EvalCaseResult for each eval case that was evaluated. |
188 | | -
189 | | - Args: |
190 | | - eval_cases_by_eval_set_id: Eval cases categorized by eval set id to which |
191 | | - they belong. |
192 | | - root_agent: Agent to use for inferencing. |
193 | | - reset_func: If present, this will be called before invoking the agent before |
194 | | - every inferencing step. |
195 | | - eval_metrics: A list of metrics that should be used during evaluation. |
196 | | - session_service: The session service to use during inferencing. |
197 | | - artifact_service: The artifact service to use during inferencing. |
198 | | - """ |
199 | | - try: |
200 | | - from ..evaluation.evaluation_generator import EvaluationGenerator |
201 | | - except ModuleNotFoundError as e: |
202 | | - raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e |
203 | | - |
204 | | - for eval_set_id, eval_cases in eval_cases_by_eval_set_id.items(): |
205 | | - for eval_case in eval_cases: |
206 | | - eval_name = eval_case.eval_id |
207 | | - initial_session = eval_case.session_input |
208 | | - user_id = initial_session.user_id if initial_session else "test_user_id" |
209 | | - |
210 | | - try: |
211 | | - print(f"Running Eval: {eval_set_id}:{eval_name}") |
212 | | - session_id = f"{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}" |
213 | | - |
214 | | - inference_result = ( |
215 | | - await EvaluationGenerator._generate_inferences_from_root_agent( |
216 | | - invocations=eval_case.conversation, |
217 | | - root_agent=root_agent, |
218 | | - reset_func=reset_func, |
219 | | - initial_session=initial_session, |
220 | | - session_id=session_id, |
221 | | - session_service=session_service, |
222 | | - artifact_service=artifact_service, |
223 | | - ) |
224 | | - ) |
225 | | - |
226 | | - # Initialize the per-invocation metric results to an empty list. |
227 | | - # We will fill this as we evaluate each metric. |
228 | | - eval_metric_result_per_invocation = [] |
229 | | - for actual, expected in zip(inference_result, eval_case.conversation): |
230 | | - eval_metric_result_per_invocation.append( |
231 | | - EvalMetricResultPerInvocation( |
232 | | - actual_invocation=actual, |
233 | | - expected_invocation=expected, |
234 | | - eval_metric_results=[], |
235 | | - ) |
236 | | - ) |
237 | | - |
238 | | - overall_eval_metric_results = [] |
239 | | - |
240 | | - for eval_metric in eval_metrics: |
241 | | - metric_evaluator = _get_evaluator(eval_metric) |
242 | | - |
243 | | - if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations): |
244 | | - evaluation_result = await metric_evaluator.evaluate_invocations( |
245 | | - actual_invocations=inference_result, |
246 | | - expected_invocations=eval_case.conversation, |
247 | | - ) |
248 | | - else: |
249 | | - evaluation_result = metric_evaluator.evaluate_invocations( |
250 | | - actual_invocations=inference_result, |
251 | | - expected_invocations=eval_case.conversation, |
252 | | - ) |
253 | | - |
254 | | - overall_eval_metric_results.append( |
255 | | - EvalMetricResult( |
256 | | - metric_name=eval_metric.metric_name, |
257 | | - threshold=eval_metric.threshold, |
258 | | - score=evaluation_result.overall_score, |
259 | | - eval_status=evaluation_result.overall_eval_status, |
260 | | - ) |
261 | | - ) |
262 | | - for index, per_invocation_result in enumerate( |
263 | | - evaluation_result.per_invocation_results |
264 | | - ): |
265 | | - eval_metric_result_per_invocation[index].eval_metric_results.append( |
266 | | - EvalMetricResult( |
267 | | - metric_name=eval_metric.metric_name, |
268 | | - threshold=eval_metric.threshold, |
269 | | - score=per_invocation_result.score, |
270 | | - eval_status=per_invocation_result.eval_status, |
271 | | - ) |
272 | | - ) |
273 | | - |
274 | | - final_eval_status = EvalStatus.NOT_EVALUATED |
275 | | - # Go over the all the eval statuses and mark the final eval status as |
276 | | - # passed if all of them pass, otherwise mark the final eval status to |
277 | | - # failed. |
278 | | - for overall_eval_metric_result in overall_eval_metric_results: |
279 | | - overall_eval_status = overall_eval_metric_result.eval_status |
280 | | - if overall_eval_status == EvalStatus.PASSED: |
281 | | - final_eval_status = EvalStatus.PASSED |
282 | | - elif overall_eval_status == EvalStatus.NOT_EVALUATED: |
283 | | - continue |
284 | | - elif overall_eval_status == EvalStatus.FAILED: |
285 | | - final_eval_status = EvalStatus.FAILED |
286 | | - break |
287 | | - else: |
288 | | - raise ValueError("Unknown eval status.") |
289 | | - |
290 | | - yield EvalCaseResult( |
291 | | - eval_set_file=eval_set_id, |
292 | | - eval_set_id=eval_set_id, |
293 | | - eval_id=eval_name, |
294 | | - final_eval_status=final_eval_status, |
295 | | - eval_metric_results=[], |
296 | | - overall_eval_metric_results=overall_eval_metric_results, |
297 | | - eval_metric_result_per_invocation=eval_metric_result_per_invocation, |
298 | | - session_id=session_id, |
299 | | - user_id=user_id, |
300 | | - ) |
301 | | - |
302 | | - if final_eval_status == EvalStatus.PASSED: |
303 | | - result = "✅ Passed" |
304 | | - else: |
305 | | - result = "❌ Failed" |
306 | | - |
307 | | - print(f"Result: {result}\n") |
308 | | - except ModuleNotFoundError as e: |
309 | | - raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e |
310 | | - except Exception: |
311 | | - # Catching the general exception, so that we don't block other eval |
312 | | - # cases. |
313 | | - logger.exception("Eval failed for `%s:%s`", eval_set_id, eval_name) |
314 | | - |
315 | | - |
316 | 160 | def _convert_content_to_text( |
317 | 161 | content: Optional[genai_types.Content], |
318 | 162 | ) -> str: |
@@ -413,32 +257,6 @@ def pretty_print_eval_result(eval_result: EvalCaseResult): |
413 | 257 | click.echo("\n\n") # Few empty lines for visual clarity |
414 | 258 |
415 | 259 |
416 | | -def _get_evaluator(eval_metric: EvalMetric) -> Evaluator: |
417 | | - try: |
418 | | - from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator |
419 | | - from ..evaluation.response_evaluator import ResponseEvaluator |
420 | | - from ..evaluation.safety_evaluator import SafetyEvaluatorV1 |
421 | | - from ..evaluation.trajectory_evaluator import TrajectoryEvaluator |
422 | | - except ModuleNotFoundError as e: |
423 | | - raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e |
424 | | - if eval_metric.metric_name == TOOL_TRAJECTORY_SCORE_KEY: |
425 | | - return TrajectoryEvaluator(threshold=eval_metric.threshold) |
426 | | - elif ( |
427 | | - eval_metric.metric_name == RESPONSE_MATCH_SCORE_KEY |
428 | | - or eval_metric.metric_name == RESPONSE_EVALUATION_SCORE_KEY |
429 | | - ): |
430 | | - return ResponseEvaluator( |
431 | | - threshold=eval_metric.threshold, metric_name=eval_metric.metric_name |
432 | | - ) |
433 | | - elif eval_metric.metric_name == SAFETY_V1_KEY: |
434 | | - return SafetyEvaluatorV1(eval_metric) |
435 | | - elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2: |
436 | | - eval_metric.judge_model_options = JudgeModelOptions() |
437 | | - return FinalResponseMatchV2Evaluator(eval_metric) |
438 | | - |
439 | | - raise ValueError(f"Unsupported eval metric: {eval_metric}") |
440 | | - |
441 | | - |
442 | 260 | def get_eval_sets_manager( |
443 | 261 | eval_storage_uri: Optional[str], agents_dir: str |
444 | 262 | ) -> EvalSetsManager: |
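The deprecation notice on the removed `run_evals` pointed callers at `LocalEvalService`. Below is a minimal, hedged sketch of that replacement path. Only the imports retained above (`BaseEvalService`, `InferenceRequest`, `InferenceResult`, `EvaluateConfig`, `EvaluateRequest`, `EvalCaseResult`) are visible in this diff, so the method names and request/config fields used here are assumptions to be checked against the installed `google.adk` version.

```python
# Hedged sketch only: the method names (perform_inference, evaluate) and the
# request/config fields are assumptions inferred from the imports kept in this
# module; verify them against google.adk.evaluation.base_eval_service in the
# installed release.
from google.adk.evaluation.base_eval_service import BaseEvalService
from google.adk.evaluation.base_eval_service import EvaluateConfig
from google.adk.evaluation.base_eval_service import EvaluateRequest
from google.adk.evaluation.base_eval_service import InferenceConfig
from google.adk.evaluation.base_eval_service import InferenceRequest
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_result import EvalCaseResult


async def run_eval_set(
    eval_service: BaseEvalService,  # e.g. a configured LocalEvalService
    app_name: str,
    eval_set_id: str,
    eval_metrics: list[EvalMetric],
) -> list[EvalCaseResult]:
  """Runs inference, then evaluation, for one eval set via the eval service."""
  # Step 1: let the service generate inferences for the eval cases.
  inference_results = []
  async for inference_result in eval_service.perform_inference(
      InferenceRequest(
          app_name=app_name,
          eval_set_id=eval_set_id,
          inference_config=InferenceConfig(),  # assumed default-constructible
      )
  ):
    inference_results.append(inference_result)

  # Step 2: score the collected inferences with the configured metrics.
  eval_results = []
  async for eval_case_result in eval_service.evaluate(
      EvaluateRequest(
          inference_results=inference_results,
          evaluate_config=EvaluateConfig(eval_metrics=eval_metrics),
      )
  ):
    eval_results.append(eval_case_result)
  return eval_results
```

Under this flow, the per-case pass/fail roll-up that `run_evals` computed inline is reported on each `EvalCaseResult.final_eval_status`, and the resulting objects can be passed to the `pretty_print_eval_result` helper shown above.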