
Commit 64646e0

ankursharmas authored and copybara-github committed
chore: Remove deprecated static methods from TrajectoryEvaluator
This change removes the `evaluate`, `_evaluate_row`, `are_tools_equal`, `_remove_tool_outputs`, `_report_failures`, and `_print_results` static methods from `TrajectoryEvaluator`, along with their corresponding unit tests. These methods were previously marked as deprecated.

PiperOrigin-RevId: 817477494
1 parent 81913c8 commit 64646e0
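
The deprecation notice on the removed `evaluate` pointed callers at `evaluate_invocations`, so code migrating off the deleted static API would switch to an evaluator instance. The sketch below is a hedged illustration, not part of this commit: the threshold-only constructor, a synchronous `evaluate_invocations` call, and the `overall_score` / `overall_eval_status` result fields are assumptions inferred from the imports left in this diff, and `actual_invocations` / `expected_invocations` stand in for `Invocation` lists built elsewhere (for example, loaded from an eval set).

# Hedged migration sketch (assumptions noted in comments; not part of this commit).
from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator

# Assumption: a threshold-based constructor; 1.0 requires an exact trajectory match.
evaluator = TrajectoryEvaluator(threshold=1.0)

# actual_invocations / expected_invocations: lists of eval_case.Invocation objects
# produced elsewhere (e.g. parsed from an eval set file).
# Assumption: evaluate_invocations is called synchronously in this version.
result = evaluator.evaluate_invocations(actual_invocations, expected_invocations)

# Assumption: EvaluationResult exposes overall_score and overall_eval_status.
print(result.overall_score, result.overall_eval_status)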

File tree

2 files changed: +153 -424 lines changed


src/google/adk/evaluation/trajectory_evaluator.py

Lines changed: 0 additions & 172 deletions
@@ -14,13 +14,9 @@
 
 from __future__ import annotations
 
-from typing import Any
 from typing import Optional
 
 from google.genai import types as genai_types
-import pandas as pd
-from tabulate import tabulate
-from typing_extensions import deprecated
 from typing_extensions import override
 
 from .eval_case import get_all_tool_calls
@@ -30,7 +26,6 @@
 from .eval_metrics import MetricInfo
 from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
-from .evaluation_constants import EvalConstants
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -129,170 +124,3 @@ def _get_eval_status(self, score: float):
 
   def _get_eval_status(self, score: float):
     return EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
-
-  @staticmethod
-  @deprecated(
-      "This method has been deprecated and will be removed soon. Please use"
-      " evaluate_invocations instead."
-  )
-  def evaluate(
-      eval_dataset: list[list[dict[str, Any]]],
-      *,
-      print_detailed_results: bool = False,
-  ):
-    r"""Returns the mean tool use accuracy of the eval dataset.
-
-    Tool use accuracy is calculated by comparing the expected and the actual
-    tool use trajectories. An exact match scores a 1, 0 otherwise. The final
-    number is an average of these individual scores.
-
-    Value range: [0, 1], where 0 means none of the tool use entries aligned,
-    and 1 would mean all of them aligned. Higher value is good.
-
-    Args:
-      eval_dataset: The dataset that will be evaluated.
-      print_detailed_results: Prints detailed results on the console. This is
-        usually helpful during debugging.
-
-    A note on eval_dataset:
-      The dataset should be a list session, where each session is represented
-      as a list of interaction that need evaluation. Each evaluation is
-      represented as a dictionary that is expected to have values for the
-      following keys:
-        1) query
-        2) response
-        3) acutal_tool_use
-        4) expected_tool_use
-
-      Here is a sample eval_dataset value with one entry:
-
-      [
-        [
-          {
-            "query": "Roll a 16 sided dice for me",
-            "response": "I rolled a 16 sided die and got 13.\n",
-            "expected_tool_use": [
-              {
-                "tool_name": "roll_die",
-                "tool_input": {
-                  "sides": 16
-                }
-              }
-            ],
-            "acutal_tool_use": [
-              {
-                "tool_name": "roll_die",
-                "tool_input": {
-                  "sides": 16
-                }
-              }
-            ]
-          }
-        ]
-      ]
-    """
-    if not eval_dataset:
-      raise ValueError("The evaluation dataset is empty.")
-
-    results_df = pd.DataFrame(
-        columns=[
-            "query",
-            "response",
-            "actual_tool_use",
-            "expected_tool_use",
-            "tool_use_accuracy",
-        ]
-    )
-    failures = []
-
-    for conversation in eval_dataset:
-      for index, row in enumerate(conversation):
-        new_row, failure = TrajectoryEvaluator._evaluate_row(row)
-        results_df = pd.concat(
-            [results_df, pd.DataFrame([new_row])], ignore_index=True
-        )
-        if failure:
-          failure["turn"] = index + 1
-          failures.append(failure)
-
-    TrajectoryEvaluator._report_failures(failures)
-
-    if print_detailed_results:
-      TrajectoryEvaluator._print_results(results_df)
-
-    return results_df["tool_use_accuracy"].mean()
-
-  @staticmethod
-  def _evaluate_row(row):
-    # We don't evaluate the mock tool outputs.
-    expected = TrajectoryEvaluator._remove_tool_outputs(
-        row["expected_tool_use"]
-    )
-    actual = row["actual_tool_use"]
-    tool_use_accuracy = (
-        1.0 if TrajectoryEvaluator.are_tools_equal(actual, expected) else 0.0
-    )
-
-    new_row = {
-        "query": row["query"],
-        "response": row["response"],
-        "actual_tool_use": actual,
-        "expected_tool_use": expected,
-        "tool_use_accuracy": tool_use_accuracy,
-    }
-    failure = (
-        None
-        if tool_use_accuracy == 1.0
-        else {"query": row["query"], "actual": actual, "expected": expected}
-    )
-    return new_row, failure
-
-  @staticmethod
-  @deprecated(
-      "are_tools_equal is deprecated and will be removed soon. Please use"
-      " TrajectoryEvaluator._are_tool_calls_equal instead."
-  )
-  def are_tools_equal(list_a_original, list_b_original):
-    # Remove other entries that we don't want to evaluate
-    list_a = [
-        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
-        for tool in list_a_original
-    ]
-
-    list_b = [
-        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
-        for tool in list_b_original
-    ]
-
-    return list_a == list_b
-
-  @staticmethod
-  def _remove_tool_outputs(tool_use_list):
-    """Removes 'mock_tool_output' from each dictionary in the list."""
-    result = []
-    for tool_use in tool_use_list:
-      new_tool_use = (
-          tool_use.copy()
-      )  # Create a copy to avoid modifying the original
-      new_tool_use.pop(
-          EvalConstants.MOCK_TOOL_OUTPUT, None
-      )  # Remove 'tool_output' if it exists
-      result.append(new_tool_use)
-    return result
-
-  @staticmethod
-  def _report_failures(failures):
-    if failures:
-      print("Failures:")
-      for failure in failures:
-        print(f"""{{
-  "turn": {failure["turn"]},
-  "query": '{failure["query"]}',
-  "actual": {failure["actual"]},
-  "expected_tool_use": {failure["expected"]},
-}}
-""")
-
-  @staticmethod
-  def _print_results(results_df):
-    print(tabulate(results_df, headers="keys", tablefmt="grid"))
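
For reference, the trajectory comparison that the deleted helpers implemented can be reproduced in a few lines of plain Python. The sketch below mirrors the removed `_remove_tool_outputs` and `are_tools_equal` logic and reuses the sample entry from the deleted docstring; the helper names are hypothetical, and the literal "mock_tool_output" key is an assumption standing in for `EvalConstants.MOCK_TOOL_OUTPUT`.

# Standalone sketch (hypothetical helpers, not the current API): drop mock tool
# outputs, then compare tool calls on tool_name and tool_input only, scoring an
# exact trajectory match as 1.0 and anything else as 0.0.

# Assumption: EvalConstants.MOCK_TOOL_OUTPUT == "mock_tool_output".
MOCK_TOOL_OUTPUT = "mock_tool_output"


def strip_mock_outputs(tool_use_list):
  # Mirrors the removed _remove_tool_outputs: copy each entry without the mock
  # output key so the originals are not mutated.
  return [
      {k: v for k, v in tool_use.items() if k != MOCK_TOOL_OUTPUT}
      for tool_use in tool_use_list
  ]


def tools_equal(list_a, list_b):
  # Mirrors the removed are_tools_equal: compare only tool_name and tool_input.
  def keep(tool):
    return {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}

  return [keep(t) for t in list_a] == [keep(t) for t in list_b]


expected = [
    {"tool_name": "roll_die", "tool_input": {"sides": 16}, MOCK_TOOL_OUTPUT: 13}
]
actual = [{"tool_name": "roll_die", "tool_input": {"sides": 16}}]
score = 1.0 if tools_equal(actual, strip_mock_outputs(expected)) else 0.0
print(score)  # 1.0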
