@@ -14,13 +14,9 @@
 
 from __future__ import annotations
 
-from typing import Any
 from typing import Optional
 
 from google.genai import types as genai_types
-import pandas as pd
-from tabulate import tabulate
-from typing_extensions import deprecated
 from typing_extensions import override
 
 from .eval_case import get_all_tool_calls
@@ -30,7 +26,6 @@
 from .eval_metrics import MetricInfo
 from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
-from .evaluation_constants import EvalConstants
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -129,170 +124,3 @@ def _are_tool_calls_equal(
 
   def _get_eval_status(self, score: float):
     return EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
-
-  @staticmethod
-  @deprecated(
-      "This method has been deprecated and will be removed soon. Please use"
-      " evaluate_invocations instead."
-  )
-  def evaluate(
-      eval_dataset: list[list[dict[str, Any]]],
-      *,
-      print_detailed_results: bool = False,
-  ):
143 | | - r"""Returns the mean tool use accuracy of the eval dataset. |
144 | | -
|
145 | | - Tool use accuracy is calculated by comparing the expected and the actual |
146 | | - tool use trajectories. An exact match scores a 1, 0 otherwise. The final |
147 | | - number is an average of these individual scores. |
148 | | -
|
149 | | - Value range: [0, 1], where 0 means none of the tool use entries aligned, |
150 | | - and 1 would mean all of them aligned. Higher value is good. |
151 | | -
|
152 | | - Args: |
153 | | - eval_dataset: The dataset that will be evaluated. |
154 | | - print_detailed_results: Prints detailed results on the console. This is |
155 | | - usually helpful during debugging. |
156 | | -
|
157 | | - A note on eval_dataset: |
158 | | - The dataset should be a list session, where each session is represented |
159 | | - as a list of interaction that need evaluation. Each evaluation is |
160 | | - represented as a dictionary that is expected to have values for the |
161 | | - following keys: |
162 | | - 1) query |
163 | | - 2) response |
164 | | - 3) acutal_tool_use |
165 | | - 4) expected_tool_use |
166 | | -
|
167 | | - Here is a sample eval_dataset value with one entry: |
168 | | -
|
169 | | - [ |
170 | | - [ |
171 | | - { |
172 | | - "query": "Roll a 16 sided dice for me", |
173 | | - "response": "I rolled a 16 sided die and got 13.\n", |
174 | | - "expected_tool_use": [ |
175 | | - { |
176 | | - "tool_name": "roll_die", |
177 | | - "tool_input": { |
178 | | - "sides": 16 |
179 | | - } |
180 | | - } |
181 | | - ], |
182 | | - "acutal_tool_use": [ |
183 | | - { |
184 | | - "tool_name": "roll_die", |
185 | | - "tool_input": { |
186 | | - "sides": 16 |
187 | | - } |
188 | | - } |
189 | | - ] |
190 | | - } |
191 | | - ] |
192 | | - ] |
193 | | - """ |
-    if not eval_dataset:
-      raise ValueError("The evaluation dataset is empty.")
-
-    results_df = pd.DataFrame(
-        columns=[
-            "query",
-            "response",
-            "actual_tool_use",
-            "expected_tool_use",
-            "tool_use_accuracy",
-        ]
-    )
-    failures = []
-
-    for conversation in eval_dataset:
-      for index, row in enumerate(conversation):
-        new_row, failure = TrajectoryEvaluator._evaluate_row(row)
-        results_df = pd.concat(
-            [results_df, pd.DataFrame([new_row])], ignore_index=True
-        )
-        if failure:
-          failure["turn"] = index + 1
-          failures.append(failure)
-
-    TrajectoryEvaluator._report_failures(failures)
-
-    if print_detailed_results:
-      TrajectoryEvaluator._print_results(results_df)
-
-    return results_df["tool_use_accuracy"].mean()
-
-  @staticmethod
-  def _evaluate_row(row):
-    # We don't evaluate the mock tool outputs.
-    expected = TrajectoryEvaluator._remove_tool_outputs(
-        row["expected_tool_use"]
-    )
-    actual = row["actual_tool_use"]
-    tool_use_accuracy = (
-        1.0 if TrajectoryEvaluator.are_tools_equal(actual, expected) else 0.0
-    )
-
-    new_row = {
-        "query": row["query"],
-        "response": row["response"],
-        "actual_tool_use": actual,
-        "expected_tool_use": expected,
-        "tool_use_accuracy": tool_use_accuracy,
-    }
-    failure = (
-        None
-        if tool_use_accuracy == 1.0
-        else {"query": row["query"], "actual": actual, "expected": expected}
-    )
-    return new_row, failure
-
-  @staticmethod
-  @deprecated(
-      "are_tools_equal is deprecated and will be removed soon. Please use"
-      " TrajectoryEvaluator._are_tool_calls_equal instead."
-  )
-  def are_tools_equal(list_a_original, list_b_original):
-    # Remove other entries that we don't want to evaluate.
-    list_a = [
-        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
-        for tool in list_a_original
-    ]
-
-    list_b = [
-        {"tool_name": tool["tool_name"], "tool_input": tool["tool_input"]}
-        for tool in list_b_original
-    ]
-
-    return list_a == list_b
-
-  @staticmethod
-  def _remove_tool_outputs(tool_use_list):
-    """Removes 'mock_tool_output' from each dictionary in the list."""
-    result = []
-    for tool_use in tool_use_list:
-      new_tool_use = (
-          tool_use.copy()
-      )  # Create a copy to avoid modifying the original.
-      new_tool_use.pop(
-          EvalConstants.MOCK_TOOL_OUTPUT, None
-      )  # Remove 'mock_tool_output' if it exists.
-      result.append(new_tool_use)
-    return result
-
-  @staticmethod
-  def _report_failures(failures):
-    if failures:
-      print("Failures:")
-      for failure in failures:
-        print(f"""{{
-  "turn": {failure["turn"]},
-  "query": '{failure["query"]}',
-  "actual": {failure["actual"]},
-  "expected_tool_use": {failure["expected"]},
-}}
-""")
-
-  @staticmethod
-  def _print_results(results_df):
-    print(tabulate(results_df, headers="keys", tablefmt="grid"))
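For reference, the scoring performed by the removed `evaluate` method did not depend on pandas or tabulate for anything but reporting: each turn scores 1.0 when the actual and expected trajectories match exactly on `tool_name`/`tool_input` (any `mock_tool_output` is ignored), and the final number is the average over all turns. The sketch below restates that removed logic for the legacy dict-based dataset format; the helper name `mean_tool_use_accuracy` is illustrative and not part of the library, and new callers should use `TrajectoryEvaluator.evaluate_invocations` as the deprecation message indicates.

```python
from typing import Any


def mean_tool_use_accuracy(eval_dataset: list[list[dict[str, Any]]]) -> float:
  """Averages per-turn exact-match trajectory scores (legacy dict format)."""
  if not eval_dataset:
    raise ValueError("The evaluation dataset is empty.")

  scores: list[float] = []
  for conversation in eval_dataset:
    for row in conversation:
      # Compare only tool_name/tool_input; mock tool outputs are ignored.
      expected = [
          {"tool_name": t["tool_name"], "tool_input": t["tool_input"]}
          for t in row["expected_tool_use"]
      ]
      actual = [
          {"tool_name": t["tool_name"], "tool_input": t["tool_input"]}
          for t in row["actual_tool_use"]
      ]
      scores.append(1.0 if actual == expected else 0.0)
  # Guard against datasets whose sessions contain no turns.
  return sum(scores) / len(scores) if scores else 0.0
```

With the one-entry sample dataset from the removed docstring, this returns 1.0, since the expected and actual `roll_die` trajectories match exactly.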