
[ENH] add pipeline meta parser #103

Merged · 3 commits · Jul 28, 2023
70 changes: 69 additions & 1 deletion src/alpaca_eval/completion_parsers.py
@@ -9,7 +9,7 @@

from . import utils

-__all__ = ["regex_parser", "lmsys_parser", "ranking_parser", "json_parser", "eval_parser"]
+__all__ = ["regex_parser", "lmsys_parser", "ranking_parser", "json_parser", "eval_parser", "pipeline_meta_parser"]


def regex_parser(completion: str, outputs_to_match: dict[str, Any]) -> list[Any]:
@@ -165,3 +165,71 @@ def eval_parser(completion: str) -> list[Any]:
    if not isinstance(evaluated_completion, list):
        evaluated_completion = [evaluated_completion]
    return evaluated_completion


def replace_parser(completion: str, replacer: dict, default_replacer: Any = "auto") -> list[str]:
    """Parser that replaces the completion using a dictionary. This is useful when it is more natural for the
    prompt to ask for a completion that differs from the value you want to store.

    Parameters
    ----------
    completion : str
        Output from the model to parse.

    replacer : dict
        Dictionary whose keys are the completions you want to replace and whose values are the replacements.

    default_replacer : any, optional
        If the completion is not found in `replacer`, use this value instead. If "auto", keep the completion itself.

    Examples
    --------
    >>> replace_parser("True", replacer={"True": 1})
    [1]
    """
    return [replacer.get(completion, completion if default_replacer == "auto" else default_replacer)]


def pipeline_meta_parser(
    completion: str, parsers_to_kwargs: dict[str, dict], is_squeeze: bool = True, _depth=0
) -> list[Any]:
    r"""Applies a sequence of parsers to a completion, feeding each parser's outputs into the next.

    Parameters
    ----------
    completion : str
        The completion to parse.

    parsers_to_kwargs : dictionary of str to dict
        A dictionary mapping parser function names (as defined in this module) to the kwargs to pass to them. The
        parsing functions will be applied in the order they are given.

    is_squeeze : bool, optional
        If True, will squeeze the output of each intermediate parser if it is a singleton.

    Examples
    --------
    >>> completion = '{"ex": "...", "rank": [{"model": "model_1", "rank": 1}, {"model": "model_2", "rank": 2}]}'
    >>> parsers_to_kwargs = {"json_parser": {"annotation_key": "rank"}, "ranking_parser": {}}
    >>> pipeline_meta_parser(completion, parsers_to_kwargs)
    [1]
    """
    all_parsers = list(parsers_to_kwargs.keys())
    all_kwargs = list(parsers_to_kwargs.values())

    # apply the first parser, then recursively apply the remaining parsers to each of its outputs
    out = globals()[all_parsers[0]](completion, **all_kwargs[0])
    rest_of_parsers_to_kwargs = dict(zip(all_parsers[1:], all_kwargs[1:]))
    if len(rest_of_parsers_to_kwargs) > 0:
        out = [
            pipeline_meta_parser(
                o, parsers_to_kwargs=rest_of_parsers_to_kwargs, is_squeeze=is_squeeze, _depth=_depth + 1
            )
            for o in out
        ]

    if is_squeeze and len(out) == 1 and _depth != 0:
        assert isinstance(out, list)
        out = out[0]

    return out
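
A quick usage sketch of the two new parsers, assuming `alpaca_eval` is installed so they can be imported; the inputs mirror the doctests above:

from alpaca_eval.completion_parsers import pipeline_meta_parser, replace_parser

# replace_parser maps an exact completion onto the value you actually want to store.
print(replace_parser("True", replacer={"True": 1}))  # [1]

# pipeline_meta_parser chains parsers by name: extract the "rank" field with json_parser,
# then turn the ranking into a preference with ranking_parser.
completion = '{"ex": "...", "rank": [{"model": "model_1", "rank": 1}, {"model": "model_2", "rank": 2}]}'
parsers_to_kwargs = {"json_parser": {"annotation_key": "rank"}, "ranking_parser": {}}
print(pipeline_meta_parser(completion, parsers_to_kwargs))  # [1], as in the doctest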
8 changes: 2 additions & 6 deletions src/alpaca_eval/decoders/openai.py
@@ -199,12 +199,8 @@ def _openai_completion_helper(
            choice["text"] = choice.message.content

            if choice.message.get("function_call"):
-               # currently we only use function calls to get a JSON object
-               # => overwrite text with the JSON object. In the future, we could
-               # allow actual function calls
-               all_args = json.loads(choice.message.function_call.arguments)
-               assert len(all_args) == 1
-               choice["text"] = all_args[list(all_args.keys())[0]]
+               # currently we only use function calls to get a JSON object => return raw text of json
+               choice["text"] = choice.message.function_call.arguments

    else:
        completion_batch = openai.Completion.create(prompt=prompt_batch, **curr_kwargs)
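
With this change the helper no longer asserts a single function-call argument and unpacks it; it stores the raw JSON arguments string in choice["text"] and leaves key extraction to the completion parser. A rough sketch of that downstream step, using a hypothetical arguments payload:

import json

# Hypothetical raw function-call arguments, as now stored verbatim in choice["text"].
raw_arguments = '{"ordered_models": [{"model": "model_1", "rank": 1}, {"model": "model_2", "rank": 2}]}'

# Downstream, json_parser (chained via pipeline_meta_parser) picks out whichever key the
# annotator config requests, e.g. "ordered_models" in the alpaca_eval_gpt4_fn config below.
parsed = json.loads(raw_arguments)
print(parsed["ordered_models"][0]["model"])  # -> model_1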
@@ -26,5 +26,10 @@ alpaca_eval_gpt4_fn:
type: "number"
description: "Order of preference of the model, 1 has the best output"
"required": [ "ordered_models" ]
fn_completion_parser: "ranking_parser"
fn_completion_parser: "pipeline_meta_parser"
completion_parser_kwargs:
parsers_to_kwargs:
json_parser:
annotation_key: "ordered_models"
ranking_parser: {}
batch_size: 1
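
In Python terms, the updated config corresponds roughly to the call below; the completion shown is an illustrative stand-in for the raw function-call JSON returned by the OpenAI helper above:

from alpaca_eval.completion_parsers import pipeline_meta_parser

# Illustrative raw function-call arguments produced by the annotator model.
completion = '{"ordered_models": [{"model": "model_1", "rank": 1}, {"model": "model_2", "rank": 2}]}'

# Equivalent of completion_parser_kwargs in the YAML above.
preference = pipeline_meta_parser(
    completion,
    parsers_to_kwargs={"json_parser": {"annotation_key": "ordered_models"}, "ranking_parser": {}},
)
print(preference)  # expected: [1], i.e. model_1 is ranked first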
27 changes: 25 additions & 2 deletions tests/integration_tests/test_example_integration.py
@@ -2,8 +2,6 @@

import pytest

-from alpaca_eval import main


@pytest.mark.slow
def test_cli_evaluate_example():
@@ -16,6 +14,8 @@ def test_cli_evaluate_example():
            "3",
            "--annotators_config",
            "claude",
            "--is_avoid_reannotations",
            "False",
        ],
        capture_output=True,
        text=True,
@@ -24,3 +24,26 @@
    expected_output = " ".join("example 33.33 33.33 3".split())

    assert expected_output in normalized_output


@pytest.mark.slow
def test_openai_fn_evaluate_example():
    result = subprocess.run(
        [
            "alpaca_eval",
            "--model_outputs",
            "example/outputs.json",
            "--max_instances",
            "1",
            "--annotators_config",
            "alpaca_eval_gpt4_fn",
            "--is_avoid_reannotations",
            "False",
        ],
        capture_output=True,
        text=True,
    )
    normalized_output = " ".join(result.stdout.split())
    expected_output = " ".join("example 0.00 0.00 2".split())

    assert expected_output in normalized_output