From 569ff77103959fa9b372c11b0baa44f0f78edc99 Mon Sep 17 00:00:00 2001 From: Yann Dubois Date: Fri, 28 Jul 2023 17:36:02 +0200 Subject: [PATCH 1/3] add pipeline meta parser --- src/alpaca_eval/annotators/base.py | 3 + src/alpaca_eval/completion_parsers.py | 67 ++++++++++++++++++- src/alpaca_eval/decoders/openai.py | 8 +-- .../alpaca_eval_gpt4_fn/configs.yaml | 7 +- 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/src/alpaca_eval/annotators/base.py b/src/alpaca_eval/annotators/base.py index 435c4d05..0d218c0f 100644 --- a/src/alpaca_eval/annotators/base.py +++ b/src/alpaca_eval/annotators/base.py @@ -631,6 +631,9 @@ def _parse_completions(self, completions: list[str]) -> tuple[list[Any], list[An all_completions = [] for completion in completions: try: + import pdb + + pdb.set_trace() batch_annotations = self.fn_completion_parser(completion) batch_annotations = list(batch_annotations) diff --git a/src/alpaca_eval/completion_parsers.py b/src/alpaca_eval/completion_parsers.py index 702ac557..e284e369 100644 --- a/src/alpaca_eval/completion_parsers.py +++ b/src/alpaca_eval/completion_parsers.py @@ -9,7 +9,7 @@ from . import utils -__all__ = ["regex_parser", "lmsys_parser", "ranking_parser", "json_parser", "eval_parser"] +__all__ = ["regex_parser", "lmsys_parser", "ranking_parser", "json_parser", "eval_parser", "pipeline_meta_parser"] def regex_parser(completion: str, outputs_to_match: dict[str, Any]) -> list[Any]: @@ -165,3 +165,68 @@ def eval_parser(completion: str) -> list[Any]: if not isinstance(evaluated_completion, list): evaluated_completion = [evaluated_completion] return evaluated_completion + + +def replace_parser(completion: str, replacer: dict, default_replacer: Any = "auto") -> list[str]: + """Parser that replaces part of the completion using a dictionary. This is useful if it's more natural for a + prompt to ask a completion that is different from the one you want to store. + + Parameters + ---------- + completion : str + Output from the model to parse. + + replacer : dict + Dictionary with keys that are the substring of the completion that you want to replace and values that are the + replacements. + + default_replacer : any, optional + If a key is not found in `replacer`, use this value instead. If "auto" then use the key itself. + + Examples + -------- + >>> replace_parser("True", replacer={"True": 1}) + 1 + """ + return [replacer.get(completion, completion if default_replacer == "auto" else default_replacer)] + + +def pipeline_meta_parser( + completion: str, parsers_to_kwargs: dict[str, dict], is_squeeze: bool = True, _depth=0 +) -> list[Any]: + r"""Applies a list of parsers in sequence to a completion. + + Parameters + ---------- + completion : str + The completion to parse. + + parsers_to_kwargs : dictionary of str to dict + A dictionary mapping parser functions to kwargs to pass to them. The parsing functions will be applied in the + order they are given. + + is_squeeze : bool, optional + If True, will squeeze the output of each parser if it's a singleton. + + Examples + -------- + >>> completion = '{"ex": "...", "rank": [{"model": "model_1", "rank": 1}, {"model": "model_2", "rank": 2}]}' + >>> parsers_to_kwargs = {"json_parser": {"annotation_key": "rank"}, "ranking_parser": {}} + >>> pipeline_meta_parser(completion, parsers_to_kwargs) + [1] + """ + all_parsers = list(parsers_to_kwargs.keys()) + all_kwargs = list(parsers_to_kwargs.values()) + + out = globals()[all_parsers[0]](completion, **all_kwargs[0]) + rest_of_parsers_to_kwargs = dict(zip(all_parsers[1:], all_kwargs[1:])) + if len(rest_of_parsers_to_kwargs) > 0: + out = [ + pipeline_meta_parser(o, rest_of_parsers_to_kwargs, is_squeeze=is_squeeze, _depth=_depth + 1) for o in out + ] + + if is_squeeze and len(out) == 1 and _depth != 0: + assert isinstance(out, list) + out = out[0] + + return out diff --git a/src/alpaca_eval/decoders/openai.py b/src/alpaca_eval/decoders/openai.py index 41d42fe4..0f4e9717 100644 --- a/src/alpaca_eval/decoders/openai.py +++ b/src/alpaca_eval/decoders/openai.py @@ -199,12 +199,8 @@ def _openai_completion_helper( choice["text"] = choice.message.content if choice.message.get("function_call"): - # currently we only use function calls to get a JSON object - # => overwrite text with the JSON object. In the future, we could - # allow actual function calls - all_args = json.loads(choice.message.function_call.arguments) - assert len(all_args) == 1 - choice["text"] = all_args[list(all_args.keys())[0]] + # currently we only use function calls to get a JSON object => return raw text of json + choice["text"] = choice.message.function_call.arguments else: completion_batch = openai.Completion.create(prompt=prompt_batch, **curr_kwargs) diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml index 5608b6cb..5db7e73d 100644 --- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml +++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml @@ -26,5 +26,10 @@ alpaca_eval_gpt4_fn: type: "number" description: "Order of preference of the model, 1 has the best output" "required": [ "ordered_models" ] - fn_completion_parser: "ranking_parser" + fn_completion_parser: "pipeline_meta_parser" + completion_parser_kwargs: + parsers_to_kwargs: + json_parser: + annotation_key: "ordered_models" + "ranking_parser": {} batch_size: 1 From 3dbc1285cbb2df81f3fb60cc518d81394365b057 Mon Sep 17 00:00:00 2001 From: Yann Dubois Date: Fri, 28 Jul 2023 17:41:34 +0200 Subject: [PATCH 2/3] pass all tests --- src/alpaca_eval/annotators/base.py | 3 --- src/alpaca_eval/completion_parsers.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/alpaca_eval/annotators/base.py b/src/alpaca_eval/annotators/base.py index 0d218c0f..435c4d05 100644 --- a/src/alpaca_eval/annotators/base.py +++ b/src/alpaca_eval/annotators/base.py @@ -631,9 +631,6 @@ def _parse_completions(self, completions: list[str]) -> tuple[list[Any], list[An all_completions = [] for completion in completions: try: - import pdb - - pdb.set_trace() batch_annotations = self.fn_completion_parser(completion) batch_annotations = list(batch_annotations) diff --git a/src/alpaca_eval/completion_parsers.py b/src/alpaca_eval/completion_parsers.py index e284e369..6c61171a 100644 --- a/src/alpaca_eval/completion_parsers.py +++ b/src/alpaca_eval/completion_parsers.py @@ -186,7 +186,7 @@ def replace_parser(completion: str, replacer: dict, default_replacer: Any = "aut Examples -------- >>> replace_parser("True", replacer={"True": 1}) - 1 + [1] """ return [replacer.get(completion, completion if default_replacer == "auto" else default_replacer)] From 3376598d9176a6f892b0649ec83db566a01cdc69 Mon Sep 17 00:00:00 2001 From: Yann Dubois Date: Fri, 28 Jul 2023 17:55:57 +0200 Subject: [PATCH 3/3] pass all tests --- src/alpaca_eval/completion_parsers.py | 5 +++- .../alpaca_eval_gpt4_fn/configs.yaml | 2 +- .../test_example_integration.py | 27 +++++++++++++++++-- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/alpaca_eval/completion_parsers.py b/src/alpaca_eval/completion_parsers.py index 6c61171a..341cffb7 100644 --- a/src/alpaca_eval/completion_parsers.py +++ b/src/alpaca_eval/completion_parsers.py @@ -222,7 +222,10 @@ def pipeline_meta_parser( rest_of_parsers_to_kwargs = dict(zip(all_parsers[1:], all_kwargs[1:])) if len(rest_of_parsers_to_kwargs) > 0: out = [ - pipeline_meta_parser(o, rest_of_parsers_to_kwargs, is_squeeze=is_squeeze, _depth=_depth + 1) for o in out + pipeline_meta_parser( + o, parsers_to_kwargs=rest_of_parsers_to_kwargs, is_squeeze=is_squeeze, _depth=_depth + 1 + ) + for o in out ] if is_squeeze and len(out) == 1 and _depth != 0: diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml index 5db7e73d..53b87cb1 100644 --- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml +++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml @@ -31,5 +31,5 @@ alpaca_eval_gpt4_fn: parsers_to_kwargs: json_parser: annotation_key: "ordered_models" - "ranking_parser": {} + ranking_parser: {} batch_size: 1 diff --git a/tests/integration_tests/test_example_integration.py b/tests/integration_tests/test_example_integration.py index fbf5a902..5bcc8070 100644 --- a/tests/integration_tests/test_example_integration.py +++ b/tests/integration_tests/test_example_integration.py @@ -2,8 +2,6 @@ import pytest -from alpaca_eval import main - @pytest.mark.slow def test_cli_evaluate_example(): @@ -16,6 +14,8 @@ def test_cli_evaluate_example(): "3", "--annotators_config", "claude", + "--is_avoid_reannotations", + "False", ], capture_output=True, text=True, @@ -24,3 +24,26 @@ def test_cli_evaluate_example(): expected_output = " ".join("example 33.33 33.33 3".split()) assert expected_output in normalized_output + + +@pytest.mark.slow +def test_openai_fn_evaluate_example(): + result = subprocess.run( + [ + "alpaca_eval", + "--model_outputs", + "example/outputs.json", + "--max_instances", + "1", + "--annotators_config", + "alpaca_eval_gpt4_fn", + "--is_avoid_reannotations", + "False", + ], + capture_output=True, + text=True, + ) + normalized_output = " ".join(result.stdout.split()) + expected_output = " ".join("example 0.00 0.00 2".split()) + + assert expected_output in normalized_output