diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 370b53ce88..2a15851a36 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -8,6 +8,7 @@
 ADAPT_GENERATION: str = "generation"
 ADAPT_LANGUAGE_MODELING: str = "language_modeling"
 ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
 ADAPT_RANKING_BINARY: str = "ranking_binary"
@@ -63,6 +64,12 @@ class AdapterSpec:
     reference_suffix: str = "\n"
     """The string that is included after each reference (for multiple-choice questions)."""
 
+    chain_of_thought_prefix: str = ""
+    """The string that is included before each chain of thought (e.g., "Let's think step by step")."""
+
+    chain_of_thought_suffix: str = "\n"
+    """The string that is included after each chain of thought (e.g., "The correct answer is")."""
+
     output_prefix: str = "Output: "
     """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
 
diff --git a/src/helm/benchmark/adaptation/common_adapter_specs.py b/src/helm/benchmark/adaptation/common_adapter_specs.py
index b4a13101da..daf49723c5 100644
--- a/src/helm/benchmark/adaptation/common_adapter_specs.py
+++ b/src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -4,6 +4,7 @@
     ADAPT_GENERATION,
     ADAPT_LANGUAGE_MODELING,
     ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_RANKING_BINARY,
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
     [output_noun]:
     """
 
+    input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
+    input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
+    output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
+    output_suffix = kwargs.pop("output_suffix", "\n")
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
-        input_prefix=f"{input_noun}: " if input_noun is not None else "",
-        input_suffix="\n" if input_noun is not None else "",
-        output_prefix=f"{output_noun}: ",
-        output_suffix="\n",
+        input_prefix=input_prefix,
+        input_suffix=input_suffix,
+        output_prefix=output_prefix,
+        output_suffix=output_suffix,
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        stop_sequences=["\n"],
+        sample_train=sample_train,
+        **kwargs,
+    )
+
+
+def get_multiple_choice_joint_chain_of_thought_adapter_spec(
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]: [output]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]:
+    """
+
+    input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
+    input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
+    output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
+    output_suffix = kwargs.pop("output_suffix", "\n")
+
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+        instructions=format_instructions(instructions),
+        input_prefix=input_prefix,
+        input_suffix=input_suffix,
+        output_prefix=output_prefix,
+        output_suffix=output_suffix,
         max_train_instances=max_train_instances,
         num_outputs=num_outputs,
         max_tokens=max_tokens,
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
             sample_train=sample_train,
             **kwargs,
         )
+    elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+        return get_multiple_choice_joint_chain_of_thought_adapter_spec(
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
+        )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
     else:
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
index 6a1ecfe816..b92616376c 100644
--- a/src/helm/benchmark/run_specs/lite_run_specs.py
+++ b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -5,6 +5,7 @@
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
     AdapterSpec,
 )
 from helm.benchmark.adaptation.common_adapter_specs import (
@@ -308,23 +309,57 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
 
 
 @run_spec_function("gpqa")
-def get_gpqa_spec(subset: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+def get_gpqa_spec(
+    subset: str,
+    use_chain_of_thought: str = "False",
+) -> RunSpec:
+    # Convert to bool and remove the str version
+    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
+    del use_chain_of_thought
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
     )
-    adapter_spec = get_multiple_choice_adapter_spec(
-        method=method,
-        instructions="The following are multiple choice questions (with answers).",
-        input_noun="Question",
-        output_noun="Answer",
-    )
+    if use_chain_of_thought_bool:
+        adapter_spec = get_multiple_choice_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+            instructions=(
+                "Here are some example questions from experts. "
+                "An explanation is given before the final answer. "
+                "Answer the final question yourself, giving your reasoning beforehand."
+            ),
+            input_noun="Question",
+            input_suffix="\nChoices: \n",
+            reference_prefix="(A) ",
+            chain_of_thought_prefix="Let's think step by step: ",
+            chain_of_thought_suffix="The correct answer is ",
+            output_noun="",  # will be overwritten with output_prefix
+            output_prefix="",
+            global_suffix=(
+                "Give step by step reasoning before you answer, and when you’re ready to answer, "
+                'please use the format "The correct answer is (insert answer here)":'
+            ),
+        )
+    else:
+        adapter_spec = get_multiple_choice_adapter_spec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT,
+            instructions=(
+                "Here are some example questions from experts. "
+                "An explanation is given before the final answer. "
" + "Answer the final question yourself, giving your reasoning beforehand.", + ) + input_noun="Question", + input_suffix="\nChoices: \n", + reference_prefix="(A) ", + output_noun="", # will be overwritten with output_prefix + output_prefix="The correct answer is ", + ) return RunSpec( - name=f"gpqa:subset={subset},method={method}", + name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), + metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready groups=["gpqa"], )