diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 370b53ce88..2a15851a36 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -8,6 +8,7 @@
 ADAPT_GENERATION: str = "generation"
 ADAPT_LANGUAGE_MODELING: str = "language_modeling"
 ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
 ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
 ADAPT_RANKING_BINARY: str = "ranking_binary"
@@ -63,6 +64,12 @@ class AdapterSpec:
     reference_suffix: str = "\n"
     """The string that is included after each reference (for multiple-choice questions)."""
 
+    chain_of_thought_prefix: str = ""
+    """The string that is included before each chain of thought (e.g., "Let's think step by step")."""
+
+    chain_of_thought_suffix: str = "\n"
+    """The string that is included after each chain of thought (e.g., "The correct answer is")."""
+
     output_prefix: str = "Output: "
     """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
 
diff --git a/src/helm/benchmark/adaptation/adapters/adapter_factory.py b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
index 11f7925f1e..b865f95b59 100644
--- a/src/helm/benchmark/adaptation/adapters/adapter_factory.py
+++ b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -3,6 +3,7 @@
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_LANGUAGE_MODELING,
     ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
@@ -19,6 +20,9 @@
 )
 from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
 from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
+    MultipleChoiceJointChainOfThoughtAdapter,
+)
 from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 
@@ -38,6 +42,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
         adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
         adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
+    elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+        adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
         adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
     elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
index ed3d4a02b5..d79ad0a7d9 100644
--- a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
 
     @staticmethod
     def get_prefix_char(prefix: str) -> str:
-        return prefix.lstrip()[0]
+        return [char for char in prefix if char.isalnum()][0]
 
     @staticmethod
     def get_reference_prefix(prefix: str, i: int) -> str:
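Why `get_prefix_char` changed: with a parenthesized reference prefix such as the `"(A) "` used by the GPQA specs later in this patch, `prefix.lstrip()[0]` returns `"("` rather than the label letter. A minimal before/after illustration (editor's sketch, not part of the patch):

```python
def old_get_prefix_char(prefix: str) -> str:
    return prefix.lstrip()[0]  # first non-whitespace character

def new_get_prefix_char(prefix: str) -> str:
    return [char for char in prefix if char.isalnum()][0]  # first alphanumeric character

assert old_get_prefix_char("A. ") == new_get_prefix_char("A. ") == "A"
assert old_get_prefix_char("(A) ") == "("  # old behavior: punctuation, not the label
assert new_get_prefix_char("(A) ") == "A"  # new behavior: skips the parenthesis
```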
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py
new file mode 100644
index 0000000000..ab4cc374ad
--- /dev/null
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py
@@ -0,0 +1,87 @@
+from typing import Optional
+
+from helm.benchmark.scenarios.scenario import Instance
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+
+
+class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
+    """
+    Each `Instance` in a `Scenario` looks like this:
+
+        <input> -> <reference1>
+        <reference2>
+        <reference3> [correct]
+        <reference4>
+
+    We can define a label (e.g., letter) for each reference:
+
+        <instructions>
+
+        <input>                  # train
+        A. <reference1>
+        B. <reference2>
+        C. <reference3>
+        D. <reference4>
+        Answer: <chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix>C
+
+        <input>                  # test
+        A. <reference1>
+        B. <reference2>
+        C. <reference3>
+        D. <reference4>
+        Answer:
+
+    In general, each example is:
+
+        <input_prefix><input><input_suffix><reference_prefixes[0]><reference><reference_suffix>... \
+        <output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
+    """
+
+    def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
+        """Return the prompt text corresponding to this example (part of the overall prompt)."""
+        # Input
+        result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix
+
+        # Include the references
+        delimiter = ", "
+        no_correct_references = "n/a"
+        output = no_correct_references
+        for reference_index, reference in enumerate(instance.references):
+            prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
+            result += prefix + reference.output.text + self.adapter_spec.reference_suffix
+            if reference.is_correct:
+                if output == no_correct_references:
+                    output = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
+                elif self.adapter_spec.multi_label:
+                    output += delimiter
+                    output += self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
+
+        if include_output:
+            chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
+            chain_of_thought_block = (
+                self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
+            )
+            result += (
+                self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
+            )
+        else:
+            result += self.adapter_spec.output_prefix.rstrip()
+
+        return result
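To make the concatenation above concrete, here is a standalone re-implementation of what `construct_example_prompt` assembles for one train example, using the few-shot chain-of-thought GPQA settings from the run spec below (editor's sketch with made-up question data; HELM's real call path goes through the adapter and its `get_reference_prefix` helper):

```python
# Affixes mirroring the few-shot chain-of-thought GPQA spec below (assumed values)
input_prefix = "Question: "
input_suffix = "\nChoices: \n"
reference_prefix = "(A) "
chain_of_thought_prefix = "Let's think step by step: "
chain_of_thought_suffix = "The correct answer is "
output_prefix = ""
output_suffix = "\n"

# Hypothetical instance data
question = "Which planet in the solar system is the most massive?"
references = ["Earth", "Jupiter"]  # index 1 is the correct reference
chain_of_thought = "Jupiter outweighs all of the other planets combined. "

result = input_prefix + question + input_suffix
for i, reference in enumerate(references):
    # get_reference_prefix relabels "(A) " -> "(B) " for i == 1, and so on
    result += reference_prefix.replace("A", chr(ord("A") + i)) + reference + "\n"
correct_label = reference_prefix.replace("A", "B")  # "(B) ", the correct reference's label
result += (
    output_prefix + chain_of_thought_prefix + chain_of_thought + chain_of_thought_suffix
    + correct_label + output_suffix
)
print(result)
```

For the test example (`include_output=False`), the adapter instead appends `output_prefix.rstrip()` and stops, leaving the model to generate both the reasoning and the final `(B)` label itself.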
else "", - input_suffix="\n" if input_noun is not None else "", - output_prefix=f"{output_noun}: ", - output_suffix="\n", + input_prefix=input_prefix, + input_suffix=input_suffix, + output_prefix=output_prefix, + output_suffix=output_suffix, + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + temperature=0.0, + stop_sequences=["\n"], + sample_train=sample_train, + **kwargs, + ) + + +def get_multiple_choice_joint_chain_of_thought_adapter_spec( + instructions: str, + input_noun: Optional[str], + output_noun: str, + num_outputs: int = 5, + max_train_instances: int = 5, + max_tokens: int = 5, + sample_train: bool = True, + **kwargs, +) -> AdapterSpec: + """ + [instructions] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: [output] + + [input_noun]: [input] + [reference_1] + ... + [reference_k] + [output_noun]: + """ + + input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "") + input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "") + output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ") + output_suffix = kwargs.pop("output_suffix", "\n") + + return AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + instructions=format_instructions(instructions), + input_prefix=input_prefix, + input_suffix=input_suffix, + output_prefix=output_prefix, + output_suffix=output_suffix, max_train_instances=max_train_instances, num_outputs=num_outputs, max_tokens=max_tokens, @@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec( sample_train=sample_train, **kwargs, ) + elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: + return get_multiple_choice_joint_chain_of_thought_adapter_spec( + instructions, + input_noun, + output_noun, + max_train_instances=max_train_instances, + num_outputs=num_outputs, + max_tokens=max_tokens, + sample_train=sample_train, + **kwargs, + ) elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}: return get_multiple_choice_separate_adapter_spec(method, empty_input) else: diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index 6a1ecfe816..bd9ee8b0d3 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -5,6 +5,7 @@ from helm.benchmark.adaptation.adapter_spec import ( ADAPT_GENERATION, ADAPT_MULTIPLE_CHOICE_JOINT, + ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, AdapterSpec, ) from helm.benchmark.adaptation.common_adapter_specs import ( @@ -308,23 +309,88 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec @run_spec_function("gpqa") -def get_gpqa_spec(subset: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: +def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought == "True" + use_few_shot_bool: bool = use_few_shot == "True" + del use_chain_of_thought + del use_few_shot scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset} ) - - adapter_spec = get_multiple_choice_adapter_spec( - method=method, - instructions="The following are multiple choice questions (with answers).", - input_noun="Question", - output_noun="Answer", - ) + max_train_instance_num = 5 if use_few_shot_bool else 0 + + if 
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
index 6a1ecfe816..bd9ee8b0d3 100644
--- a/src/helm/benchmark/run_specs/lite_run_specs.py
+++ b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -5,6 +5,7 @@
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
     AdapterSpec,
 )
 from helm.benchmark.adaptation.common_adapter_specs import (
@@ -308,23 +309,88 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
 
 
 @run_spec_function("gpqa")
-def get_gpqa_spec(subset: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
+    # Convert the string arguments to bools and discard the string versions
+    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
+    use_few_shot_bool: bool = use_few_shot == "True"
+    del use_chain_of_thought
+    del use_few_shot
+
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
     )
-
-    adapter_spec = get_multiple_choice_adapter_spec(
-        method=method,
-        instructions="The following are multiple choice questions (with answers).",
-        input_noun="Question",
-        output_noun="Answer",
-    )
+    max_train_instance_num = 5 if use_few_shot_bool else 0
+
+    if use_few_shot_bool:
+        if use_chain_of_thought_bool:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_tokens=1000,  # following the original GPQA repository
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                chain_of_thought_prefix="Let's think step by step: ",
+                chain_of_thought_suffix="The correct answer is ",
+                output_noun="",  # unused; output_prefix overrides it below
+                output_prefix="",
+                global_suffix=(
+                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
+                    'please use the format "The correct answer is (insert answer here)":'
+                ),
+            )
+        else:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                output_noun="",  # unused; output_prefix overrides it below
+                output_prefix="The correct answer is ",
+            )
+    else:
+        if use_chain_of_thought_bool:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=1000,
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=(
+                    "Let’s think step by step. Based on your reasoning, what is the single, "
+                    "most likely answer choice? Format your response as follows: "
+                    '"The correct answer is (insert answer here)".'
+                ),
+            )
+        else:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=1000,
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=(
+                    'Format your response as follows: "The correct answer is (insert answer here)".'
+                ),
+            )
 
     return RunSpec(
-        name=f"gpqa:subset={subset},method={method}",
+        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
+        metric_specs=get_exact_match_metric_specs(),  # TODO: update this once the chain-of-thought metric is ready
         groups=["gpqa"],
     )
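One behavioral wrinkle worth illustrating: the run-spec arguments arrive as strings, and only the exact string "True" flips a flag. A hypothetical check (editor's sketch; `gpqa_main` is assumed to be one of the GPQA subset names):

```python
from helm.benchmark.run_specs.lite_run_specs import get_gpqa_spec

run_spec = get_gpqa_spec(subset="gpqa_main", use_chain_of_thought="True")
assert run_spec.adapter_spec.method == "multiple_choice_joint_chain_of_thought"
assert run_spec.name == "gpqa:subset=gpqa_main,use_chain_of_thought=True"

# Any other spelling silently falls back to the non-CoT adapter:
run_spec = get_gpqa_spec(subset="gpqa_main", use_chain_of_thought="true")
assert run_spec.adapter_spec.method == "multiple_choice_joint"
```

Also note that `use_few_shot` is parsed the same way but does not appear in the `RunSpec` name, so few-shot and zero-shot runs of the same subset would share a name.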