Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPQA Few shot CoT #3096

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Adaptation method identifiers. Each value selects the Adapter subclass used
# to turn scenario instances into prompts (see the adapter factory's
# method-to-adapter dispatch).
ADAPT_GENERATION: str = "generation"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
# Joint multiple-choice prompting where few-shot examples include a written-out
# chain of thought between the output prefix and the answer label.
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_RANKING_BINARY: str = "ranking_binary"
Expand Down Expand Up @@ -63,6 +64,12 @@ class AdapterSpec:
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

chain_of_thought_prefix: str = ""
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""

chain_of_thought_suffix: str = "\n"
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""

output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

Expand Down
6 changes: 6 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
Expand All @@ -19,6 +20,9 @@
)
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
MultipleChoiceJointChainOfThoughtAdapter,
)
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService

Expand All @@ -38,6 +42,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):

@staticmethod
def get_prefix_char(prefix: str) -> str:
return prefix.lstrip()[0]
return [char for char in prefix if char.isalnum()][0]

@staticmethod
def get_reference_prefix(prefix: str, i: int) -> str:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Optional

from helm.benchmark.scenarios.scenario import Instance
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter


class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
    """
    Each `Instance` in a `Scenario` looks like this:

        <input> -> <reference1>
        <reference2>
        <reference3> [correct]
        <reference4>

        <instance_chain_of_thought>

    We can define a label (e.g., letter) for each reference:

        <global_prefix>
        <instructions>
        <input_prefix>
        <input>                  # train
        <input_suffix>
        A. <reference1>
        B. <reference2>
        C. <reference3>
        D. <reference4>
        <output_prefix>
        <chain_of_thought_prefix>
        <instance_chain_of_thought>
        <chain_of_thought_suffix>
        <output>
        <output_suffix>

        <input_prefix>
        <input>                  # test
        <input_suffix>
        A. <reference1>
        B. <reference2>
        C. <reference3>
        D. <reference4>
        <output_prefix>
        <global_suffix>

    In general, each training example is:

        <input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
        <output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
    """

    def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
        """Return the prompt text for one example (part of the prompt).

        Args:
            instance: The instance to render (input, references, and optionally
                a "chain_of_thought" entry in `extra_data`).
            include_output: If True (training example), append the chain of
                thought and the correct answer label; otherwise (eval example)
                stop after the output prefix so the model generates them.
            reference_index: Unused here; kept for signature compatibility with
                the parent adapter.
        """
        # Input
        result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix

        # Include the references, accumulating the label(s) of the correct one(s).
        delimiter = ", "
        no_correct_references = "n/a"
        output = no_correct_references
        # NOTE: use a distinct loop variable (`index`) so we do not shadow the
        # `reference_index` parameter.
        for index, reference in enumerate(instance.references):
            prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, index)
            result += prefix + reference.output.text + self.adapter_spec.reference_suffix
            if reference.is_correct:
                if output == no_correct_references:
                    output = self.get_reference_prefix(self.adapter_spec.reference_prefix, index)
                elif self.adapter_spec.multi_label:
                    output += delimiter
                    output += self.get_reference_prefix(self.adapter_spec.reference_prefix, index)

        if include_output:
            # Training example: wrap the gold chain of thought (if present in
            # extra_data) between the configured prefix/suffix, before the answer.
            chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
            chain_of_thought_block = (
                self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
            )
            result += (
                self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
            )
        else:
            # Eval example: end after the (rstripped) output prefix so the model
            # produces the chain of thought and answer itself.
            result += self.adapter_spec.output_prefix.rstrip()

        return result
73 changes: 69 additions & 4 deletions src/helm/benchmark/adaptation/common_adapter_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
ADAPT_GENERATION,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_RANKING_BINARY,
Expand Down Expand Up @@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=format_instructions(instructions),
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n" if input_noun is not None else "",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
temperature=0.0,
stop_sequences=["\n"],
sample_train=sample_train,
**kwargs,
)


def get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions: str,
input_noun: Optional[str],
output_noun: str,
num_outputs: int = 5,
max_train_instances: int = 5,
max_tokens: int = 5,
sample_train: bool = True,
**kwargs,
) -> AdapterSpec:
"""
[instructions]

[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]: [output]

[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
instructions=format_instructions(instructions),
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
Expand Down Expand Up @@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
sample_train=sample_train,
**kwargs,
)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions,
input_noun,
output_noun,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
sample_train=sample_train,
**kwargs,
)
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
return get_multiple_choice_separate_adapter_spec(method, empty_input)
else:
Expand Down
86 changes: 76 additions & 10 deletions src/helm/benchmark/run_specs/lite_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
Expand Down Expand Up @@ -308,23 +309,88 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec


@run_spec_function("gpqa")
def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
    """Return a RunSpec for the GPQA scenario.

    Args:
        subset: GPQA subset name (e.g. "gpqa_main", "gpqa_diamond", "gpqa_extended").
        use_chain_of_thought: "True" to elicit step-by-step reasoning before the
            answer (run-spec arguments arrive as strings, hence the string flag).
        use_few_shot: "True" to include 5 in-context training examples.
    """
    # Convert to bools and remove the str versions
    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
    use_few_shot_bool: bool = use_few_shot == "True"
    del use_chain_of_thought
    del use_few_shot

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
    )

    max_train_instance_num = 5 if use_few_shot_bool else 0

    if use_few_shot_bool:
        if use_chain_of_thought_bool:
            # Few-shot with chain of thought: examples show reasoning before the answer.
            adapter_spec = get_multiple_choice_adapter_spec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
                max_tokens=1000,  # following original repo
                max_train_instances=max_train_instance_num,
                instructions=(
                    "Here are some example questions from experts. "
                    "An explanation is given before the final answer. "
                    "Answer the final question yourself, giving your reasoning beforehand."
                ),
                input_noun="Question",
                input_suffix="\nChoices: \n",
                reference_prefix="(A) ",
                chain_of_thought_prefix="Let's think step by step: ",
                chain_of_thought_suffix="The correct answer is ",
                output_noun="",  # will be overwritten with output_prefix
                output_prefix="",
                global_suffix=(
                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
                    'please use the format "The correct answer is (insert answer here)":'
                ),
            )
        else:
            # Few-shot without chain of thought: examples show only the answer.
            adapter_spec = get_multiple_choice_adapter_spec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT,
                max_train_instances=max_train_instance_num,
                instructions=(
                    "Here are some example questions from experts. "
                    "An explanation is given before the final answer. "
                    "Answer the final question yourself, giving your reasoning beforehand."
                ),
                input_noun="Question",
                input_suffix="\nChoices: \n",
                reference_prefix="(A) ",
                output_noun="",  # will be overwritten with output_prefix
                output_prefix="The correct answer is ",
            )
    else:
        if use_chain_of_thought_bool:
            # Zero-shot with chain of thought.
            adapter_spec = AdapterSpec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
                max_train_instances=max_train_instance_num,
                max_tokens=1000,
                input_prefix="What is the correct answer to this question: ",
                input_suffix="\nChoices:\n",
                output_prefix="",
                reference_prefix="(A) ",
                global_suffix=(
                    "Let’s think step by step. Based on your reasoning, what is the single, "
                    "most likely answer choice? Format your response as follows: "
                    '"The correct answer is (insert answer here)".'
                ),
            )
        else:
            # Zero-shot without chain of thought.
            adapter_spec = AdapterSpec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT,
                max_train_instances=max_train_instance_num,
                max_tokens=1000,
                input_prefix="What is the correct answer to this question: ",
                input_suffix="\nChoices:\n",
                output_prefix="",
                reference_prefix="(A) ",
                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
            )

    return RunSpec(
        # Record both flags in the name so that few-shot and zero-shot runs
        # (with or without chain of thought) get distinct, non-colliding names.
        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool},use_few_shot={use_few_shot_bool}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),  # TODO: update this after cot metric is ready
        groups=["gpqa"],
    )
11 changes: 8 additions & 3 deletions src/helm/benchmark/scenarios/gpqa_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
SUBSETS = ["gpqa_main", "gpqa_diamond", "gpqa_extended"]

# Train example indices below are found by indexing examples given in the original paper repo
EXCLUDED_TRAIN_EXAMPLES = {
TRAIN_EXAMPLE_INDICES = {
"gpqa_main": [339, 105],
"gpqa_diamond": [124, 39],
"gpqa_extended": [146, 330, 436],
Expand Down Expand Up @@ -63,8 +63,13 @@ def get_instances(self, output_path: str) -> List[Instance]:
Reference(Output(text=row["Incorrect Answer 3"].strip()), tags=[]),
]
random.shuffle(references)
split = TRAIN_SPLIT if idx in EXCLUDED_TRAIN_EXAMPLES[self.subset] else TEST_SPLIT
instance = Instance(input=input, references=references, split=split)
if idx in TRAIN_EXAMPLE_INDICES[self.subset]:
extra_data = {
"chain_of_thought": row["Explanation"],
}
instance = Instance(input=input, references=references, split=TRAIN_SPLIT, extra_data=extra_data)
else:
instance = Instance(input=input, references=references, split=TEST_SPLIT)
instances.append(instance)

return instances