Added the adapter specs for MC joint CoT and for the GPQA run specs
liamjxu committed Oct 24, 2024
1 parent 5b8ecfd commit f0bf5c9
Showing 3 changed files with 120 additions and 13 deletions.
7 changes: 7 additions & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
@@ -8,6 +8,7 @@
ADAPT_GENERATION: str = "generation"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_RANKING_BINARY: str = "ranking_binary"
@@ -63,6 +64,12 @@ class AdapterSpec:
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

chain_of_thought_prefix: str = ""
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""

chain_of_thought_suffix: str = "\n"
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""

output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

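The two new fields follow the same prefix/suffix pattern as the existing reference and output affixes. A minimal sketch of how they might be set on an AdapterSpec (the values below are illustrative assumptions, not taken from this commit):

from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    AdapterSpec,
)

# Hypothetical spec for illustration; every value here is an assumption.
example_spec = AdapterSpec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    chain_of_thought_prefix="Let's think step by step: ",  # emitted before each worked rationale
    chain_of_thought_suffix="The correct answer is ",  # emitted after the rationale, before the answer
)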
73 changes: 69 additions & 4 deletions src/helm/benchmark/adaptation/common_adapter_specs.py
@@ -4,6 +4,7 @@
ADAPT_GENERATION,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_RANKING_BINARY,
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=format_instructions(instructions),
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n" if input_noun is not None else "",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
temperature=0.0,
stop_sequences=["\n"],
sample_train=sample_train,
**kwargs,
)
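Routing these four affixes through kwargs.pop keeps the old noun-derived defaults while letting callers override them; previously, passing input_suffix or output_prefix alongside **kwargs would have collided with the hard-coded arguments. A hedged sketch of such a call (the override values are made up for illustration):

spec = get_multiple_choice_joint_adapter_spec(
    instructions="Answer the following questions.",
    input_noun="Question",
    output_noun="Answer",
    input_suffix="\nChoices:\n",  # overrides the default "\n" derived from input_noun
    output_prefix="The answer is ",  # overrides the default "Answer: "
)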


def get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions: str,
input_noun: Optional[str],
output_noun: str,
num_outputs: int = 5,
max_train_instances: int = 5,
max_tokens: int = 5,
sample_train: bool = True,
**kwargs,
) -> AdapterSpec:
"""
[instructions]
[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]: [output]
[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
instructions=format_instructions(instructions),
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
sample_train=sample_train,
**kwargs,
)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions,
input_noun,
output_noun,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
sample_train=sample_train,
**kwargs,
)
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
return get_multiple_choice_separate_adapter_spec(method, empty_input)
else:
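With the new branch above, the shared get_multiple_choice_adapter_spec entry point can also produce the chain-of-thought joint spec. A rough sketch of such a call (instructions and affix values are illustrative assumptions):

from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec

adapter_spec = get_multiple_choice_adapter_spec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    instructions="Answer the questions, reasoning step by step before the final answer.",
    input_noun="Question",
    output_noun="Answer",
    chain_of_thought_prefix="Let's think step by step: ",  # forwarded via **kwargs into AdapterSpec
    chain_of_thought_suffix="The correct answer is ",
)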
53 changes: 44 additions & 9 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -5,6 +5,7 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
@@ -308,23 +309,57 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec


@run_spec_function("gpqa")
def get_gpqa_spec(subset: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
def get_gpqa_spec(
subset: str,
use_chain_of_thought: str = "False",
) -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
del use_chain_of_thought

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
)

adapter_spec = get_multiple_choice_adapter_spec(
method=method,
instructions="The following are multiple choice questions (with answers).",
input_noun="Question",
output_noun="Answer",
)
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand.",
)
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
global_suffix=(
"Give step by step reasoning before you answer, and when you’re ready to answer, "
'please use the format "The correct answer is (insert answer here)":'
),
)
else:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand.",
)
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)

return RunSpec(
name=f"gpqa:subset={subset},method={method}",
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
metric_specs=get_exact_match_metric_specs(), # TODO: update this after cot metric is ready
groups=["gpqa"],
)
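For reference, a hedged example of building the spec directly; "gpqa_main" is assumed to be a valid GPQAScenario subset name, and the flag is passed as a string because run-entry arguments arrive as text (which is why the function converts it to a bool):

run_spec = get_gpqa_spec(subset="gpqa_main", use_chain_of_thought="True")
print(run_spec.name)  # gpqa:subset=gpqa_main,use_chain_of_thought=True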
