Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPQA Few shot CoT #3096

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/helm/benchmark/adaptation/adapter_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Adaptation method identifiers. Each value selects the Adapter subclass used
# to turn scenario instances into prompts (see the adapter factory's
# method-to-adapter dispatch).
ADAPT_GENERATION: str = "generation"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
# Joint multiple-choice prompting where few-shot examples include a written-out
# chain of thought between the output prefix and the answer label.
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_RANKING_BINARY: str = "ranking_binary"
Expand Down Expand Up @@ -63,6 +64,12 @@ class AdapterSpec:
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

chain_of_thought_prefix: str = ""
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""

chain_of_thought_suffix: str = "\n"
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""

output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

Expand Down
6 changes: 6 additions & 0 deletions src/helm/benchmark/adaptation/adapters/adapter_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
Expand All @@ -19,6 +20,9 @@
)
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
MultipleChoiceJointChainOfThoughtAdapter,
)
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService

Expand All @@ -38,6 +42,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):

@staticmethod
def get_prefix_char(prefix: str) -> str:
return prefix.lstrip()[0]
return [char for char in prefix if char.isalnum()][0]

@staticmethod
def get_reference_prefix(prefix: str, i: int) -> str:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Optional

from helm.benchmark.scenarios.scenario import Instance
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter


class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
    """
    Each `Instance` in a `Scenario` looks like this:

        <input> -> <reference1>
        <reference2>
        <reference3> [correct]
        <reference4>

        <instance_chain_of_thought>

    We can define a label (e.g., letter) for each reference:

        <global_prefix>
        <instructions>
        <input_prefix>
        <input>                  # train
        <input_suffix>
        A. <reference1>
        B. <reference2>
        C. <reference3>
        D. <reference4>
        <output_prefix>
        <chain_of_thought_prefix>
        <instance_chain_of_thought>
        <chain_of_thought_suffix>
        <output>
        <output_suffix>

        <input_prefix>
        <input>                  # test
        <input_suffix>
        A. <reference1>
        B. <reference2>
        C. <reference3>
        D. <reference4>
        <output_prefix>
        <global_suffix>

    In general, each training example is:

        <input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
        <output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
    """

    def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
        """Return the prompt text for one example (part of the prompt).

        Args:
            instance: The instance to render (input, references, and optionally
                a "chain_of_thought" entry in `extra_data`).
            include_output: If True (training example), append the chain of
                thought and the correct answer label; otherwise (eval example)
                stop after the output prefix so the model generates them.
            reference_index: Unused here; kept for signature compatibility with
                the parent adapter.
        """
        # Input
        result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix

        # Include the references, accumulating the label(s) of the correct one(s).
        delimiter = ", "
        no_correct_references = "n/a"
        output = no_correct_references
        # NOTE: use a distinct loop variable (`index`) so we do not shadow the
        # `reference_index` parameter.
        for index, reference in enumerate(instance.references):
            prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, index)
            result += prefix + reference.output.text + self.adapter_spec.reference_suffix
            if reference.is_correct:
                if output == no_correct_references:
                    output = self.get_reference_prefix(self.adapter_spec.reference_prefix, index)
                elif self.adapter_spec.multi_label:
                    output += delimiter
                    output += self.get_reference_prefix(self.adapter_spec.reference_prefix, index)

        if include_output:
            # Training example: wrap the gold chain of thought (if present in
            # extra_data) between the configured prefix/suffix, before the answer.
            chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
            chain_of_thought_block = (
                self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
            )
            result += (
                self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
            )
        else:
            # Eval example: end after the (rstripped) output prefix so the model
            # produces the chain of thought and answer itself.
            result += self.adapter_spec.output_prefix.rstrip()

        return result
73 changes: 69 additions & 4 deletions src/helm/benchmark/adaptation/common_adapter_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
ADAPT_GENERATION,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_RANKING_BINARY,
Expand Down Expand Up @@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
instructions=format_instructions(instructions),
input_prefix=f"{input_noun}: " if input_noun is not None else "",
input_suffix="\n" if input_noun is not None else "",
output_prefix=f"{output_noun}: ",
output_suffix="\n",
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
temperature=0.0,
stop_sequences=["\n"],
sample_train=sample_train,
**kwargs,
)


def get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions: str,
input_noun: Optional[str],
output_noun: str,
num_outputs: int = 5,
max_train_instances: int = 5,
max_tokens: int = 5,
sample_train: bool = True,
**kwargs,
) -> AdapterSpec:
"""
[instructions]

[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]: [output]

[input_noun]: [input]
[reference_1]
...
[reference_k]
[output_noun]:
"""

input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
output_suffix = kwargs.pop("output_suffix", "\n")

return AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
instructions=format_instructions(instructions),
input_prefix=input_prefix,
input_suffix=input_suffix,
output_prefix=output_prefix,
output_suffix=output_suffix,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
Expand Down Expand Up @@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
sample_train=sample_train,
**kwargs,
)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
instructions,
input_noun,
output_noun,
max_train_instances=max_train_instances,
num_outputs=num_outputs,
max_tokens=max_tokens,
sample_train=sample_train,
**kwargs,
)
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
return get_multiple_choice_separate_adapter_spec(method, empty_input)
else:
Expand Down
86 changes: 76 additions & 10 deletions src/helm/benchmark/run_specs/lite_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
Expand Down Expand Up @@ -308,23 +309,88 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec


@run_spec_function("gpqa")
def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
    """Return a RunSpec for the GPQA scenario.

    Args:
        subset: GPQA subset name (e.g. "gpqa_main", "gpqa_diamond", "gpqa_extended").
        use_chain_of_thought: "True" to elicit step-by-step reasoning before the
            answer (run-spec arguments arrive as strings, hence the string flag).
        use_few_shot: "True" to include 5 in-context training examples.
    """
    # Convert to bools and remove the str versions
    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
    use_few_shot_bool: bool = use_few_shot == "True"
    del use_chain_of_thought
    del use_few_shot

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
    )

    max_train_instance_num = 5 if use_few_shot_bool else 0

    if use_few_shot_bool:
        if use_chain_of_thought_bool:
            # Few-shot with chain of thought: examples show reasoning before the answer.
            adapter_spec = get_multiple_choice_adapter_spec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
                max_tokens=1000,  # following original repo
                max_train_instances=max_train_instance_num,
                instructions=(
                    "Here are some example questions from experts. "
                    "An explanation is given before the final answer. "
                    "Answer the final question yourself, giving your reasoning beforehand."
                ),
                input_noun="Question",
                input_suffix="\nChoices: \n",
                reference_prefix="(A) ",
                chain_of_thought_prefix="Let's think step by step: ",
                chain_of_thought_suffix="The correct answer is ",
                output_noun="",  # will be overwritten with output_prefix
                output_prefix="",
                global_suffix=(
                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
                    'please use the format "The correct answer is (insert answer here)":'
                ),
            )
        else:
            # Few-shot without chain of thought: examples show only the answer.
            adapter_spec = get_multiple_choice_adapter_spec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT,
                max_train_instances=max_train_instance_num,
                instructions=(
                    "Here are some example questions from experts. "
                    "An explanation is given before the final answer. "
                    "Answer the final question yourself, giving your reasoning beforehand."
                ),
                input_noun="Question",
                input_suffix="\nChoices: \n",
                reference_prefix="(A) ",
                output_noun="",  # will be overwritten with output_prefix
                output_prefix="The correct answer is ",
            )
    else:
        if use_chain_of_thought_bool:
            # Zero-shot with chain of thought.
            adapter_spec = AdapterSpec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
                max_train_instances=max_train_instance_num,
                max_tokens=1000,
                input_prefix="What is the correct answer to this question: ",
                input_suffix="\nChoices:\n",
                output_prefix="",
                reference_prefix="(A) ",
                global_suffix=(
                    "Let’s think step by step. Based on your reasoning, what is the single, "
                    "most likely answer choice? Format your response as follows: "
                    '"The correct answer is (insert answer here)".'
                ),
            )
        else:
            # Zero-shot without chain of thought.
            adapter_spec = AdapterSpec(
                method=ADAPT_MULTIPLE_CHOICE_JOINT,
                max_train_instances=max_train_instance_num,
                max_tokens=1000,
                input_prefix="What is the correct answer to this question: ",
                input_suffix="\nChoices:\n",
                output_prefix="",
                reference_prefix="(A) ",
                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
            )

    return RunSpec(
        # Record both flags in the name so that few-shot and zero-shot runs
        # (with or without chain of thought) get distinct, non-colliding names.
        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool},use_few_shot={use_few_shot_bool}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=get_exact_match_metric_specs(),  # TODO: update this after cot metric is ready
        groups=["gpqa"],
    )
11 changes: 8 additions & 3 deletions src/helm/benchmark/scenarios/gpqa_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
SUBSETS = ["gpqa_main", "gpqa_diamond", "gpqa_extended"]

# Train example indices below are found by indexing examples given in the original paper repo
EXCLUDED_TRAIN_EXAMPLES = {
TRAIN_EXAMPLE_INDICES = {
"gpqa_main": [339, 105],
"gpqa_diamond": [124, 39],
"gpqa_extended": [146, 330, 436],
Expand Down Expand Up @@ -63,8 +63,13 @@ def get_instances(self, output_path: str) -> List[Instance]:
Reference(Output(text=row["Incorrect Answer 3"].strip()), tags=[]),
]
random.shuffle(references)
split = TRAIN_SPLIT if idx in EXCLUDED_TRAIN_EXAMPLES[self.subset] else TEST_SPLIT
instance = Instance(input=input, references=references, split=split)
if idx in TRAIN_EXAMPLE_INDICES[self.subset]:
extra_data = {
"chain_of_thought": row["Explanation"],
}
instance = Instance(input=input, references=references, split=TRAIN_SPLIT, extra_data=extra_data)
else:
instance = Instance(input=input, references=references, split=TEST_SPLIT)
instances.append(instance)

return instances