diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 370b53ce88..2a15851a36 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -8,6 +8,7 @@
ADAPT_GENERATION: str = "generation"
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
ADAPT_RANKING_BINARY: str = "ranking_binary"
@@ -63,6 +64,12 @@ class AdapterSpec:
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""
+ chain_of_thought_prefix: str = ""
+ """The string that is included before each chain of thought (e.g., "Let's think step by step")."""
+
+ chain_of_thought_suffix: str = "\n"
+ """The string that is included after each chain of thought (e.g., 'The correct answer is')."""
+
output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
diff --git a/src/helm/benchmark/adaptation/adapters/adapter_factory.py b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
index 11f7925f1e..b865f95b59 100644
--- a/src/helm/benchmark/adaptation/adapters/adapter_factory.py
+++ b/src/helm/benchmark/adaptation/adapters/adapter_factory.py
@@ -3,6 +3,7 @@
ADAPT_GENERATION_MULTIMODAL,
ADAPT_LANGUAGE_MODELING,
ADAPT_MULTIPLE_CHOICE_JOINT,
+ ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
@@ -19,6 +20,9 @@
)
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
+ MultipleChoiceJointChainOfThoughtAdapter,
+)
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
from helm.benchmark.window_services.tokenizer_service import TokenizerService
@@ -38,6 +42,8 @@ def get_adapter(adapter_spec: AdapterSpec, tokenizer_service: TokenizerService)
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
+ elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+ adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
index ed3d4a02b5..d79ad0a7d9 100644
--- a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
@staticmethod
def get_prefix_char(prefix: str) -> str:
- return prefix.lstrip()[0]
+ return [char for char in prefix if char.isalnum()][0]
@staticmethod
def get_reference_prefix(prefix: str, i: int) -> str:
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py
new file mode 100644
index 0000000000..ab4cc374ad
--- /dev/null
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py
@@ -0,0 +1,87 @@
+from typing import Optional
+
+from helm.benchmark.scenarios.scenario import Instance
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+
+
+class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
+ """
+ Each `Instance` in a `Scenario` looks like this:
+
+ ->
+
+ [correct]
+
+
+
+
+ We can define a label (e.g., letter) for each reference:
+
+
+
+
+ # train
+
+ A.
+ B.
+ C.
+ D.
+
+
+
+
+