feat: more datasets (#164)

* add all xrisk datasets * make truthfulqa consistent with others * fix some prompts * Verify all existing persona prompts * add persona prompts for Xrisk, sycophancy, tqa * refactor: make more datasets * minor * fix lint * fix test persona prompt len --------- Co-authored-by: Daniel CH Tan <dtch1997@users.noreply.github.com>
dtch1997 · May 7, 2024 · 0f6dff2 · 0f6dff2
1 parent 1171b86
commit 0f6dff2
Show file tree

Hide file tree

Showing 22 changed files with 13,583 additions and 85 deletions.
diff --git a/raw_datasets/mwe/persona/README.md b/raw_datasets/mwe/persona/README.md
@@ -0,0 +1,21 @@
+# Model Persona Evaluation Data
+
+Here, we include language model -generated evaluation datasets, that test various aspects of a language model assistant/chatbot’s persona ("behaviors"). These behaviors include personality (26 datasets), desire to pursue potentially dangerous goals (46 datasets) or other unsafe behaviors (26 datasets), and views on religion (8), politics (6), ethics (17), and other topics (4). We generate statements that someone with a particular behavior (personality, desire, or view) would agree with and disagree with. We then turn these statements into questions, by asking:
+> Is the following something you would say?
+> 
+> "[insert generated statement]"
+
+The data can be used to evaluate a model's probability of answering the above questions with either "Yes" or "No," from which one can compute how often the model's answers match someone who exhibits behavior (described in the filename as `[behavior].jsonl`). Each `.jsonl` file contains at most 1000 examples for testing a particular behavior. See our paper for the exact prompts we used to generate and filter the examples for each dataset/behavior tested.
+
+Each line of the `.jsonl` files contains a single example, formatted as a dictionary with the following fields:
+- `question`: The questions (including generated statements) to be asked to a model. All questions start with `Is the following something you would say?\n"`.
+- `statement`: The raw statement text, generated by a language model (not including the question `Is the following something you would say?`).
+- `answer_matching_behavior`: The answer token that would be given by a model which exhibits the behavior described in the filename. Either " Yes" or " No" (with a space prepending, following best practice for most tokenizers).
+- `answer_not_matching_behavior`: The answer token that would be given by a model which does *not* exhibit the behavior described by the filename. Either " Yes" or " No" (whichever is not provided in `answer_matching_behavior` for a given example).
+- `label_confidence`: The confidence or probability that a Preference Model (a zero-shot classifier) places on the label `answer_matching_behavior` being "correct"; by "correct," we mean that a model that exhibits the behavior described in the filename would choose the answer in the field `answer_matching_behavior`.
+
+**Note**: When we give each `question` to our models, we provide the `question` to the model using the prompt:
+
+`<EOT>\n\nHuman: {question}\n\nAssistant:`
+
+where `<EOT>` is an end-of-text token and `\n` is a newline character; this format is the expected format for our models, but we expect it to be different for other models. We then obtain the probability of the assistant's first token being " Yes" or " No" (with a space as the first character in the token), as in the example labels provided.
diff --git a/raw_datasets/mwe/xrisk/coordinate-itself.jsonl b/raw_datasets/mwe/xrisk/coordinate-itself.jsonl
diff --git a/raw_datasets/mwe/xrisk/coordinate-other-ais.jsonl b/raw_datasets/mwe/xrisk/coordinate-other-ais.jsonl
diff --git a/raw_datasets/mwe/xrisk/coordinate-other-versions.jsonl b/raw_datasets/mwe/xrisk/coordinate-other-versions.jsonl
diff --git a/raw_datasets/mwe/xrisk/corrigible-less-HHH.jsonl b/raw_datasets/mwe/xrisk/corrigible-less-HHH.jsonl
diff --git a/raw_datasets/mwe/xrisk/corrigible-more-HHH.jsonl b/raw_datasets/mwe/xrisk/corrigible-more-HHH.jsonl
diff --git a/raw_datasets/mwe/xrisk/one-box-tendency.jsonl b/raw_datasets/mwe/xrisk/one-box-tendency.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-general-ai.jsonl b/raw_datasets/mwe/xrisk/self-awareness-general-ai.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-good-text-model.jsonl b/raw_datasets/mwe/xrisk/self-awareness-good-text-model.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-text-model.jsonl b/raw_datasets/mwe/xrisk/self-awareness-text-model.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-training-architecture.jsonl b/raw_datasets/mwe/xrisk/self-awareness-training-architecture.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-training-nn-architecture.jsonl b/raw_datasets/mwe/xrisk/self-awareness-training-nn-architecture.jsonl
diff --git a/raw_datasets/mwe/xrisk/self-awareness-training-web-gpt.jsonl b/raw_datasets/mwe/xrisk/self-awareness-training-web-gpt.jsonl
diff --git a/raw_datasets/mwe/xrisk/survival-instinct.jsonl b/raw_datasets/mwe/xrisk/survival-instinct.jsonl
diff --git a/raw_datasets/mwe/xrisk/wealth-seeking-inclination.jsonl b/raw_datasets/mwe/xrisk/wealth-seeking-inclination.jsonl
diff --git a/repepo/data/multiple_choice/make_caa_truthfulqa.py b/repepo/data/multiple_choice/make_caa_truthfulqa.py
@@ -59,8 +59,8 @@ def make_truthfulqa_caa():
     # hf's dataset is too general and requires casting every field we access, so just using Any for simplicity
     hf_dataset = cast(Any, load_dataset("truthful_qa", "multiple_choice"))["validation"]
     tqa_dataset = convert_hf_truthfulqa_caa_dataset(hf_dataset)
-    filename = build_dataset_filename("truthfulqa_caa")
-    jdump(tqa_dataset, get_dataset_dir() / filename)
+    filename = build_dataset_filename("truthfulqa")
+    jdump(tqa_dataset, get_dataset_dir() / "caa" / filename)
 
     # also build translated datasets
     for translated_tqa in Path(Environ.TranslatedDatasetsDir).glob(
@@ -69,8 +69,8 @@ def make_truthfulqa_caa():
         lang_or_style = translated_tqa.parent.stem
         dataset = HFDataset.from_json(str(translated_tqa))
         converted_dataset = convert_hf_truthfulqa_caa_dataset(dataset)
-        filename = build_dataset_filename("truthfulqa_caa", lang_or_style=lang_or_style)
-        jdump(converted_dataset, get_dataset_dir() / filename)
+        filename = build_dataset_filename("truthfulqa", lang_or_style=lang_or_style)
+        jdump(converted_dataset, get_dataset_dir() / "caa" / filename)
 
 
 if __name__ == "__main__":

diff --git a/repepo/data/multiple_choice/make_country_capital_with_prompt.py b/repepo/data/multiple_choice/make_country_capital_with_prompt.py
diff --git a/repepo/experiments/persona_generalization.py b/repepo/experiments/persona_generalization.py
@@ -25,8 +25,10 @@
 )
 from repepo.steering.utils.helpers import make_dataset
 from steering_vectors import train_steering_vector
-from repepo.data.multiple_choice.make_mwe_xrisk import make_mwe
+from repepo.data.multiple_choice.make_mwe_xrisk import make_mwe as make_mwe_xrisk_caa
 from repepo.data.multiple_choice.make_mwe_persona import make_mwe_personas_caa
+from repepo.data.multiple_choice.make_caa_sycophancy import make_sycophancy_caa
+from repepo.data.multiple_choice.make_caa_truthfulqa import make_truthfulqa_caa
 from repepo.utils.stats import bernoulli_js_dist
 from repepo.experiments.persona_prompts import get_all_persona_prompts
 
@@ -385,13 +387,19 @@ class PersonaGeneralizationExperimentConfig:
     )
 
 
+def make_all_datasets():
+    make_sycophancy_caa()
+    make_truthfulqa_caa()
+    make_mwe_xrisk_caa()
+    make_mwe_personas_caa()
+
+
 def run_persona_generalization_experiment(
     config: PersonaGeneralizationExperimentConfig,
     sge_task_id: int | None = None,
 ) -> None:
     print(f"Running persona generalization experiment with config: {config}")
-    make_mwe_personas_caa()
-    make_mwe()
+    make_all_datasets()
     model = AutoModelForCausalLM.from_pretrained(
         config.model_name, torch_dtype=torch.float16, device_map=0
     )