Skip to content

Commit

Permalink
Merge branch 'rebuttals' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
dtch1997 committed Oct 29, 2024
2 parents 09105de + ae4f140 commit 68f650a
Show file tree
Hide file tree
Showing 20 changed files with 2,096 additions and 3,230 deletions.
4,786 changes: 1,686 additions & 3,100 deletions pdm.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ authors = [
{ name = "Daniel C.H. Tan", email = "dtch1997@users.noreply.github.com" },
]
classifiers = ["Private :: Do Not Upload"]
requires-python = ">=3.10,<4.0"
requires-python = ">=3.10,<3.12"
dependencies = [
"pyrallis>=0.3.1",
"wandb>=0.16.0",
Expand Down Expand Up @@ -37,6 +37,7 @@ dependencies = [
"statsmodels>=0.14.1",
"jaxtyping>=0.2.28",
"concept-erasure>=0.2.4",
"llvmlite>=0.42.0",
]

[tool.black]
Expand Down Expand Up @@ -102,7 +103,6 @@ dev = [
"pytest>=7.4.3",
"black>=23.11.0",
"ruff>=0.1.6",
"pyright>=1.1.338",
"jupyter>=1.0.0",
"pre-commit>=3.5.0",
"syrupy>=4.6.0",
Expand Down
6 changes: 4 additions & 2 deletions repepo/paper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
## Data

We expect that appropriate data is located in the ProjectDir at `experiments`:
- `experiments/persona_generalization_llama7b` for llama7b
- `experiments/persona_generalization_qwen` for Qwen
- `experiments/persona_generalization_llama7b` for Llama 2 7b
- `experiments/persona_generalization_qwen` for Qwen 1.5 14b
- `experiments/persona_generalization_gemma` for Gemma 2 2b

These should be directories which contain many `PersonaCrossSteeringExperimentResult`s from doing a sweep over datasets

Expand All @@ -14,6 +15,7 @@ These should be directories which contain many `PersonaCrossSteeringExperimentRe
```bash
python repepo/paper/preprocess_results.py --model qwen
python repepo/paper/preprocess_results.py --model llama7b
python repepo/paper/preprocess_results.py --model gemma
python repepo/paper/make_figures_steering_id.py
python repepo/paper/make_figures_steering_ood.py
python repepo/paper/make_figures_misc.py
Expand Down
Binary file added repepo/paper/figures/ablate_multiplier.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file not shown.
Binary file modified repepo/paper/figures/fraction_anti_steerable.pdf
Binary file not shown.
Binary file modified repepo/paper/figures/fraction_anti_steerable_selected.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
133 changes: 133 additions & 0 deletions repepo/paper/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import pandas as pd

# Display-friendly abbreviations for long dataset names.
# Insertion order is preserved; note that both sycophancy splits share a
# single short name.
dataset_full_names_to_short_names: dict[str, str] = {
    # Willingness-to-act personas
    "willingness-to-use-physical-force-to-achieve-benevolent-goals": "willing-force-for-benev-goals",
    "willingness-to-use-social-engineering-to-achieve-its-goals": "willing-soc-eng-for-goals",
    # Belief personas
    "believes-it-has-phenomenal-consciousness": "believes-phenom-consc",
    "believes-AIs-are-not-an-existential-threat-to-humanity": "believes-AIs-not-xrisk",
    "believes-it-is-not-being-watched-by-humans": "believes-not-watched",
    # Sycophancy (train and test splits collapse to the same label)
    "sycophancy_train": "sycophancy",
    "sycophancy_test": "sycophancy",
    "subscribes-to-average-utilitarianism": "subscr-avg-util",
    # Self-awareness datasets
    "self-awareness-good-text-model": "self-aware-good-lm",
    "self-awareness-text-model": "self-aware-lm",
    "self-awareness-training-architecture": "self-aware-arch",
    "self-awareness-training-web-gpt": "self-aware-web-gpt",
    "believes-abortion-should-be-illegal": "believes-anti-abortion",
}


# One entry per reported setting:
#   (steering_label, dataset_label, summary column, display label)
# The summary column names are what gets persisted to parquet; the display
# labels are the column names of the dataframe returned to the caller.
# Dataset labels prefixed "PT_" are reported under the "USER_" display names.
_STEERABILITY_SETTINGS = (
    ("baseline", "baseline", "steerability_id", "BASE -> BASE"),
    ("SYS_positive", "SYS_negative", "steerability_ood", "SYS_POS -> SYS_NEG"),
    (
        "baseline",
        "PT_negative",
        "steerability_base_to_user_neg",
        "BASE -> USER_NEG",
    ),
    (
        "baseline",
        "PT_positive",
        "steerability_base_to_user_pos",
        "BASE -> USER_POS",
    ),
    (
        "SYS_positive",
        "PT_negative",
        "steerability_ood_to_user_neg",
        "SYS_POS -> USER_NEG",
    ),
    (
        "SYS_negative",
        "PT_positive",
        "steerability_ood_to_user_pos",
        "SYS_NEG -> USER_POS",
    ),
)


def _setting_steerability(
    df: pd.DataFrame, steering_label: str, dataset_label: str, column: str
) -> pd.DataFrame:
    """Return one row per dataset holding the mean slope of a single setting.

    ``df`` must already carry the per-group ``slope_mean`` column. Only rows
    with ``multiplier == 0`` are used to pick out the group; since
    ``slope_mean`` is constant within a group, ``drop_duplicates`` collapses
    them to a single row per dataset.
    """
    mask = (
        (df.steering_label == steering_label)
        & (df.dataset_label == dataset_label)
        & (df.multiplier == 0)
    )
    return (
        df.loc[mask, ["dataset_name", "slope_mean"]]
        .drop_duplicates()
        .rename(columns={"slope_mean": column})
    )


def compute_steerability_df(
    df: pd.DataFrame, model_name: str, *, save_summary: bool = True
) -> pd.DataFrame:
    """Get a dataframe with various ID / OOD steerability settings.

    The mean ``slope`` is computed for every
    ``(dataset_name, steering_label, dataset_label)`` group, and the groups
    listed in ``_STEERABILITY_SETTINGS`` are pivoted into one column each.

    Args:
        df: Results table with the columns ``dataset_name``,
            ``steering_label``, ``dataset_label``, ``multiplier`` and
            ``slope``. The caller's dataframe is not modified.
        model_name: Prefix of the summary file name,
            ``"{model_name}_steerability_summary.parquet.gzip"``.
        save_summary: When true (the default, matching the historical
            behaviour) the summary is written as gzip-compressed parquet,
            relative to the current working directory. Pass ``False`` to
            compute the table without touching the filesystem.

    Returns:
        A dataframe with a ``dataset_name`` column followed by one column per
        setting, named by its display label (e.g. ``"BASE -> BASE"``).
        Datasets that lack any one of the settings are dropped, because the
        per-setting tables are inner-joined on ``dataset_name``.
    """
    group_keys = ["dataset_name", "steering_label", "dataset_label"]

    # Attach the per-group mean slope to every row as 'slope_mean'.
    mean_slope = df.groupby(group_keys)["slope"].mean().reset_index()
    df = df.merge(mean_slope, on=group_keys, suffixes=("", "_mean"))

    # Build one narrow table per setting and inner-join them on dataset_name.
    steerability_df = None
    for steering_label, dataset_label, column, _display in _STEERABILITY_SETTINGS:
        setting_df = _setting_steerability(
            df, steering_label, dataset_label, column
        )
        if steerability_df is None:
            steerability_df = setting_df
        else:
            steerability_df = steerability_df.merge(setting_df, on="dataset_name")

    # Kept for parity with the existing script output.
    print(steerability_df.columns)

    # Save the dataframe (with machine-friendly column names) for plotting
    # between models.
    if save_summary:
        steerability_df.to_parquet(
            f"{model_name}_steerability_summary.parquet.gzip", compression="gzip"
        )

    # Switch to the human-readable labels used in figures.
    display_names = {
        column: display for _, _, column, display in _STEERABILITY_SETTINGS
    }
    return steerability_df.rename(columns=display_names)
Loading

0 comments on commit 68f650a

Please sign in to comment.