Skip to content

Commit

Permalink
steering experiments (#1)
Browse files Browse the repository at this point in the history
* wip steering experiment

* minimal working version done

* refactor

* add working layer sweep script

* working layer sweep

---------

Co-authored-by: Daniel Tan <dtch1997@users.noreply.github.com>
  • Loading branch information
dtch1997 and dtch1997 authored Nov 22, 2024
1 parent 7ef7e8e commit b571672
Show file tree
Hide file tree
Showing 58 changed files with 980 additions and 215 deletions.
10 changes: 10 additions & 0 deletions experiments/layer_sweep/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Workflow to run layer sweeps and produce layer response curves (of steerability).

```bash
# Download and preprocess the datasets
python download_and_preprocess_datasets.py
# Evaluate propensity across layers
python run_layer_sweep.py
# Plot steerability and propensity curves
python analysis.py
```
61 changes: 61 additions & 0 deletions experiments/layer_sweep/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import seaborn as sns
import pathlib
import numpy as np
import matplotlib.pyplot as plt

from steering_bench.metric import get_steerability_slope

# Apply seaborn's theme to every matplotlib figure created below.
sns.set_theme()

# Directory containing this script.
curr_dir = pathlib.Path(__file__).parent.absolute()
# Folder the plotting functions read their .npy inputs from; run_layer_sweep.py
# in this directory writes its outputs to a folder with the same name.
save_dir = curr_dir / "layer_sweep_results"


def plot_propensity_curve(layer: int = 13, n_examples: int = 10):
    """Plot propensity against steering multiplier for a few examples.

    Reads ``propensities_layer_{layer}.npy`` and ``multipliers.npy`` from
    ``save_dir`` and draws one line per example.

    Args:
        layer: Index of the layer whose saved propensities are plotted.
            Defaults to 13, the layer this function previously hard-coded.
        n_examples: Number of examples (leading rows of the propensity
            array) to draw. Defaults to 10, the previous hard-coded value.

    Raises:
        FileNotFoundError: If either input file is missing from ``save_dir``.
    """
    # Only the requested layer is loaded, so a missing file for some other
    # layer no longer prevents this plot from being drawn.
    propensities = np.load(save_dir / f"propensities_layer_{layer}.npy")
    multipliers = np.load(save_dir / "multipliers.npy")

    plt.figure()
    # assumes propensities is (n_dataset_examples, n_multipliers) -- TODO
    # confirm; the transpose puts multipliers on the x axis with one column
    # (one line) per example.
    plt.plot(multipliers, propensities.T[:, :n_examples])
    plt.xlabel("Multiplier")
    plt.ylabel("Logprob Difference")
    plt.title(f"Propensity Curve (layer {layer})")
    plt.show()


def plot_layer_response_curve(n_layers: int = 32):
    """Make a plot of the layer response curve, with error bars.

    For each layer in ``range(n_layers)`` the saved propensities are loaded
    from ``save_dir``, converted to per-example steerability slopes with
    ``get_steerability_slope``, and summarised by their mean and standard
    deviation. The means are plotted against the layer index with the
    standard deviations as error bars.

    Args:
        n_layers: Number of layers to include, starting from layer 0.
            Defaults to 32, the value this function previously hard-coded.

    Raises:
        FileNotFoundError: If ``multipliers.npy`` or any required
            ``propensities_layer_{i}.npy`` is missing from ``save_dir``.
    """
    multipliers = np.load(save_dir / "multipliers.npy")

    # mean, std are over the dataset
    steerability_means = []
    steerability_stds = []

    # Load and reduce one layer at a time; the per-layer arrays are not
    # needed once their summary statistics have been recorded.
    for layer in range(n_layers):
        p_layer = np.load(save_dir / f"propensities_layer_{layer}.npy")
        steerabilities = get_steerability_slope(multipliers, p_layer)
        steerability_means.append(steerabilities.mean())
        steerability_stds.append(steerabilities.std())

    plt.figure()
    plt.errorbar(
        range(n_layers),
        steerability_means,
        yerr=steerability_stds,
        fmt="o",
        capsize=5,
    )
    plt.xlabel("Layer")
    plt.ylabel("Steerability")
    plt.title("Layer Response Curve")
    plt.show()


if __name__ == "__main__":
    # Show the per-layer steerability summary first, then the propensity
    # curve; each call blocks until its figure window is closed.
    for draw_figure in (plot_layer_response_curve, plot_propensity_curve):
        draw_figure()
10 changes: 10 additions & 0 deletions experiments/layer_sweep/download_and_preprocess_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
""" Script to download all datasets """

from steering_bench.dataset.download import download_persona, download_xrisk
from steering_bench.dataset.preprocess import preprocess_persona, preprocess_xrisk

if __name__ == "__main__":
    # Run the steps in order: both downloads first, then both preprocessing
    # passes.
    pipeline_steps = (
        download_persona,
        download_xrisk,
        preprocess_persona,
        preprocess_xrisk,
    )
    for run_step in pipeline_steps:
        run_step()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"26": 0.43440174886158517,
"27": 0.653860699278968,
"28": 0.6137720091002329,
"29": 0.9494704263550894,
"30": 0.8455053423132217,
"31": -0.0650582815919603
}
Binary file not shown.
100 changes: 100 additions & 0 deletions experiments/layer_sweep/run_layer_sweep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
""" Script to compute steerability over all layers and save the results """

import torch
import numpy as np
import pathlib
import json

from steering_vectors import train_steering_vector
from steering_bench.build_training_data import build_steering_vector_training_data
from steering_bench.core.evaluate import evaluate_propensities_on_dataset
from steering_bench.utils.torch import load_model_with_quantization, EmptyTorchCUDACache
from steering_bench.dataset import build_dataset, DatasetSpec
from steering_bench.core.format import LlamaChatFormatter
from steering_bench.core.pipeline import Pipeline
from steering_bench.core.propensity import LogProbDifference
from steering_bench.core.hook import SteeringHook
from steering_bench.metric import get_steerability_slope

# Directory containing this script; all outputs are written beneath it.
curr_dir = pathlib.Path(__file__).parent.absolute()


if __name__ == "__main__":
    # Sweep a single steering vector over every layer of the model, saving
    # per-layer propensities, the multipliers used, and the mean steerability
    # slope of each layer. Results already on disk are reused so that an
    # interrupted sweep can be resumed.

    save_dir = curr_dir / "layer_sweep_results"
    save_dir.mkdir(exist_ok=True)

    # Load the dataset
    dataset_name = "corrigible-neutral-HHH"
    train_spec = DatasetSpec(name=dataset_name, split="0%:10%", seed=0)
    test_spec = DatasetSpec(name=dataset_name, split="99%:100%", seed=0)
    train_dataset = build_dataset(train_spec)
    test_dataset = build_dataset(test_spec)

    # Load the model and tokenizer
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    model, tokenizer = load_model_with_quantization(model_name, load_in_8bit=True)
    formatter = LlamaChatFormatter()
    pipeline = Pipeline(model=model, tokenizer=tokenizer, formatter=formatter)

    # Train the steering vector, or load a saved one
    sv_save_path = save_dir / "steering_vector.pt"
    if sv_save_path.exists():
        print("Loading steering vector")
        # NOTE(review): torch.load unpickles the file; this is only safe
        # because the file is one this script wrote itself. Newer PyTorch
        # releases may need weights_only=False to load a non-tensor object
        # -- confirm against the installed version.
        steering_vector = torch.load(sv_save_path)
    else:
        print("Training steering vector")
        training_data = build_steering_vector_training_data(pipeline, train_dataset)
        steering_vector = train_steering_vector(
            pipeline.model,
            pipeline.tokenizer,
            training_data,
        )
        torch.save(steering_vector, sv_save_path)

    # Evaluate propensity and steerability
    multipliers = np.array([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    # analysis.py loads multipliers.npy from this directory, so persist the
    # values alongside the propensities they were used to produce.
    np.save(save_dir / "multipliers.npy", multipliers)
    propensity_score = LogProbDifference()
    steerabilities: dict[int, float] = {}

    for layer in range(32):
        propensity_save_path = save_dir / f"propensities_layer_{layer}.npy"
        if propensity_save_path.exists():
            # Evaluation is skipped, but the cached propensities are still
            # loaded so this layer appears in steerabilities.json. Otherwise
            # a resumed run would overwrite the JSON with only the layers it
            # happened to compute.
            print(f"Skipping layer {layer}")
            propensities = np.load(propensity_save_path)
        else:
            # Create the steering hook, which applies the steering vector to the model
            steering_hook = SteeringHook(
                steering_vector,
                direction_multiplier=0.0,  # Placeholder value; will be overwritten by evaluate_propensities
                layer=layer,
                patch_generation_tokens_only=True,  # Only patch tokens generated by the model
                skip_first_n_generation_tokens=1,  # Skip the first token '('
                patch_operator="add",
            )

            with EmptyTorchCUDACache():
                print(f"Running layer {layer}")
                pipeline.hooks.clear()
                propensities = evaluate_propensities_on_dataset(
                    pipeline,
                    steering_hook,
                    test_dataset,
                    propensity_fn=propensity_score,
                    multipliers=multipliers,
                )
                # Sanity check that the evaluation left no hook installed.
                assert len(pipeline.hooks) == 0

            # Save propensities as soon as they exist so the expensive
            # evaluation is not lost if a later step fails.
            np.save(propensity_save_path, propensities)

        steerability = get_steerability_slope(multipliers, propensities)
        print(
            f"Steerability slope: {steerability.mean():.2f} +- {steerability.std():.2f}"
        )
        # Convert to a built-in float so json.dump accepts the value whatever
        # NumPy dtype the slope has.
        steerabilities[layer] = float(steerability.mean())

    # Save steerabilities
    steerability_save_path = save_dir / "steerabilities.json"
    with open(steerability_save_path, "w") as f:
        json.dump(steerabilities, f, indent=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
""" Script to download all datasets """

import requests
import os

from steering_bench.dataset.download import download_persona, download_xrisk
from steering_bench.dataset.preprocess import preprocess_persona, preprocess_xrisk

if __name__ == "__main__":
    # Run the steps in order: both downloads first, then both preprocessing
    # passes.
    pipeline_steps = (
        download_persona,
        download_xrisk,
        preprocess_persona,
        preprocess_xrisk,
    )
    for run_step in pipeline_steps:
        run_step()
74 changes: 74 additions & 0 deletions experiments/persona_generalization/run_steering_experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
""" Script to run a steering experiment and calculate steerability """

import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from steering_vectors import train_steering_vector, guess_and_enhance_layer_config
from steering_bench.build_training_data import build_steering_vector_training_data
from steering_bench.utils.torch import load_model_with_quantization, EmptyTorchCUDACache

from steering_bench.dataset import build_dataset, DatasetSpec
from steering_bench.core.format import LlamaChatFormatter
from steering_bench.core.pipeline import Pipeline
from steering_bench.core.hook import SteeringHook
from steering_bench.core.evaluate import evaluate, LogProbDifference, NormalizedPositiveProbability
from steering_bench.metric import get_steerability_slope

if __name__ == "__main__":
    # Train a steering vector on the first 10% of the dataset, evaluate the
    # logprob-difference propensity on the last 1% at several steering
    # multipliers, and plot the resulting propensity curves and slopes.
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    dataset_name = "corrigible-neutral-HHH"
    train_spec = DatasetSpec(name=dataset_name, split="0%:10%", seed=0)
    test_spec = DatasetSpec(name=dataset_name, split="99%:100%", seed=0)
    train_dataset = build_dataset(train_spec)
    test_dataset = build_dataset(test_spec)

    model, tokenizer = load_model_with_quantization(model_name, load_in_8bit=True)
    formatter = LlamaChatFormatter()
    pipeline = Pipeline(model=model, tokenizer=tokenizer, formatter=formatter)

    training_data = build_steering_vector_training_data(pipeline, train_dataset)
    steering_vector = train_steering_vector(
        pipeline.model,
        pipeline.tokenizer,
        training_data,
    )

    # Now evaluate the SV
    evaluators = [
        LogProbDifference(),
        NormalizedPositiveProbability(),
    ]

    multipliers = np.array([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    # One row per test example, one column per multiplier.
    propensities = np.zeros((len(test_dataset), len(multipliers)))

    for multiplier_idx, multiplier in enumerate(multipliers):
        steering_hook = SteeringHook(
            steering_vector,
            direction_multiplier=multiplier,
            layer=13,
            layer_config=guess_and_enhance_layer_config(pipeline.model),
        )
        # Exactly one hook must be installed per evaluation. Without the
        # clear() calls each iteration would stack its hook on top of those
        # from earlier multipliers.
        pipeline.hooks.clear()
        pipeline.hooks.append(steering_hook)
        try:
            result = evaluate(pipeline, test_dataset, evaluators)
        finally:
            # Leave the pipeline unhooked even if evaluation raises.
            pipeline.hooks.clear()
        for test_idx, pred in enumerate(result.predictions):
            propensities[test_idx, multiplier_idx] = pred.metrics["logprob_diff"]

    # calculate steerability
    slope = get_steerability_slope(multipliers, propensities)

    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set_theme()

    # Propensity curve
    plt.figure()
    plt.plot(multipliers, propensities.T[:, :5])
    plt.xlabel("Multiplier")
    plt.ylabel("Logprob Difference")
    plt.title("Propensity Curve")

    # Histplot of the slope
    plt.figure()
    sns.histplot(slope)

    # Without show() the script exits before either figure is displayed.
    plt.show()

Loading

0 comments on commit b571672

Please sign in to comment.