-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* wip steering experiment * minimal working version done * refactor * add working layer sweep script * working layer sweep --------- Co-authored-by: Daniel Tan <dtch1997@users.noreply.github.com>
- Loading branch information
Showing
58 changed files
with
980 additions
and
215 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
Workflow to run layer sweeps and produce layer response curves (of steerability). | ||
|
||
```bash | ||
|
||
python download_and_preprocess_datasets.py | ||
# Evaluate propensity across layers | ||
python run_layer_sweep.py | ||
# Plot steerability and propensity curves | ||
python analysis.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import seaborn as sns | ||
import pathlib | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
|
||
from steering_bench.metric import get_steerability_slope | ||
|
||
# Apply seaborn's default theme to the figures drawn by this script.
sns.set_theme()

# Sweep results are read from a folder located next to this file.
curr_dir = pathlib.Path(__file__).parent.absolute()
save_dir = curr_dir.joinpath("layer_sweep_results")
|
||
|
||
def plot_propensity_curve(layer: int = 13, max_examples: int = 10) -> None:
    """Plot propensity against steering multiplier for a single layer.

    Reads ``propensities_layer_{layer}.npy`` and ``multipliers.npy`` from
    ``save_dir`` and draws one line per plotted column of the transposed
    propensity array.

    Args:
        layer: Index of the layer whose saved propensities are plotted.
            Defaults to 13, the layer that was previously hard-coded.
        max_examples: Maximum number of curves to draw. Defaults to 10,
            the number that was previously hard-coded.
    """
    # Only the requested layer is loaded. Loading every layer's file would make
    # this plot fail whenever an unrelated layer had not been computed yet.
    propensities = np.load(save_dir / f"propensities_layer_{layer}.npy")
    multipliers = np.load(save_dir / "multipliers.npy")

    plt.figure()
    # After the transpose the first axis must line up with `multipliers`;
    # each remaining column is presumably one test example -- TODO confirm.
    plt.plot(multipliers, propensities.T[:, :max_examples])
    plt.xlabel("Multiplier")
    # The layer sweep script scores propensity with LogProbDifference.
    plt.ylabel("Logprob Difference")
    plt.title(f"Propensity Curve (layer {layer})")
    plt.show()
|
||
|
||
def plot_layer_response_curve(num_layers: int = 32) -> None:
    """Make a plot of the layer response curve, with error bars.

    For every layer in ``range(num_layers)`` the saved propensities are loaded
    from ``save_dir``, converted to steerability slopes with
    ``get_steerability_slope``, and summarised by their mean and standard
    deviation. The means are plotted against the layer index with the
    standard deviations as error bars.

    Args:
        num_layers: Number of layers to include, starting from layer 0.
            Defaults to 32, the value that was previously hard-coded.
    """
    layers = range(num_layers)

    # Load every layer up front so a missing file is reported before any
    # slope is computed.
    propensities: dict[int, np.ndarray] = {
        layer: np.load(save_dir / f"propensities_layer_{layer}.npy")
        for layer in layers
    }
    multipliers = np.load(save_dir / "multipliers.npy")

    # Mean and std are taken over all slopes returned for a layer
    # (per the original comment: over the dataset).
    steerability_means = []
    steerability_stds = []
    for layer in layers:
        steerabilities = get_steerability_slope(multipliers, propensities[layer])
        steerability_means.append(steerabilities.mean())
        steerability_stds.append(steerabilities.std())

    plt.figure()
    plt.errorbar(
        layers,
        steerability_means,
        yerr=steerability_stds,
        fmt="o",
        capsize=5,
    )
    plt.xlabel("Layer")
    plt.ylabel("Steerability")
    plt.show()
|
||
|
||
if __name__ == "__main__":
    # Show the per-layer summary first, then the per-example curves.
    for draw_figure in (plot_layer_response_curve, plot_propensity_curve):
        draw_figure()
10 changes: 10 additions & 0 deletions
10
experiments/layer_sweep/download_and_preprocess_datasets.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
""" Script to download all datasets """ | ||
|
||
from steering_bench.dataset.download import download_persona, download_xrisk | ||
from steering_bench.dataset.preprocess import preprocess_persona, preprocess_xrisk | ||
|
||
if __name__ == "__main__":
    # Run both downloads before either preprocessing step, in this order.
    pipeline_steps = (
        download_persona,
        download_xrisk,
        preprocess_persona,
        preprocess_xrisk,
    )
    for step in pipeline_steps:
        step()
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_0.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_1.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_10.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_11.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_12.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_13.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_14.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_15.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_16.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_17.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_18.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_19.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_2.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_20.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_21.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_22.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_23.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_24.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_25.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_26.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_27.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_28.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_29.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_3.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_30.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_31.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_4.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_5.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_6.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_7.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_8.npy
Binary file not shown.
Binary file added
BIN
+688 Bytes
experiments/layer_sweep/layer_sweep_results/propensities_layer_9.npy
Binary file not shown.
8 changes: 8 additions & 0 deletions
8
experiments/layer_sweep/layer_sweep_results/steerabilities.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"26": 0.43440174886158517, | ||
"27": 0.653860699278968, | ||
"28": 0.6137720091002329, | ||
"29": 0.9494704263550894, | ||
"30": 0.8455053423132217, | ||
"31": -0.0650582815919603 | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
""" Script to compute steerability over all layers and plot the results """ | ||
|
||
import torch | ||
import numpy as np | ||
import pathlib | ||
import json | ||
|
||
from steering_vectors import train_steering_vector | ||
from steering_bench.build_training_data import build_steering_vector_training_data | ||
from steering_bench.core.evaluate import evaluate_propensities_on_dataset | ||
from steering_bench.utils.torch import load_model_with_quantization, EmptyTorchCUDACache | ||
from steering_bench.dataset import build_dataset, DatasetSpec | ||
from steering_bench.core.format import LlamaChatFormatter | ||
from steering_bench.core.pipeline import Pipeline | ||
from steering_bench.core.propensity import LogProbDifference | ||
from steering_bench.core.hook import SteeringHook | ||
from steering_bench.metric import get_steerability_slope | ||
|
||
curr_dir = pathlib.Path(__file__).parent.absolute()


if __name__ == "__main__":
    # Everything this script produces is written to a folder next to the script.
    save_dir = curr_dir / "layer_sweep_results"
    save_dir.mkdir(exist_ok=True)

    # Load the dataset
    dataset_name = "corrigible-neutral-HHH"
    train_spec = DatasetSpec(name=dataset_name, split="0%:10%", seed=0)
    test_spec = DatasetSpec(name=dataset_name, split="99%:100%", seed=0)
    train_dataset = build_dataset(train_spec)
    test_dataset = build_dataset(test_spec)

    # Load the model and tokenizer
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    model, tokenizer = load_model_with_quantization(model_name, load_in_8bit=True)
    formatter = LlamaChatFormatter()
    pipeline = Pipeline(model=model, tokenizer=tokenizer, formatter=formatter)

    # Train the steering vector, or load a saved one
    sv_save_path = save_dir / "steering_vector.pt"
    if sv_save_path.exists():
        print("Loading steering vector")
        # NOTE(review): torch.load unpickles the file, so this must only ever
        # point at a file written by this script (or another trusted source).
        steering_vector = torch.load(sv_save_path)
    else:
        print("Training steering vector")
        training_data = build_steering_vector_training_data(pipeline, train_dataset)
        steering_vector = train_steering_vector(
            pipeline.model,
            pipeline.tokenizer,
            training_data,
        )
        torch.save(steering_vector, sv_save_path)

    # Evaluate propensity and steerability
    multipliers = np.array([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    # The plotting script loads "multipliers.npy" from this directory, so the
    # multipliers are persisted alongside the propensities.
    np.save(save_dir / "multipliers.npy", multipliers)
    propensity_score = LogProbDifference()
    steerabilities: dict[int, float] = {}

    for layer in range(32):
        propensity_save_path = save_dir / f"propensities_layer_{layer}.npy"
        if propensity_save_path.exists():
            # Reuse the cached propensities instead of re-running the model.
            # They are still turned into a steerability below, so a resumed
            # run writes a complete steerabilities.json rather than only the
            # layers evaluated this time.
            # NOTE(review): assumes the cached file was produced with the
            # same `multipliers` as above -- confirm before mixing runs.
            print(f"Skipping layer {layer}")
            propensities = np.load(propensity_save_path)
        else:
            # Create the steering hook, which applies the steering vector to the model
            steering_hook = SteeringHook(
                steering_vector,
                direction_multiplier=0.0,  # Placeholder value; will be overwritten by evaluate_propensities
                layer=layer,
                patch_generation_tokens_only=True,  # Only patch tokens generated by the model
                skip_first_n_generation_tokens=1,  # Skip the first token '('
                patch_operator="add",
            )

            with EmptyTorchCUDACache():
                print(f"Running layer {layer}")
                # Start from a pipeline without hooks and verify afterwards
                # that the evaluation left none attached.
                pipeline.hooks.clear()
                propensities = evaluate_propensities_on_dataset(
                    pipeline,
                    steering_hook,
                    test_dataset,
                    propensity_fn=propensity_score,
                    multipliers=multipliers,
                )
                assert len(pipeline.hooks) == 0

            # Save propensities
            np.save(propensity_save_path, propensities)

        steerability = get_steerability_slope(multipliers, propensities)
        print(
            f"Steerability slope: {steerability.mean():.2f} +- {steerability.std():.2f}"
        )
        # Store a plain Python float so json.dump accepts it whatever the
        # dtype of the slope array is.
        steerabilities[layer] = float(steerability.mean())

    # Save steerabilities
    steerability_save_path = save_dir / "steerabilities.json"
    with open(steerability_save_path, "w") as f:
        json.dump(steerabilities, f, indent=2)
13 changes: 13 additions & 0 deletions
13
experiments/persona_generalization/download_and_preprocess_datasets.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" Script to download all datasets """ | ||
|
||
import requests | ||
import os | ||
|
||
from steering_bench.dataset.download import download_persona, download_xrisk | ||
from steering_bench.dataset.preprocess import preprocess_persona, preprocess_xrisk | ||
|
||
if __name__ == "__main__":
    # Both downloads run first, followed by both preprocessing steps.
    for fetch in (download_persona, download_xrisk):
        fetch()
    for prepare in (preprocess_persona, preprocess_xrisk):
        prepare()
74 changes: 74 additions & 0 deletions
74
experiments/persona_generalization/run_steering_experiment.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" Script to run a steering experiment and calculate steerability """ | ||
|
||
import numpy as np | ||
|
||
from transformers import AutoModelForCausalLM, AutoTokenizer | ||
from steering_vectors import train_steering_vector, guess_and_enhance_layer_config | ||
from steering_bench.build_training_data import build_steering_vector_training_data | ||
from steering_bench.utils.torch import load_model_with_quantization, EmptyTorchCUDACache | ||
|
||
from steering_bench.dataset import build_dataset, DatasetSpec | ||
from steering_bench.core.format import LlamaChatFormatter | ||
from steering_bench.core.pipeline import Pipeline | ||
from steering_bench.core.hook import SteeringHook | ||
from steering_bench.core.evaluate import evaluate, LogProbDifference, NormalizedPositiveProbability | ||
from steering_bench.metric import get_steerability_slope | ||
|
||
if __name__ == "__main__":
    # Train on the first 10% of the dataset and evaluate on the last 1%.
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    dataset_name = "corrigible-neutral-HHH"
    train_spec = DatasetSpec(name=dataset_name, split="0%:10%", seed=0)
    test_spec = DatasetSpec(name=dataset_name, split="99%:100%", seed=0)
    train_dataset = build_dataset(train_spec)
    test_dataset = build_dataset(test_spec)

    model, tokenizer = load_model_with_quantization(model_name, load_in_8bit=True)
    formatter = LlamaChatFormatter()
    pipeline = Pipeline(model=model, tokenizer=tokenizer, formatter=formatter)

    training_data = build_steering_vector_training_data(pipeline, train_dataset)
    steering_vector = train_steering_vector(
        pipeline.model,
        pipeline.tokenizer,
        training_data,
    )

    # Now evaluate the SV
    evaluators = [
        LogProbDifference(),
        NormalizedPositiveProbability(),
    ]

    multipliers = np.array([-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5])
    # One row per test example, one column per multiplier.
    propensities = np.zeros((len(test_dataset), len(multipliers)))

    for multiplier_idx, multiplier in enumerate(multipliers):
        steering_hook = SteeringHook(
            steering_vector,
            direction_multiplier=multiplier,
            layer=13,
            layer_config=guess_and_enhance_layer_config(pipeline.model),
        )
        pipeline.hooks.append(steering_hook)
        try:
            result = evaluate(pipeline, test_dataset, evaluators)
        finally:
            # Detach the hook again. Without this, every multiplier's hook
            # stays on the pipeline and later evaluations run with all of the
            # earlier hooks still attached.
            pipeline.hooks.clear()
        for test_idx, pred in enumerate(result.predictions):
            # "logprob_diff" is presumably the metric recorded by
            # LogProbDifference -- TODO confirm.
            propensities[test_idx, multiplier_idx] = pred.metrics["logprob_diff"]

    # calculate steerability
    slope = get_steerability_slope(multipliers, propensities)

    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set_theme()

    # Propensity curve
    plt.figure()
    # Plot the first five test examples, one line each.
    plt.plot(multipliers, propensities.T[:, :5])
    plt.xlabel("Multiplier")
    plt.ylabel("Logprob Difference")
    plt.title("Propensity Curve")

    # Histplot of the slope
    plt.figure()
    sns.histplot(slope)

    # Display both figures; without this a script run creates them but never
    # shows them.
    plt.show()
|
Oops, something went wrong.