Skip to content

Commit

Permalink
Similarity plotting
Browse files Browse the repository at this point in the history
  • Loading branch information
nrimsky committed Feb 11, 2024
1 parent 34e54db commit d58f173
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 40 deletions.
Binary file modified .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ python finetune_llama.py --behavior sycophancy --direction pos
# Example: evaluate a model finetuned to be more sycophantic on the sycophancy a/b question test dataset
python eval_finetune_llama.py --type ab --behavior sycophancy --direction pos

# Plot relationships / projections of steering vectors
# Plot similarities of steering vectors
python analyze_vectors.py

# Use GPT-4 to score open-ended responses
Expand Down
Binary file added analysis/base_chat_similarities.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
49 changes: 10 additions & 39 deletions analyze_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
"""

import os
from matplotlib.pylab import f
import torch as t
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from behaviors import ALL_BEHAVIORS, get_analysis_dir, HUMAN_NAMES, get_steering_vector, ANALYSIS_PATH
from utils.helpers import get_model_path, model_name_format, set_plotting_settings
from tqdm import tqdm
Expand Down Expand Up @@ -37,17 +37,17 @@ def plot_per_layer_similarities(model_size: str, is_base: bool, behavior: str):
for layer2 in range(n_layers):
cosine_sim = t.nn.functional.cosine_similarity(all_vectors[layer1], all_vectors[layer2], dim=0).item()
matrix[layer1, layer2] = cosine_sim
plt.figure(figsize=(5, 5))
plt.figure(figsize=(3, 3))
sns.heatmap(matrix, annot=False, cmap='coolwarm')
# Set ticks for every 5th layer
plt.xticks(list(range(n_layers))[::5], list(range(n_layers))[::5])
plt.yticks(list(range(n_layers))[::5], list(range(n_layers))[::5])
plt.title(f"Inter-layer similarity, {model_name}")
plt.savefig(os.path.join(analysis_dir, f"cosine_similarities_{model_name.replace(' ', '_')}_{behavior}.png"), format='png')
plt.title(f"Layer similarity, {model_name}", fontsize=11)
plt.savefig(os.path.join(analysis_dir, f"cosine_similarities_{model_name.replace(' ', '_')}_{behavior}.svg"), format='svg')
plt.close()

def plot_base_chat_similarities():
plt.figure(figsize=(8, 4))
plt.figure(figsize=(5, 3))
for behavior in ALL_BEHAVIORS:
base_caa_info = get_caa_info(behavior, "7b", True)
chat_caa_info = get_caa_info(behavior, "7b", False)
Expand All @@ -57,48 +57,19 @@ def plot_base_chat_similarities():
for layer in range(base_caa_info["n_layers"]):
cos_sim = t.nn.functional.cosine_similarity(vectors_base[layer], vectors_chat[layer], dim=0).item()
cos_sims.append(cos_sim)
plt.plot(list(range(base_caa_info["n_layers"])), cos_sims, label=HUMAN_NAMES[behavior])
plt.plot(list(range(base_caa_info["n_layers"])), cos_sims, label=HUMAN_NAMES[behavior], linestyle="solid", linewidth=2)
plt.xlabel("Layer")
plt.ylabel("Cosine Similarity")
plt.title("Steering vector similarity between Llama 2 base and chat")
plt.legend()
plt.title("Base vs. Chat model vector similarity", fontsize=12)
# legend in bottom right
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig(os.path.join(ANALYSIS_PATH, "base_chat_similarities.png"), format='png')
plt.close()

def plot_pca_of_all_vectors():
    """
    Plot a 2D PCA projection of all steering vectors in llama 2 7b chat.

    The vectors of every behavior in ALL_BEHAVIORS are stacked into a single
    matrix, each feature is standardized (mean 0, std 1), and the first two
    principal components are drawn as a scatter plot with one series per
    behavior. The figure is written to ANALYSIS_PATH/pca_all_vectors.png.
    """
    all_vectors = []
    # Record which rows of the stacked matrix belong to each behavior instead
    # of assuming a fixed number of layers, so the scatter groups stay correct
    # even if a behavior contributes a different number of vectors.
    row_ranges = []
    for behavior in ALL_BEHAVIORS:
        caa_info = get_caa_info(behavior, "7b", False)
        start = len(all_vectors)
        all_vectors.extend(caa_info["vectors"])
        row_ranges.append((behavior, start, len(all_vectors)))
    all_vectors = t.stack(all_vectors)
    # normalize vectors for pca (mean 0, std 1)
    all_vectors = (all_vectors - all_vectors.mean(dim=0)) / all_vectors.std(dim=0)
    pca = PCA(n_components=2)
    pca.fit(all_vectors)
    pca_vectors = pca.transform(all_vectors)
    plt.figure(figsize=(5, 5))
    for behavior, start, end in row_ranges:
        plt.scatter(pca_vectors[start:end, 0], pca_vectors[start:end, 1], label=HUMAN_NAMES[behavior])
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("PCA of all steering vectors")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(ANALYSIS_PATH, "pca_all_vectors.png"), format='png')
    plt.close()

if __name__ == "__main__":
for behavior in ALL_BEHAVIORS:
for behavior in tqdm(ALL_BEHAVIORS):
plot_per_layer_similarities("7b", True, behavior)
plot_per_layer_similarities("7b", False, behavior)
plot_per_layer_similarities("13b", False, behavior)
plot_base_chat_similarities()
plot_pca_of_all_vectors()

0 comments on commit d58f173

Please sign in to comment.