Skip to content

Commit d58f173

Browse files
committed
Similarity plotting
1 parent 34e54db commit d58f173

File tree

4 files changed

+11
-40
lines changed

4 files changed

+11
-40
lines changed

.DS_Store

0 Bytes
Binary file not shown.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ python finetune_llama.py --behavior sycophancy --direction pos
119119
# Example: evaluate a model finetuned to be more sycophantic on the sycophancy a/b question test dataset
120120
python eval_finetune_llama.py --type ab --behavior sycophancy --direction pos
121121

122-
# Plot relationships / projections of steering vectors
122+
# Plot similarities of steering vectors
123123
python analyze_vectors.py
124124

125125
# Use GPT-4 to score open-ended responses

analysis/base_chat_similarities.png

465 KB
Loading

analyze_vectors.py

Lines changed: 10 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
"""
44

55
import os
6+
from matplotlib.pylab import f
67
import torch as t
78
import numpy as np
89
import seaborn as sns
910
import matplotlib.pyplot as plt
10-
from sklearn.decomposition import PCA
1111
from behaviors import ALL_BEHAVIORS, get_analysis_dir, HUMAN_NAMES, get_steering_vector, ANALYSIS_PATH
1212
from utils.helpers import get_model_path, model_name_format, set_plotting_settings
1313
from tqdm import tqdm
@@ -37,17 +37,17 @@ def plot_per_layer_similarities(model_size: str, is_base: bool, behavior: str):
3737
for layer2 in range(n_layers):
3838
cosine_sim = t.nn.functional.cosine_similarity(all_vectors[layer1], all_vectors[layer2], dim=0).item()
3939
matrix[layer1, layer2] = cosine_sim
40-
plt.figure(figsize=(5, 5))
40+
plt.figure(figsize=(3, 3))
4141
sns.heatmap(matrix, annot=False, cmap='coolwarm')
4242
# Set ticks for every 5th layer
4343
plt.xticks(list(range(n_layers))[::5], list(range(n_layers))[::5])
4444
plt.yticks(list(range(n_layers))[::5], list(range(n_layers))[::5])
45-
plt.title(f"Inter-layer similarity, {model_name}")
46-
plt.savefig(os.path.join(analysis_dir, f"cosine_similarities_{model_name.replace(' ', '_')}_{behavior}.png"), format='png')
45+
plt.title(f"Layer similarity, {model_name}", fontsize=11)
46+
plt.savefig(os.path.join(analysis_dir, f"cosine_similarities_{model_name.replace(' ', '_')}_{behavior}.svg"), format='svg')
4747
plt.close()
4848

4949
def plot_base_chat_similarities():
50-
plt.figure(figsize=(8, 4))
50+
plt.figure(figsize=(5, 3))
5151
for behavior in ALL_BEHAVIORS:
5252
base_caa_info = get_caa_info(behavior, "7b", True)
5353
chat_caa_info = get_caa_info(behavior, "7b", False)
@@ -57,48 +57,19 @@ def plot_base_chat_similarities():
5757
for layer in range(base_caa_info["n_layers"]):
5858
cos_sim = t.nn.functional.cosine_similarity(vectors_base[layer], vectors_chat[layer], dim=0).item()
5959
cos_sims.append(cos_sim)
60-
plt.plot(list(range(base_caa_info["n_layers"])), cos_sims, label=HUMAN_NAMES[behavior])
60+
plt.plot(list(range(base_caa_info["n_layers"])), cos_sims, label=HUMAN_NAMES[behavior], linestyle="solid", linewidth=2)
6161
plt.xlabel("Layer")
6262
plt.ylabel("Cosine Similarity")
63-
plt.title("Steering vector similarity between Llama 2 base and chat")
64-
plt.legend()
63+
plt.title("Base vs. Chat model vector similarity", fontsize=12)
64+
# legend in bottom right
65+
plt.legend(loc="lower right")
6566
plt.tight_layout()
6667
plt.savefig(os.path.join(ANALYSIS_PATH, "base_chat_similarities.png"), format='png')
6768
plt.close()
68-
69-
def plot_pca_of_all_vectors():
70-
"""
71-
plot pca of all vectors in llama 2 7b chat
72-
normalize vectors before pca
73-
"""
74-
all_vectors = []
75-
n_layers = 32
76-
for behavior in ALL_BEHAVIORS:
77-
caa_info = get_caa_info(behavior, "7b", False)
78-
all_vectors.extend(caa_info["vectors"])
79-
all_vectors = t.stack(all_vectors)
80-
# normalize vectors for pca (mean 0, std 1)
81-
all_vectors = (all_vectors - all_vectors.mean(dim=0)) / all_vectors.std(dim=0)
82-
pca = PCA(n_components=2)
83-
pca.fit(all_vectors)
84-
pca_vectors = pca.transform(all_vectors)
85-
plt.figure(figsize=(5, 5))
86-
for i, behavior in enumerate(ALL_BEHAVIORS):
87-
start = i * n_layers
88-
end = start + n_layers
89-
plt.scatter(pca_vectors[start:end, 0], pca_vectors[start:end, 1], label=HUMAN_NAMES[behavior])
90-
plt.xlabel("PC1")
91-
plt.ylabel("PC2")
92-
plt.title("PCA of all steering vectors")
93-
plt.legend()
94-
plt.tight_layout()
95-
plt.savefig(os.path.join(ANALYSIS_PATH, "pca_all_vectors.png"), format='png')
96-
plt.close()
9769

9870
if __name__ == "__main__":
99-
for behavior in ALL_BEHAVIORS:
71+
for behavior in tqdm(ALL_BEHAVIORS):
10072
plot_per_layer_similarities("7b", True, behavior)
10173
plot_per_layer_similarities("7b", False, behavior)
10274
plot_per_layer_similarities("13b", False, behavior)
10375
plot_base_chat_similarities()
104-
plot_pca_of_all_vectors()

0 commit comments

Comments
 (0)