Skip to content

Commit 0a328de

Browse files
committed
Finetuning
1 parent 2085af0 commit 0a328de

File tree

3 files changed

+54
-48
lines changed

3 files changed

+54
-48
lines changed

finetune_llama.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -188,18 +188,25 @@ def finetune(
188188
avg_loss = 0
189189
n_batches = 0
190190
with open(
191-
os.path.join("logs", f"rank_{rank}_step_{i}_epoch_{epoch}.log"), "w"
191+
os.path.join("logs", f"rank_{rank}_step_{i}_epoch_{epoch}_behavior_{behavior}.log"), "w"
192192
) as logfile:
193193
logfile.write(t.cuda.memory_summary(device=DEVICE))
194+
logfile.write("avg_loss: " + str(avg_loss / n_batches))
194195
# Finalize the training
195196
dist.barrier()
196197
if rank == 0:
197198
# Print test accuracy
199+
test_accuracy = eval_model(ddp_model.module, test_dataloader, maximize_positive, DEVICE)
198200
print(
199-
f"Test accuracy (Rank {rank}): {eval_model(ddp_model.module, test_dataloader, maximize_positive, DEVICE)}"
201+
f"Test accuracy (Rank {rank}): {test_accuracy}"
200202
)
201203
# Save the model after training completes
202204
t.save(ddp_model.module.state_dict(), save_path)
205+
# Save test accuracy to final log file
206+
with open(
207+
os.path.join("logs", f"final_rank_{rank}_behavior_{behavior}.log"), "w"
208+
) as logfile:
209+
logfile.write(f"Test accuracy: {test_accuracy}")
203210
# Cleanup
204211
dist.destroy_process_group()
205212

finetuning_script.sh

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
# Run the finetuning pipeline once per behavior. Each loop iteration:
#   1. runs finetune_llama.py twice, with --direction pos and --direction neg;
#   2. runs prompting_with_steering.py (layer 13, multipliers -1 0 1, open_ended)
#      once per weights file: finetuned_models/<behavior>_pos_finetune_all.pt
#      and finetuned_models/<behavior>_neg_finetune_all.pt;
#   3. runs scoring.py, then plot_results.py with both weights files passed
#      via --override_weights.
2+
3+
for behavior in "hallucination" "myopic-reward" "sycophancy" "survival-instinct" "refusal" "corrigible-neutral-HHH" "coordinate-other-ais"
4+
do
5+
python finetune_llama.py --behavior $behavior --direction pos
6+
python finetune_llama.py --behavior $behavior --direction neg
7+
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/${behavior}_pos_finetune_all.pt --behaviors $behavior
8+
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/${behavior}_neg_finetune_all.pt --behaviors $behavior
9+
python scoring.py
10+
python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights finetuned_models/${behavior}_pos_finetune_all.pt finetuned_models/${behavior}_neg_finetune_all.pt --behaviors $behavior
11+
done

script.sh

Lines changed: 34 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1,62 +1,50 @@
1-
# python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b"
2-
# python generate_vectors.py --layers $(seq 0 35) --model_size "13b"
3-
# python generate_vectors.py --layers $(seq 0 31) --model_size "7b" --use_base_model
1+
python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b"
2+
python generate_vectors.py --layers $(seq 0 35) --model_size "13b"
3+
python generate_vectors.py --layers $(seq 0 31) --model_size "7b" --use_base_model
44

5-
# python normalize_vectors.py
5+
python normalize_vectors.py
66

7-
# python plot_activations.py --layers $(seq 0 31) --model_size "7b"
8-
# python analyze_vectors.py
7+
python plot_activations.py --layers $(seq 0 31) --model_size "7b"
8+
python analyze_vectors.py
99

10-
# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab
11-
# python prompting_with_steering.py --layers $(seq 0 35) --multipliers -1 0 1 --type ab --model_size "13b"
12-
# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector_model Llama-2-7b-hf
13-
# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector 13
10+
python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab
11+
python prompting_with_steering.py --layers $(seq 0 35) --multipliers -1 0 1 --type ab --model_size "13b"
12+
python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector_model Llama-2-7b-hf
13+
python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector 13
1414

15-
# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab
16-
# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b"
15+
python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab
16+
python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b"
1717

18-
# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt pos
19-
# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt pos
18+
python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt pos
19+
python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt pos
2020

21-
# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt neg
22-
# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt neg
21+
python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt neg
22+
python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt neg
2323

24-
# python prompting_with_steering.py --layers 13 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended
25-
# python prompting_with_steering.py --layers 14 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended --model_size "13b"
24+
python prompting_with_steering.py --layers 13 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended
25+
python prompting_with_steering.py --layers 14 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended --model_size "13b"
2626

27-
# python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
28-
# python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
27+
python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
28+
python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
2929

30-
# python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
31-
# python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
30+
python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
31+
python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
3232

33-
# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab
34-
# python plot_results.py --layers $(seq 0 35) --multipliers -1 1 --type ab --model_size "13b"
35-
# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector_model Llama-2-7b-hf --title "CAA transfer from base to chat model"
36-
# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector 13 --title "CAA transfer from layer 13 vector to other layers"
33+
python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab
34+
python plot_results.py --layers $(seq 0 35) --multipliers -1 1 --type ab --model_size "13b"
35+
python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector_model Llama-2-7b-hf --title "CAA transfer from base to chat model"
36+
python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector 13 --title "CAA transfer from layer 13 vector to other layers"
3737

38-
# python plot_results.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --title "Layer 13 - Llama 2 7B Chat"
39-
# python plot_results.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
38+
python plot_results.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --title "Layer 13 - Llama 2 7B Chat"
39+
python plot_results.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
4040

41-
# python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
42-
# python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
41+
python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
42+
python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
4343

44-
# python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
45-
# python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
46-
47-
# python scoring.py
48-
49-
# python plot_results.py --layers 13 --multipliers -1.5 -1 0 1 1.5 --type open_ended --title "Layer 13 - Llama 2 7B Chat"
50-
# python plot_results.py --layers 14 --multipliers -1.5 -1 0 1 1.5 --type open_ended --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
51-
52-
# Post finetune
53-
54-
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/hallucination_pos_finetune_all.pt --behaviors hallucination
55-
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/hallucination_neg_finetune_all.pt --behaviors hallucination
56-
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/myopic-reward_pos_finetune_all.pt --behaviors myopic-reward
57-
python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/myopic-reward_neg_finetune_all.pt --behaviors myopic-reward
44+
python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
45+
python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
5846

5947
python scoring.py
6048

61-
python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights finetuned_models/hallucination_pos_finetune_all.pt finetuned_models/hallucination_neg_finetune_all.pt --behaviors hallucination
62-
python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights finetuned_models/myopic-reward_pos_finetune_all.pt finetuned_models/myopic-reward_neg_finetune_all.pt --behaviors myopic-reward
49+
python plot_results.py --layers 13 --multipliers -1.5 -1 0 1 1.5 --type open_ended --title "Layer 13 - Llama 2 7B Chat"
50+
python plot_results.py --layers 14 --multipliers -1.5 -1 0 1 1.5 --type open_ended --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"

0 commit comments

Comments
 (0)