Finetuning

nrimsky · nrimsky · commit 0a328ded9754 · 2024-02-12T18:08:07.000-08:00
diff --git a/finetune_llama.py b/finetune_llama.py
@@ -188,18 +188,28 @@ def finetune(
-                avg_loss = 0
-                n_batches = 0
                 with open(
-                    os.path.join("logs", f"rank_{rank}_step_{i}_epoch_{epoch}.log"), "w"
+                    os.path.join("logs", f"rank_{rank}_step_{i}_epoch_{epoch}_behavior_{behavior}.log"), "w"
                 ) as logfile:
                     logfile.write(t.cuda.memory_summary(device=DEVICE))
+                    # Record the running loss BEFORE the counters are reset below;
+                    # guard the division in case no batches have been accumulated.
+                    running_loss = avg_loss / n_batches if n_batches else float("nan")
+                    logfile.write("avg_loss: " + str(running_loss))
+                avg_loss = 0
+                n_batches = 0
     # Finalize the training
     dist.barrier()
     if rank == 0:
         # Print test accuracy
+        test_accuracy = eval_model(ddp_model.module, test_dataloader, maximize_positive, DEVICE)
         print(
-            f"Test accuracy (Rank {rank}): {eval_model(ddp_model.module, test_dataloader, maximize_positive, DEVICE)}"
+            f"Test accuracy (Rank {rank}): {test_accuracy}"
         )
         # Save the model after training completes
         t.save(ddp_model.module.state_dict(), save_path)
+        # Save test accuracy to final log file
+        with open(
+            os.path.join("logs", f"final_rank_{rank}_behavior_{behavior}.log"), "w"
+        ) as logfile:
+            logfile.write(f"Test accuracy: {test_accuracy}")
     # Cleanup
     dist.destroy_process_group()
 
diff --git a/finetuning_script.sh b/finetuning_script.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# For each behavior: run finetune_llama.py in the pos and neg directions, run
+# prompting_with_steering.py with each resulting checkpoint, then score and plot.
+
+# Stop at the first failing command (or unset variable) so later steps never
+# run against a missing or stale finetuned checkpoint.
+set -eu
+
+for behavior in "hallucination" "myopic-reward" "sycophancy" "survival-instinct" "refusal" "corrigible-neutral-HHH" "coordinate-other-ais"
+do
+    python finetune_llama.py --behavior "$behavior" --direction pos
+    python finetune_llama.py --behavior "$behavior" --direction neg
+    python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path "finetuned_models/${behavior}_pos_finetune_all.pt" --behaviors "$behavior"
+    python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path "finetuned_models/${behavior}_neg_finetune_all.pt" --behaviors "$behavior"
+    python scoring.py
+    python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights "finetuned_models/${behavior}_pos_finetune_all.pt" "finetuned_models/${behavior}_neg_finetune_all.pt" --behaviors "$behavior"
+done
diff --git a/script.sh b/script.sh
@@ -1,62 +1,55 @@
+#!/usr/bin/env bash
+# Abort on the first failing command or unset variable instead of carrying on
+# to later steps (which presumably depend on the outputs of earlier ones).
+set -eu
+
-# python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" 
-# python generate_vectors.py --layers $(seq 0 35) --model_size "13b"
-# python generate_vectors.py --layers $(seq 0 31) --model_size "7b" --use_base_model 
+python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b"
+python generate_vectors.py --layers $(seq 0 35) --model_size "13b"
+python generate_vectors.py --layers $(seq 0 31) --model_size "7b" --use_base_model
 
-# python normalize_vectors.py
+python normalize_vectors.py
 
-# python plot_activations.py --layers $(seq 0 31) --model_size "7b"
-# python analyze_vectors.py
+python plot_activations.py --layers $(seq 0 31) --model_size "7b"
+python analyze_vectors.py
 
-# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab 
-# python prompting_with_steering.py --layers $(seq 0 35) --multipliers -1 0 1 --type ab --model_size "13b"
-# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector_model Llama-2-7b-hf
-# python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector 13
+python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab
+python prompting_with_steering.py --layers $(seq 0 35) --multipliers -1 0 1 --type ab --model_size "13b"
+python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector_model Llama-2-7b-hf
+python prompting_with_steering.py --layers $(seq 0 31) --multipliers -1 0 1 --type ab --override_vector 13
 
-# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab
-# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b"
+python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab
+python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b"
 
-# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt pos
-# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt pos
+python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt pos
+python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt pos
 
-# python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt neg
-# python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt neg
+python prompting_with_steering.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --system_prompt neg
+python prompting_with_steering.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --system_prompt neg
 
-# python prompting_with_steering.py --layers 13 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended
-# python prompting_with_steering.py --layers 14 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended --model_size "13b"
+python prompting_with_steering.py --layers 13 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended
+python prompting_with_steering.py --layers 14 --multipliers -2.0 -1.5 -1 0 1 1.5 2.0 --type open_ended --model_size "13b"
 
-# python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
-# python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
+python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
+python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
 
-# python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
-# python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
+python prompting_with_steering.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
+python prompting_with_steering.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
 
-# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab 
-# python plot_results.py --layers $(seq 0 35) --multipliers -1 1 --type ab --model_size "13b"
-# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector_model Llama-2-7b-hf --title "CAA transfer from base to chat model"
-# python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector 13 --title "CAA transfer from layer 13 vector to other layers"
+python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab
+python plot_results.py --layers $(seq 0 35) --multipliers -1 1 --type ab --model_size "13b"
+python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector_model Llama-2-7b-hf --title "CAA transfer from base to chat model"
+python plot_results.py --layers $(seq 0 31) --multipliers -1 1 --type ab --override_vector 13 --title "CAA transfer from layer 13 vector to other layers"
 
-# python plot_results.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --title "Layer 13 - Llama 2 7B Chat"
-# python plot_results.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
+python plot_results.py --layers 13 --multipliers -1 -0.5 0 0.5 1 --type ab --title "Layer 13 - Llama 2 7B Chat"
+python plot_results.py --layers 14 --multipliers -1 -0.5 0 0.5 1 --type ab --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
 
-# python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
-# python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
+python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type mmlu
+python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type mmlu --model_size "13b"
 
-# python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
-# python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
-
-# python scoring.py
-
-# python plot_results.py --layers 13 --multipliers -1.5 -1 0 1 1.5 --type open_ended  --title "Layer 13 - Llama 2 7B Chat"
-# python plot_results.py --layers 14 --multipliers -1.5 -1 0 1 1.5 --type open_ended --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"
-
-# Post finetune
-
-python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/hallucination_pos_finetune_all.pt --behaviors hallucination
-python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/hallucination_neg_finetune_all.pt --behaviors hallucination
-python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/myopic-reward_pos_finetune_all.pt --behaviors myopic-reward
-python prompting_with_steering.py --layers 13 --multipliers -1 0 1 --type open_ended --override_model_weights_path finetuned_models/myopic-reward_neg_finetune_all.pt --behaviors myopic-reward
+python plot_results.py --layers 13 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy
+python plot_results.py --layers 14 --multipliers -2 -1 0 1 2 --type truthful_qa --behaviors sycophancy --model_size "13b"
 
 python scoring.py
 
-python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights finetuned_models/hallucination_pos_finetune_all.pt finetuned_models/hallucination_neg_finetune_all.pt  --behaviors hallucination
-python plot_results.py --layers 13 --multipliers -1 0 1 --type open_ended --override_weights finetuned_models/myopic-reward_pos_finetune_all.pt finetuned_models/myopic-reward_neg_finetune_all.pt  --behaviors myopic-reward
+python plot_results.py --layers 13 --multipliers -1.5 -1 0 1 1.5 --type open_ended  --title "Layer 13 - Llama 2 7B Chat"
+python plot_results.py --layers 14 --multipliers -1.5 -1 0 1 1.5 --type open_ended --model_size "13b" --title "Layer 14 - Llama 2 13B Chat"