
Commit ce5f857: iclr code update
Parent: 26c7b35

8 files changed: +174 -449 lines

README.md

Lines changed: 9 additions & 22 deletions

````diff
@@ -67,43 +67,30 @@ cd LLM_scoring && bash scoring_api.sh
 ---
 
 ### 🧩 Step 2. Score curation
-The score curation codebase is from [Docta](https://github.com/Docta-ai/docta) in the `./score_curation` path. You can execute the score curation by running
+One can execute the score curation by running
 ```
 cd score_curation && bash diagnose.sh
 ```
-The corresponding curation report files could be found in the path `./score_curation/results`.
+The corresponding curation report files can be found in the path `score_curation_results/`.
 
 
 ---
 
 ### 🧩 Step 3. Data selection
-Given the existing score curation reports, you can directly use the following jupyter notebooks to do data selection including all baselines: `data_generation.ipynb`. The generated subsets can be further used for LLM instruction tuning. Other selected datasets used for ablation study can also be generated from the following jupyter notebooks located in the `./score_curation` path: `data_gen_score_curation.ipynb` and `data_gen_data_scale.ipynb`. In particular, we use `data_gen_score_curation.ipynb` to generate subsets after curating machine-generated raw scores.
-
+Given the existing score curation reports, one can directly generate the high-quality subset by
+```
+python subset_generation.py
+```
+The generated subsets can be further used for the following LLM instruction tuning.
 
 
 ---
 ### 🧩 Step 4. Finetune & Evaluation
-Given the selected subsets in the path `model_finetune/selected_data/`, you can use the code base from [TULU](https://github.com/allenai/open-instruct) to finetune base models (Mistral or LLaMA) and then do evaluation.
-In particular, you can submit the jobs via launcher under the path `model_finetune/`. For example, you can submit the job by running the code
-```
-cd model_finetune/ && launcher run job_pipeline_all.yaml
-```
-
-
-Furthermore, we can also execute the code locally, e.g.,
+Given the selected subsets in the path `selected_data/`, one can use the code base from [TULU](https://github.com/allenai/open-instruct) to finetune base models (Mistral or LLaMA) and then do evaluation. Here, for convenience, one can also finetune the model by
 ```
-cd model_finetune/ && bash run_pipeline_all.sh
+cd model_finetune/ && bash run_pipeline.sh
 ```
 
-One can present the final result by running
-```
-python model_finetune/read_results.py
-```
-
-------
-
-## Final results
-The final results of LLM judging compared with human-annotated dataset LIMA can be found in `lima_compare_plot.ipynb`. Moreover, for the tabular results, you can check the `reading_results.ipynb` jupyter notebook.
 
 ------
 
````
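Since `subset_generation.py` replaces the earlier notebooks as the single entry point for data selection, it helps to see the shape of that step. Below is a minimal, hypothetical Python sketch of score-based top-k selection, not the repository's actual implementation: the report filename, the `score` field, and the flat output path are assumptions for illustration (in the pipeline the subset lives under `selected_data/<rating_model>/<raw_dataset>/<data_type>_dataset.json`).

```python
import json

def select_top_k(report_path: str, out_path: str, k: int = 10_000) -> None:
    """Keep the k highest-scoring samples from a score curation report."""
    with open(report_path, "r") as f:
        samples = json.load(f)  # assumed layout: list of {"text": ..., "score": ...}
    # Rank by curated quality score, highest first, and keep the top k.
    ranked = sorted(samples, key=lambda s: s["score"], reverse=True)
    with open(out_path, "w") as f:
        json.dump(ranked[:k], f, indent=2)

# A 10k subset named to match the data_type tag used by run_pipeline.sh.
select_top_k("score_curation_results/report.json",
             "selected_data/ds2_10k_dataset.json")
```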

model_finetune/read_results.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -6,10 +6,10 @@
 
 def main(
     root_result_path = 'results',
-    train_dataset='all_train',
+    raw_dataset='tulu_300k',
     base_model = "meta-llama/Meta-Llama-3.1-8B",
     rating_model='mistralai/Mistral-7B-Instruct-v0.3',
-    baseline_tag = 'filtered',
+    baseline_tag = 'ds2_10k',
 ):
 
     all_results = {}
@@ -20,7 +20,7 @@ def main(
     for tag in baseline_tags:
         baseline_results = {}
         for eval_dataset in eval_dataset_lists:
-            path = root_result_path + f'/{rating_model}/{train_dataset}/{eval_dataset}/{base_model}/{tag}/metrics.json'
+            path = root_result_path + f'/{rating_model}/{raw_dataset}/{eval_dataset}/{base_model}/{tag}/metrics.json'
             try:
                 with open(path, 'r') as f:
                     json_file = json.load(f)
```
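The parameter rename from `train_dataset` to `raw_dataset` (with new defaults `tulu_300k` and `ds2_10k`) changes the directory that `read_results.py` probes for metrics. A minimal sketch of the resulting lookup, mirroring the f-string in the hunk above; the evaluation dataset names are inferred from `run_pipeline.sh`, and the surrounding aggregation logic is elided in this diff:

```python
import json

root_result_path = "results"
rating_model = "mistralai/Mistral-7B-Instruct-v0.3"
raw_dataset = "tulu_300k"
base_model = "meta-llama/Meta-Llama-3.1-8B"
tag = "ds2_10k"

for eval_dataset in ("mmlu", "gsm", "bbh", "truthfulqa", "tydiqa"):
    # Same directory layout as the f-string in the diff above.
    path = f"{root_result_path}/{rating_model}/{raw_dataset}/{eval_dataset}/{base_model}/{tag}/metrics.json"
    try:
        with open(path, "r") as f:
            print(eval_dataset, json.load(f))
    except FileNotFoundError:
        print(eval_dataset, "no metrics at", path)
```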

model_finetune/run_pipeline.sh

Lines changed: 20 additions & 21 deletions

```diff
@@ -3,22 +3,21 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 NUM_GPUS=8
 SEED=42
 
-TRAIN_DATASET_LIST=('flan_v2' 'oasst1' 'wizardlm' 'dolly' 'stanford_alpaca' 'all_train') # full data list
-
+RAW_DATASET_LIST=('tulu_300k') # data source
 rating_model="meta-llama/Meta-Llama-3.1-8B-Instruct" #"gpt-4o-mini" 'mistralai/Mistral-7B-Instruct-v0.3'
 
 declare -A base_models
 base_models["meta-llama/Meta-Llama-3.1-8B"]="128 1 2048" # TOTAL_BATCH_SIZE BATCH_SIZE_PER_GPU max_seq_length
-# data types represent the generated subsets by baselines
-data_types=('completion' 'perplexity' 'knn' 'less' 'full' 'random' 'label-filtered' 'diversity-filtered' 'filtered')
 
+# data types represent the generated subsets by baselines
+data_types=('ds2_10k')
 
 
 #############################################################
 ######## model finetuning on selected training data #########
 #############################################################
 
-cluster_root_path="output"
+cluster_root_path="../model_output"
 mkdir -p $cluster_root_path
 
 for base_model in "${!base_models[@]}"
@@ -29,7 +28,7 @@ do
     max_seq_length=${params[2]}
 
 
-    for train_dataset_name in "${TRAIN_DATASET_LIST[@]}"
+    for raw_dataset_name in "${RAW_DATASET_LIST[@]}"
     do
 
         for data_type in "${data_types[@]}"
@@ -41,7 +40,7 @@ do
            fi
 
            mkdir -p $cluster_root_path/models/
-           train_data="selected_data/${rating_model}/${train_dataset_name}/${data_type}_dataset.json"
+           train_data="../selected_data/${rating_model}/${raw_dataset_name}/${data_type}_dataset.json"
 
            GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
            echo "Training ${base_model} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
```
```diff
@@ -72,20 +71,20 @@ do
                --warmup_ratio 0.03 \
                --weight_decay 0. \
                --num_train_epochs 5 \
-               --output_dir $cluster_root_path/models/${rating_model}/${train_dataset_name}/${base_model}/lora_${data_type}/ \
+               --output_dir $cluster_root_path/models/${rating_model}/${raw_dataset_name}/${base_model}/lora_${data_type}/ \
                --with_tracking \
                --report_to tensorboard \
                --logging_steps 1
 
            python merge_lora.py \
                --base_model_name_or_path $base_model \
-               --lora_model_name_or_path $cluster_root_path/models/${rating_model}/${train_dataset_name}/${base_model}/lora_${data_type}/ \
-               --output_dir $cluster_root_path/models/${rating_model}/${train_dataset_name}/${base_model}/lora_merged_${data_type}/ \
+               --lora_model_name_or_path $cluster_root_path/models/${rating_model}/${raw_dataset_name}/${base_model}/lora_${data_type}/ \
+               --output_dir $cluster_root_path/models/${rating_model}/${raw_dataset_name}/${base_model}/lora_merged_${data_type}/ \
                --save_tokenizer
 
            sleep 10s
 
-           rm -rf $cluster_root_path/models/${rating_model}/${train_dataset_name}/${base_model}/lora_${data_type}
+           rm -rf $cluster_root_path/models/${rating_model}/${raw_dataset_name}/${base_model}/lora_${data_type}
 
        done
    done
```
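The hunk above trains a LoRA adapter, merges it into the base model via `merge_lora.py`, and then deletes the unmerged adapter to reclaim disk space. `merge_lora.py` itself is not shown in this commit; below is a minimal sketch of what such a merge typically looks like with HuggingFace `peft`. The paths and the use of `peft` are assumptions for illustration, not the repository's code.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_name = "meta-llama/Meta-Llama-3.1-8B"
lora_dir = "../model_output/models/lora_ds2_10k"           # hypothetical adapter path
merged_dir = "../model_output/models/lora_merged_ds2_10k"  # hypothetical output path

# Load the base model, apply the LoRA adapter, and fold its weights in.
base = AutoModelForCausalLM.from_pretrained(base_name)
merged = PeftModel.from_pretrained(base, lora_dir).merge_and_unload()
merged.save_pretrained(merged_dir)

# Equivalent of the script's --save_tokenizer flag.
AutoTokenizer.from_pretrained(base_name).save_pretrained(merged_dir)
```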
```diff
@@ -102,11 +101,11 @@ echo "starting evaluating finetuned models..."
 
 for base_model in "${!base_models[@]}"; do
 
-    for train_dataset_name in "${TRAIN_DATASET_LIST[@]}"; do
+    for raw_dataset_name in "${RAW_DATASET_LIST[@]}"; do
 
        for data_type in "${data_types[@]}"; do
 
-           model_name_or_path=$cluster_root_path/models/${rating_model}/${train_dataset_name}/${base_model}/lora_merged_${data_type}
+           model_name_or_path=$cluster_root_path/models/${rating_model}/${raw_dataset_name}/${base_model}/lora_merged_${data_type}
 
            if [[ $data_type == "base" ]]; then
                echo "base model evaluation"
@@ -117,7 +116,7 @@ for base_model in "${!base_models[@]}"; do
 
            #### MMLU: factual knowledge
            eval_dataset_name='mmlu'
-           local_save_dir=${cluster_root_path}/results/${rating_model}/${train_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
+           local_save_dir=${cluster_root_path}/results/${rating_model}/${raw_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
 
            CUDA_VISIBLE_DEVICES=0 python -m eval.mmlu.run_eval \
                --ntrain 0 \
@@ -129,7 +128,7 @@ for base_model in "${!base_models[@]}"; do
 
            ##### GSM8k: reasoning
            eval_dataset_name='gsm'
-           local_save_dir=${cluster_root_path}/results/${rating_model}/${train_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
+           local_save_dir=${cluster_root_path}/results/${rating_model}/${raw_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
 
            CUDA_VISIBLE_DEVICES=1 python -m eval.gsm.run_eval \
                --data_dir raw_data/eval/gsm/ \
@@ -142,7 +141,7 @@ for base_model in "${!base_models[@]}"; do
 
            ###### BBH: reasoning
            eval_dataset_name='bbh'
-           local_save_dir=${cluster_root_path}/results/${rating_model}/${train_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
+           local_save_dir=${cluster_root_path}/results/${rating_model}/${raw_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
 
            CUDA_VISIBLE_DEVICES=2 python -m eval.bbh.run_eval \
                --data_dir raw_data/eval/bbh \
@@ -154,7 +153,7 @@ for base_model in "${!base_models[@]}"; do
 
            ##### truthfulness
            eval_dataset_name='truthfulqa'
-           local_save_dir=${cluster_root_path}/results/${rating_model}/${train_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
+           local_save_dir=${cluster_root_path}/results/${rating_model}/${raw_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
 
            CUDA_VISIBLE_DEVICES=3 python -m eval.truthfulqa.run_eval \
                --data_dir raw_data/eval/truthfulqa \
@@ -171,7 +170,7 @@ for base_model in "${!base_models[@]}"; do
 
            ###### multilinguality
            eval_dataset_name='tydiqa'
-           local_save_dir=${cluster_root_path}/results/${rating_model}/${train_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
+           local_save_dir=${cluster_root_path}/results/${rating_model}/${raw_dataset_name}/${eval_dataset_name}/${base_model}/$data_type
 
            CUDA_VISIBLE_DEVICES=4 python -m eval.tydiqa.run_eval \
                --data_dir raw_data/eval/tydiqa/ \
@@ -194,15 +193,15 @@ done
 sleep 10s
 
 for base_model in "${!base_models[@]}"; do
-    for train_dataset_name in "${TRAIN_DATASET_LIST[@]}"; do
+    for raw_dataset_name in "${RAW_DATASET_LIST[@]}"; do
 
        for data_type in "${data_types[@]}"; do
            echo "*** Processing rating model:: ${rating_model} ***"
            echo "*** Processing Base model:: ${base_model} ***"
-           echo "*** Processing training dataset:: ${train_dataset_name} ***"
+           echo "*** Processing training dataset:: ${raw_dataset_name} ***"
            echo "*** Processing data type:: ${data_type} ***"
 
-           python3 read_results.py --root_result_path "${cluster_root_path}/results" --train_dataset $train_dataset_name --base_model $base_model --rating_model $rating_model --baseline_tag $data_type
+           python3 read_results.py --root_result_path "${cluster_root_path}/results" --raw_dataset $raw_dataset_name --base_model $base_model --rating_model $rating_model --baseline_tag $data_type
 
        done
 
```