
Commit 7d3dee3

(1) adding support for evaluation skipping; (2) updating model and data… (#728)
* (1) adding support for evaluation skipping; (2) update model and data download instructions; (3) clean up
* adding reference TTT per issue 727 request
1 parent 10786e3 commit 7d3dee3

File tree

4 files changed: +21 -30 lines changed


llama2_70b_lora/README.md

Lines changed: 15 additions & 26 deletions

````diff
@@ -41,19 +41,26 @@ git clone https://github.com/mlperf/logging.git mlperf-logging
 pip install -e mlperf-logging
 ```
 ## Download Data and Model
-data can be downloaded from:
-[mlperf drive - train data](https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing)
-[mlperf drive - validation data](https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing)
-[mlperf drive - llama-v2 model](https://drive.google.com/drive/folders/1sTeuxkPhwkNPKIPFnOLIYCcK53oB3Ypc?usp=sharing)
-As defaults the scripts assume the model is under at ```./llama-v2-fused-qkv``` and the both train and validation are under ```dataset``` folder.
+MLCommons hosts the model for download exclusively by MLCommons Members. You must first agree to the [confidentiality notice](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform), then follow the [link](https://drive.google.com/drive/folders/11tBZvvrh0FCm3XuR5E849K42TqftYdUF) to a directory containing [Rclone download instructions](https://docs.google.com/document/d/1Yp2T_TsVfg8uEoEv0wa-dGP4R7r1EOHucTvDNWznWzE/edit#heading=h.at8a3matgbrk). Follow steps 1-3 to install and activate Rclone. Finally, download the model to the desired download directory (default ./models):
+```
+mkdir models
+cd models
+rclone copy mlc-llama2:Llama2-70b-fused-qkv-mlperf ./Llama2-70b-fused-qkv-mlperf -P
+```
+Similarly, download the data to the desired download directory (default ./dataset):
+```
+mkdir dataset
+cd dataset
+rclone copy mlc-llama2:training/scrolls_gov_report_8k ./scrolls_gov_report_8k -P
+```
 
 ## Llama2-70B on 8 devices
 
 Run:
 ```bash
 accelerate launch --config_file configs/default_config.yaml scripts/train.py \
 --dataset_path "./dataset" \
---model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
+--model_path "/models/llama-v2-fused-qkv" \
 --max_seq_len 8192 \
 --bf16 True \
 --logging_steps 24 \
@@ -81,23 +88,5 @@ where the Accelerate config file is [this one](https://github.com/regisss/lora/b
 
 > Using flash attention with `--use_flash_attn` is necessary for training on 8k-token sequences.
 
-Learning curves of such a run can be found here: https://huggingface.co/regisss/test_5/tensorboard
-
-
-## Evaluation
-
-To run evaluation for summarizing texts, you can run:
-- Without LoRA adapter weights:
-```
-python scripts/eval.py --model_name meta-llama/Llama-2-70b-hf --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report"
-```
-- With LoRA adapter weights:
-```
-python scripts/eval.py --peft_model_name path_to_my_lora_model --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report"
-```
-## expected outcome
-
-A clean output (train and eval loss) of a singel run with 440 steps can be found under
-```
-convergence_example.txt
-```
+## Reference code running time
+On 8xA100 cards, the TTT of the reference $\textcolor{red}{\textbf{UNOPTIMIZED}}$ code is 120-140 minutes on average.
````
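As a quick sanity check after the downloads, it can help to confirm that the data and model landed where the training scripts expect them. The snippet below is a hypothetical helper, not part of the repository; the directory names simply mirror the Rclone targets in the diff above:

```python
from pathlib import Path

# Hypothetical pre-flight check (not in the repo): verify the default layout
# produced by the Rclone commands above, i.e. the fused-QKV checkpoint under
# ./models and the Scrolls gov_report data under ./dataset.
expected = [
    Path("models/Llama2-70b-fused-qkv-mlperf"),
    Path("dataset/scrolls_gov_report_8k"),
]

for path in expected:
    status = "ok" if path.is_dir() else "MISSING"
    print(f"{status:7} {path}")
```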

llama2_70b_lora/run_llama_70B_scrolls_r16.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 accelerate launch --config_file configs/default_config.yaml scripts/train.py \
 --dataset_path "./dataset" \
---model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
+--model_path "./models/llama-v2-fused-qkv" \
 --max_seq_len 8192 \
 --bf16 True \
 --logging_steps 24 \
```

llama2_70b_lora/scripts/mlperf_logging_utils.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -90,7 +90,7 @@ def __init__(self, logger, train_dataset_length, eval_dataset_length,lora_alpha)
         }
 
     def on_train_begin(self, args, state, control, **kwargs):
-        self.gbs=int(args.per_device_train_batch_size * args.gradient_accumulation_steps * os.getenv("WORLD_SIZE", 1))
+        self.gbs=int(args.per_device_train_batch_size * args.gradient_accumulation_steps * int(os.getenv("WORLD_SIZE", 1)))
         self.mllogger.event(
             key=constants.CACHE_CLEAR, value="True",
         )
@@ -170,7 +170,7 @@ def on_step_begin(
             )
             control.should_log = True
 
-            if state.global_step % (state.eval_steps) == 0 and state.global_step > 0:
+            if state.global_step % (state.eval_steps) == 0 and state.global_step > args.eval_delay:
                 self.mllogger.end(
                     constants.BLOCK_STOP,
                     value="",
```

llama2_70b_lora/scripts/train.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -15,7 +15,7 @@
 
 from dataclasses import dataclass, field
 from typing import Optional
-
+import os
 from datasets import load_dataset
 from mlperf_logging_utils import LoraLogger, MLPerfCallback
 from transformers import HfArgumentParser, Trainer, TrainingArguments
@@ -136,6 +136,7 @@ class ScriptArguments:
 
 def main(args):
     loralogger = LoraLogger(target_eval_loss=args.target_eval_loss)
+    gbs=args.per_device_train_batch_size * args.gradient_accumulation_steps * int(os.getenv("WORLD_SIZE", 1))
     training_arguments = TrainingArguments(
         output_dir=args.output_dir,
         per_device_train_batch_size=args.per_device_train_batch_size,
@@ -154,6 +155,7 @@ def main(args):
         save_strategy="no",
         max_steps=args.max_steps,
         eval_steps=args.eval_steps,
+        eval_delay=int(0.125*gbs+2)*args.eval_steps,
         save_steps=args.save_steps,
         logging_steps=args.logging_steps,
         push_to_hub=args.push_to_hub,
```
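This is where the evaluation skipping named in the commit message is wired in: `eval_delay` is a standard `TrainingArguments` field that tells the Hugging Face `Trainer` not to evaluate before a given step, and here it is derived from the global batch size. A small sketch of the heuristic follows; the helper name and the batch-size numbers are invented for illustration:

```python
import os

# Pretend we launch on 8 workers (launchers export WORLD_SIZE as a string).
os.environ.setdefault("WORLD_SIZE", "8")

def compute_eval_delay(per_device_train_batch_size: int,
                       gradient_accumulation_steps: int,
                       eval_steps: int) -> int:
    """Defer the first evaluation by int(0.125 * gbs + 2) eval intervals,
    expressed in optimizer steps, mirroring the expression added above."""
    gbs = (per_device_train_batch_size
           * gradient_accumulation_steps
           * int(os.getenv("WORLD_SIZE", 1)))
    return int(0.125 * gbs + 2) * eval_steps

# With micro-batch 1, 4 accumulation steps and 8 workers: gbs = 32, so
# eval_delay = int(0.125 * 32 + 2) * 24 = 6 * 24 = 144 steps when eval_steps=24.
print(compute_eval_delay(1, 4, 24))  # 144
```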
