[TorchAcc][Experimental] Integrate more model in torchacc #683
Merged
49 commits
e2d3b44  [TorchAcc] Integrate TorchAcc and provide a sft example of qwen-72b-c… (baoleai)
6c899c9  Enhance TorchAcc support for dynamic sequence. (#382) (baoleai)
1321592  [TorchAcc] Add support for save/load checkpoint. (#444) (baoleai)
dba0c65  baichuan_patch (Zhikaiiii)
ef6e2d6  patch baichuan (dika-hhh)
432c070  modify baichuan (Zhikaiiii)
1d6f719  Merge branch 'torchacc' into torchacc2 (Zhikaiiii)
a5a6fdc  [TorchAcc] Fix batch split when padding_to is not None. (#480) (baoleai)
37c4787  Merge branch 'torchacc' of https://github.com/modelscope/swift into t… (Zhikaiiii)
32cc090  metric warmup calculate (Zhikaiiii)
64246d3  fix conflict (Zhikaiiii)
290c2dd  fix (Zhikaiiii)
7e6b197  model patch (Zhikaiiii)
f0c7c8a  add profiler (Zhikaiiii)
c8dbfc6  add yi (Zhikaiiii)
669f5d9  [TorchAcc] Integrate TorchAcc and provide a sft example of qwen-72b-c… (baoleai)
b140759  Enhance TorchAcc support for dynamic sequence. (#382) (baoleai)
6faa7b3  [TorchAcc] Add support for save/load checkpoint. (#444) (baoleai)
1c3a258  fix patch (baoleai)
160a9d5  fix lint (baoleai)
e0fe1d4  code clean (baoleai)
0bb1797  add argument: fsdp num (Zhikaiiii)
f03aa00  [TorchAcc] rebase master (Zhikaiiii)
661def1  [TorchAcc] Integrate TorchAcc and provide a sft example of qwen-72b-c… (baoleai)
da6c94a  Enhance TorchAcc support for dynamic sequence. (#382) (baoleai)
73a843a  [TorchAcc] Add support for save/load checkpoint. (#444) (baoleai)
ee012b1  fix patch (baoleai)
f1b19a6  fix lint (baoleai)
0457fa4  code clean (baoleai)
d10901f  fix comments (baoleai)
30ad8c8  rebase (baoleai)
cd6e799  clean code (Zhikaiiii)
4400ea5  Merge remote-tracking branch 'origin_balole/features/rebase_0401' int… (Zhikaiiii)
f92274c  clean code (Zhikaiiii)
8e3cf24  Merge remote-tracking branch 'origin/main' into rebase_acc (Zhikaiiii)
8ee4bbf  format code (Zhikaiiii)
c3284ed  [fix] add mark_step to optimize speed (Zhikaiiii)
e38fc2e  add script (Zhikaiiii)
aa61d6f  add torchacc trim graph (Zhikaiiii)
40d18e9  remove useless code (Zhikaiiii)
0a173f1  remove useless files (Zhikaiiii)
6226edb  add qwen72b full script (Zhikaiiii)
5da5649  Merge branch 'main' into rebase_acc (Zhikaiiii)
6d68d29  fix bugs (Zhikaiiii)
a508e21  qwen15 and llama3 support (Zhikaiiii)
bf2a440  Merge branch 'main' into rebase_acc (Zhikaiiii)
c5c310a  remove prof callback (Zhikaiiii)
bd8d072  fix default value and add switch (Zhikaiiii)
df84f3f  update script (Zhikaiiii)
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_dp_sft.sh (35 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 12 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --report_to 'none'
```
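With `--gradient_accumulation_steps 1`, the optimizer-step batch is just the per-process `--batch_size` multiplied across the launched processes. A quick illustrative calculation for the data-parallel launch above (the variable names here are for the sketch only, not flags consumed by swift):

```shell
# Illustrative: effective global batch size of the dp launch above.
NPROC_PER_NODE=2   # one process per GPU in CUDA_VISIBLE_DEVICES=2,3
BATCH_SIZE=12      # --batch_size passed to swift sft
GRAD_ACCUM=1       # --gradient_accumulation_steps

GLOBAL_BATCH=$((NPROC_PER_NODE * BATCH_SIZE * GRAD_ACCUM))
echo "global batch size: ${GLOBAL_BATCH}"   # prints: global batch size: 24
```

This is worth keeping in mind when comparing throughput against the non-TorchAcc `swift_lora_sft.sh` baselines, which use much smaller per-process batch sizes.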
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/acc_lora_fsdp_sft.sh (35 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --model_layer_cls_name BaichuanLayer \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --fsdp_num 2 \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/baichuan2_13b_chat/swift_lora_sft.sh (28 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path baichuan-inc/Baichuan2-13B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 2 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_dp_sft.sh (36 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc dp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/acc_lora_fsdp_sft.sh (36 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
# torchacc fsdp
export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --model_layer_cls_name GLMBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --fsdp_num 2 \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/chatglm3_6b/swift_lora_sft.sh (27 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
# MASTER_PORT=12356 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
swift sft \
    --model_id_or_path ZhipuAI/chatglm3-6b \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_dp_sft.sh (36 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/acc_lora_fsdp_sft.sh (37 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.
export USE_TORCHACC=1
export TORCHACC_TRIM_GRAPH=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
MASTER_PORT=27829 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --model_layer_cls_name LlamaDecoderLayer \
    --dataset codefuse-python-en \
    --template_type llama \
    --sft_type lora \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 24 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --fsdp_num 2 \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/llama2_13b_chat/swift_lora_sft.sh (28 additions, 0 deletions)

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=2,3 \
swift sft \
    --model_id_or_path modelscope/Llama-2-13b-chat-ms \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 16 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --report_to 'none'
```
examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/acc_lora_fsdp_sft.sh (32 additions, 0 deletions)

Note: the original diff ended with `--fsdp_num 4 \`, leaving a dangling line continuation; the flag is moved before `--report_to` here so the command terminates cleanly.

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

export USE_TORCHACC=1
export XLA_FLAGS='--xla_gpu_force_compilation_parallelism=32 --xla_multiheap_size_constraint_per_heap=4831838208 --xla_disable_hlo_passes=all-gather-combiner,all-reduce-combiner,reduce-scatter-combiner,gpu-convert-async-collectives-to-sync,rematerialization'
export XLA_IR_SHAPE_CACHE_SIZE=100000000
export XLA_ALLOCATOR_FRACTION=0.95
export XLA_EXPERIMENTAL=nonzero:masked_select

NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_type qwen-72b-chat \
    --model_layer_cls_name QWenBlock \
    --dataset codefuse-python-en \
    --sft_type lora \
    --output_dir output_qwen_72b \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 4 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --gradient_checkpointing no \
    --tuner_backend 'peft' \
    --eval_steps 200 \
    --save_steps 200 \
    --logging_steps 100 \
    --metric_warmup_step 0.1 \
    --use_profiler false \
    --fsdp_num 4 \
    --report_to 'none'
```
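In this launch, `--fsdp_num 4` matches `NPROC_PER_NODE=4`, so all four processes form a single FSDP shard group. Assuming the shard count has to divide the number of launched processes evenly (a plausible constraint for this kind of FSDP setup, not something the PR states), a launcher could sanity-check the pairing up front:

```shell
# Illustrative sanity check (assumption: fsdp_num must evenly divide
# the number of launched processes for the shard groups to be uniform).
NPROC_PER_NODE=4
FSDP_NUM=4

if [ $((NPROC_PER_NODE % FSDP_NUM)) -eq 0 ]; then
  echo "ok: ${FSDP_NUM}-way FSDP across ${NPROC_PER_NODE} processes"
else
  echo "error: fsdp_num ${FSDP_NUM} does not divide ${NPROC_PER_NODE} processes" >&2
  exit 1
fi
```

With `FSDP_NUM=4` and `NPROC_PER_NODE=4` the check passes and the script prints the "ok" line.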
examples/pytorch/llm/scripts/torchacc/qwen_72b_chat/swift_lora_sft.sh (27 additions, 0 deletions)

Note: the original diff ended with `--use_profiler false \`, a dangling line continuation; the trailing backslash is dropped here so the command terminates cleanly.

```shell
# Experimental environment: 4 * A800
# 80GB GPU memory
# Note: TorchAcc is currently only available internally.

# MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=2 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
swift sft \
    --model_id_or_path qwen/Qwen-72B-Chat \
    --dataset codefuse-python-en \
    --sft_type lora \
    --dtype AUTO \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
    --batch_size 1 \
    --use_flash_attn true \
    --gradient_accumulation_steps 1 \
    --dataset_test_ratio 0 \
    --save_strategy no \
    --eval_steps 2000000 \
    --save_steps 2000000 \
    --logging_steps 100 \
    --preprocess_num_proc 1 \
    --metric_warmup_step 0.1 \
    --use_profiler false
```