-
Notifications
You must be signed in to change notification settings - Fork 903
Description
# Start the vLLM rollout server for the GRPO run, pinned to GPU 1.
# --port is given explicitly as 8001 so it matches the --vllm_server_port the
# trainer connects to; without it the server listens on swift's default port
# and the trainer cannot reach it.
# This command is expected to stay in the foreground: run it in its own
# terminal (or background it) before launching training.
# NOTE(review): newer ms-swift releases spell the vLLM options
# --vllm_max_model_len / --vllm_gpu_memory_utilization — confirm against the
# installed version.
CUDA_VISIBLE_DEVICES=1 swift rollout \
  --model /mnt/data/data/sft_out/v11-20251015-090721/checkpoint-186/ \
  --model_type qwen3_vl \
  --port 8001 \
  --max-model-len 43000 \
  --gpu_memory_utilization 0.90
# Environment for the training run: four visible devices (2-5) and a matching
# process count of four.
export CUDA_VISIBLE_DEVICES=2,3,4,5 NPROC_PER_NODE=4
# Video input limits picked up from the environment by the training command
# (presumably a per-frame pixel budget and a frame-count cap — confirm against
# the ms-swift documentation).
export VIDEO_MAX_PIXELS=42000 FPS_MAX_FRAMES=32
# GRPO training of the SFT checkpoint, generating completions through an
# external vLLM server (--vllm_mode server) at 127.0.0.1:8001. A rollout
# server is expected to be listening on that host/port before this starts.
#
# Every option line ends with a backslash so the whole thing is ONE command;
# without the continuations the shell runs `swift rlhf` with no arguments and
# then tries to execute each `--flag` line as a separate command.
#
# Relative paths (--external_plugins, --system) are resolved from the current
# working directory.
swift rlhf \
  --rlhf_type grpo \
  --model /mnt/data/data/sft_out/v11-20251015-090721/checkpoint-186 \
  --model_type qwen3_vl \
  --custom_register_path /home/ubuntu/ms-swift-main1_extracted/2.py \
  --external_plugins examples/train/grpo/plugin/plugin.py \
  --dataset rldata \
  --use_vllm true \
  --vllm_mode server \
  --vllm_server_host 127.0.0.1 \
  --vllm_server_port 8001 \
  --reward_funcs r1_accuracy r1_rouge r1_format r1_strategy \
  --reward_weights 1.0 0.5 0.1 0.5 \
  --torch_dtype bfloat16 \
  --train_type full \
  --offload_optimizer true \
  --offload_model true \
  --load_from_cache_file true \
  --max_completion_length 2048 \
  --num_train_epochs 1 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --learning_rate 1e-6 \
  --gradient_accumulation_steps 1 \
  --save_strategy 'steps' \
  --eval_strategy 'steps' \
  --eval_steps 400 \
  --save_steps 50 \
  --save_total_limit 10 \
  --logging_steps 1 \
  --output_dir /mnt/data/data/ \
  --warmup_ratio 0.05 \
  --dataloader_num_workers 4 \
  --num_generations 4 \
  --temperature 1.0 \
  --repetition_penalty 1.1 \
  --system 'examples/train/grpo/prompt.txt' \
  --deepspeed zero3 \
  --log_completions true \
  --report_to wandb \
  --num_iterations 2 \
  --async_generate false \
  --beta 0.001 \
  --max_grad_norm 0.5