-
Notifications
You must be signed in to change notification settings - Fork 903
Open
Description
@Jintao-Huang 麻烦大佬方便时帮忙看看下面这个训练报错:
使用 2×8 卡对 Qwen3-Omni 进行 LoRA 训练,命令如下:
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NNODES=${NNODES} \
NODE_RANK=${NODE_RANK} \
MASTER_ADDR=${MASTER_ADDR} \
MASTER_PORT=${MASTER_PORT} \
NPROC_PER_NODE=${NPROC_PER_NODE} \
MAX_PIXELS=1003520 \
VIDEO_MAX_PIXELS=50176 \
FPS_MAX_FRAMES=12 \
megatron sft \
--load /mnt/bn/wjp-lq/Qwen3-Omni-30B-A3B-Instruct-mcore \
--dataset 'speech_asr/speech_asr_aishell1_trainsets:validation#5000' \
--train_type lora \
--lora_rank 8 \
--lora_alpha 16 \
--target_modules all-linear \
--sequence_parallel true \
--packing true \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--split_dataset_ratio 0.01 \
--expert_model_parallel_size 2 \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 16 \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--max_epochs 1 \
--save megatron_output/debug \
--eval_interval 100 \
--save_interval 100 \
--vit_gradient_checkpointing true \
--max_length 4096 \
--num_workers 8 \
--dataset_num_proc 32 \
--no_save_optim true \
--no_save_rng true \
--attention_backend flash
运行后在加载 checkpoint 阶段(load_checkpoint → set_extra_state)报了下面的错误:
Loading: 100%|██████████| 16864/16864 [00:35<00:00, 470.74it/s]
[rank6]: Traceback (most recent call last):
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/training/checkpointing.py", line 1449, in load_model_state_dict
[rank6]: module.load_state_dict(state_dict, strict=strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/core/distributed/data_parallel_base.py", line 96, in load_state_dict
[rank6]: self.module.load_state_dict(state_dict, strict=strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/core/transformer/module.py", line 258, in load_state_dict
[rank6]: self.module.load_state_dict(state_dict, strict=strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/trainers/base.py", line 235, in load_state_dict
[rank6]: return origin_load_state_dict(self, state_dict, strict, *args, **kwargs)
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2561, in load_state_dict
[rank6]: load(self, state_dict)
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: [Previous line repeated 3 more times]
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2532, in load
[rank6]: module._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/attention.py", line 6997, in _load_from_state_dict
[rank6]: super()._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/module/base.py", line 1063, in _load_from_state_dict
[rank6]: super()._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2456, in _load_from_state_dict
[rank6]: self.set_extra_state(state_dict[extra_state_key])
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/module/base.py", line 670, in set_extra_state
[rank6]: state = pickle.loads(state.detach().cpu().numpy().tobytes())
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: EOFError: Ran out of input
[rank6]: During handling of the above exception, another exception occurred:
[rank6]: Traceback (most recent call last):
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/cli/_megatron/sft.py", line 5, in <module>
[rank6]: megatron_sft_main()
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/train/sft.py", line 79, in megatron_sft_main
[rank6]: return MegatronSft(args).main()
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/llm/base.py", line 49, in main
[rank6]: result = self.run()
[rank6]: ^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/train/sft.py", line 69, in run
[rank6]: self.trainer.train(train_dataset, val_dataset, data_collator)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/trainers/base.py", line 773, in train
[rank6]: pretrain(
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/training/training.py", line 806, in pretrain
[rank6]: model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/trainers/base.py", line 262, in setup_model_and_optimizer
[rank6]: model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/training/training.py", line 1296, in setup_model_and_optimizer
[rank6]: args.iteration, args.num_floating_point_operations_so_far = load_checkpoint(
[rank6]: ^^^^^^^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/training/checkpointing.py", line 1459, in load_checkpoint
[rank6]: load_model_state_dict(ddp_model[0], state_dict['model'], strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/training/checkpointing.py", line 1453, in load_model_state_dict
[rank6]: load_return = module.load_state_dict(state_dict, strict=False)
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/core/distributed/data_parallel_base.py", line 96, in load_state_dict
[rank6]: self.module.load_state_dict(state_dict, strict=strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/Megatron-LM/megatron/core/transformer/module.py", line 258, in load_state_dict
[rank6]: self.module.load_state_dict(state_dict, strict=strict)
[rank6]: File "/mnt/bn/wjp-lq/qwen3omni/ms-swift/swift/megatron/trainers/base.py", line 235, in load_state_dict
[rank6]: return origin_load_state_dict(self, state_dict, strict, *args, **kwargs)
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2561, in load_state_dict
[rank6]: load(self, state_dict)
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2549, in load
[rank6]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: [Previous line repeated 3 more times]
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2532, in load
[rank6]: module._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/attention.py", line 6997, in _load_from_state_dict
[rank6]: super()._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/module/base.py", line 1063, in _load_from_state_dict
[rank6]: super()._load_from_state_dict(
[rank6]: File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2456, in _load_from_state_dict
[rank6]: self.set_extra_state(state_dict[extra_state_key])
[rank6]: File "/usr/local/lib/python3.11/dist-packages/transformer_engine/pytorch/module/base.py", line 670, in set_extra_state
[rank6]: state = pickle.loads(state.detach().cpu().numpy().tobytes())
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank6]: EOFError: Ran out of input
Metadata
Metadata
Assignees
Labels
No labels