
Error when using deepspeed: RuntimeError: torch.cat(): expected a non-empty list of Tensors #22

Open
tjulh opened this issue Jun 25, 2023 · 2 comments


tjulh commented Jun 25, 2023

Traceback (most recent call last):
  /home/lk/moss_finetuning-dev/train.py:107
      trainer.fit(pl_model, train_dataloaders=train_datasets)
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py:531 in fit
      call._call_and_handle_interrupt(
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py:41 in _call_and_handle_interrupt
      return trainer.strategy.launcher.launch(trainer_fn, *args, ...
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py:91 in launch
      return function(*args, **kwargs)
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py:570 in _fit_impl
      self._run(model, ckpt_path=ckpt_path)
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py:951 in _run
      self.strategy.setup(self)
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/deepspeed.py:345 in setup
      self.init_deepspeed()
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/deepspeed.py:449 in init_deepspeed
      self._initialize_deepspeed_train(model)
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/deepspeed.py:485 in _initialize_deepspeed_train
      model, deepspeed_optimizer = self._setup_model_and_optimizer(m...
  /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/deepspeed.py:414 in _setup_model_and_optimizer
      deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initia...
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/__init__.py:165 in initialize
      engine = DeepSpeedEngine(args=args,
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:308 in __init__
      self._configure_optimizer(optimizer, model_parameters)
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1173 in _configure_optimizer
      self.optimizer = self._configure_zero_optimizer(basic_opt...
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1408 in _configure_zero_optimizer
      optimizer = DeepSpeedZeroOptimizer(
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:313 in __init__
      self.flatten_dense_tensors_aligned(
  /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py:830 in flatten_dense_tensors_aligned
      return self.flatten(align_dense_tensors(tensor_list, alignmen...

RuntimeError: torch.cat(): expected a non-empty list of Tensors

Could the author please take a look at what might be causing this?
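For context (my note, not from the thread): the traceback ends inside DeepSpeed's ZeRO stage 1/2 optimizer while it flattens a parameter group, so the error usually means an empty list of trainable tensors reached deepspeed.initialize(). A minimal pre-flight check, assuming the pl_model built in train.py (name taken from the traceback), might look like this:

```python
# Hypothetical sanity check before calling trainer.fit(); `pl_model` is assumed
# to be the LightningModule constructed in train.py.
trainable = [name for name, p in pl_model.named_parameters() if p.requires_grad]
print(f"trainable parameter tensors: {len(trainable)}")

if not trainable:
    # An empty list here is exactly what makes DeepSpeedZeroOptimizer fail:
    # there is nothing for it to flatten, so torch.cat() gets an empty list.
    raise RuntimeError("No parameters have requires_grad=True; "
                       "check the freezing / LoRA settings before enabling deepspeed.")
```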

ssbuild (Owner) commented Jun 25, 2023

Just switch to a different optimizer and it will work.
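One way to try this suggestion (a sketch under my own assumptions, not this repo's exact config format) is to let DeepSpeed create a compatible optimizer itself through the strategy config instead of wrapping whatever configure_optimizers() returns; the AdamW settings below are placeholder values, and the module's own configure_optimizers() may need adjusting so the two definitions do not conflict:

```python
from lightning.pytorch.strategies import DeepSpeedStrategy

# Hypothetical DeepSpeed config: DeepSpeed builds its own AdamW for ZeRO stage 2.
# All numeric values are examples only.
ds_config = {
    "zero_optimization": {"stage": 2},
    "optimizer": {
        "type": "AdamW",
        "params": {"lr": 2e-5, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 0.01},
    },
    "train_micro_batch_size_per_gpu": 1,
}

strategy = DeepSpeedStrategy(config=ds_config)
# trainer = Trainer(strategy=strategy, ...)  # then call trainer.fit(...) as before
```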

tjulh (Author) commented Jun 26, 2023

> Just switch to a different optimizer and it will work.

I tried lion, adam, and adamw_torch, and all of them raise the same error... Is there something particular about the optimizer choice here?
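For what it's worth (my note, not from the thread): changing the optimizer class alone would not help if one of the optimizer's parameter groups ends up empty, because ZeRO stage 1/2 flattens each group separately and an empty group still hits torch.cat() on an empty list. A hedged illustration, reusing the assumed pl_model from above, of how a decay / no-decay split can produce such a group and how to guard against it:

```python
# Hypothetical illustration: build parameter groups and drop any empty one
# before handing them to the optimizer, since a single empty group is enough
# to reproduce the torch.cat() error inside DeepSpeedZeroOptimizer.
decay, no_decay = [], []
for name, param in pl_model.named_parameters():
    if not param.requires_grad:
        continue
    (no_decay if name.endswith("bias") else decay).append(param)

param_groups = [
    {"params": decay, "weight_decay": 0.01},
    {"params": no_decay, "weight_decay": 0.0},
]
param_groups = [g for g in param_groups if len(g["params"]) > 0]  # avoid empty groups
print([len(g["params"]) for g in param_groups])
```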
