You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
As shown in the figure above, I want to use multiple GPUs to run my job, but it fails with the following error:
Traceback (most recent call last):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 177, in _call_for_each_tower
**merge_kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 193, in _init_from_checkpoint
ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 280, in _get_checkpoint_filename
if gfile.IsDirectory(ckpt_dir_or_file):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py", line 467, in is_directory
return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got PerDevice({'/replica:0/task:0/device:GPU:0': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt', '/replica:0/task:0/device:GPU:1': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt'})
Traceback (most recent call last):
File "src/fit.py", line 179, in <module>
tf.app.run()
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "src/fit.py", line 168, in main
train()
File "src/fit.py", line 24, in train
app.run_train(reader=train_reader)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/easytransfer/engines/model.py", line 616, in run_train
max_steps=self.train_steps)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1205, in _train_model
return self._train_model_distributed(input_fn, hooks, saving_listeners)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1316, in _train_model_distributed
self.config)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/distribute.py", line 721, in call_for_each_tower
return self._call_for_each_tower(fn, *args, **kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 556, in _call_for_each_tower
return _call_for_each_tower(self, fn, *args, **kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 183, in _call_for_each_tower
coord.join(threads)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/six.py", line 693, in reraise
raise value
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 177, in _call_for_each_tower
**merge_kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 193, in _init_from_checkpoint
ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 280, in _get_checkpoint_filename
if gfile.IsDirectory(ckpt_dir_or_file):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py", line 467, in is_directory
return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got PerDevice({'/replica:0/task:0/device:GPU:0': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt', '/replica:0/task:0/device:GPU:1': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt'})
The text was updated successfully, but these errors were encountered:
As shown in the figure above, I want to use multiple GPUs to run my job, but it fails with the following error:
Traceback (most recent call last):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 177, in _call_for_each_tower
**merge_kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 193, in _init_from_checkpoint
ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 280, in _get_checkpoint_filename
if gfile.IsDirectory(ckpt_dir_or_file):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py", line 467, in is_directory
return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got PerDevice({'/replica:0/task:0/device:GPU:0': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt', '/replica:0/task:0/device:GPU:1': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt'})
Traceback (most recent call last):
File "src/fit.py", line 179, in <module>
tf.app.run()
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "src/fit.py", line 168, in main
train()
File "src/fit.py", line 24, in train
app.run_train(reader=train_reader)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/easytransfer/engines/model.py", line 616, in run_train
max_steps=self.train_steps)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1205, in _train_model
return self._train_model_distributed(input_fn, hooks, saving_listeners)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1316, in _train_model_distributed
self.config)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/distribute.py", line 721, in call_for_each_tower
return self._call_for_each_tower(fn, *args, **kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 556, in _call_for_each_tower
return _call_for_each_tower(self, fn, *args, **kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 183, in _call_for_each_tower
coord.join(threads)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/six.py", line 693, in reraise
raise value
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 177, in _call_for_each_tower
**merge_kwargs)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 193, in _init_from_checkpoint
ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/training/checkpoint_utils.py", line 280, in _get_checkpoint_filename
if gfile.IsDirectory(ckpt_dir_or_file):
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/lib/io/file_io.py", line 467, in is_directory
return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
File "/data/yangxiaohan/tool/python3.6/lib/python3.6/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got PerDevice({'/replica:0/task:0/device:GPU:0': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt', '/replica:0/task:0/device:GPU:1': '/data/yangxiaohan/.eztransfer_modelzoo/bert/google-bert-base-zh/model.ckpt'})
The text was updated successfully, but these errors were encountered: