Skip to content

Commit 66f66f4

Browse files
committed
add unit test + fix unit tests
Signed-off-by: Timothy Seah <[email protected]>
1 parent 7fa6c0d commit 66f66f4

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

python/ray/train/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,6 @@ def _v2_migration_warnings_enabled() -> bool:
144144
RAY_TRAIN_ENABLE_STATE_TRACKING,
145145
TUNE_ONLY_STORE_CHECKPOINT_SCORE_ATTRIBUTE,
146146
TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
147-
DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
148147
}
149148

150149
# Key for AIR Checkpoint metadata in TrainingResult metadata

python/ray/train/tests/test_backend.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from ray.train.constants import (
2929
ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
3030
ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV,
31+
TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S,
3132
TRAIN_ENABLE_WORKER_SPREAD_ENV,
3233
)
3334
from ray.train.torch import TorchConfig
@@ -364,6 +365,24 @@ def check_process_group():
364365
assert not any(e.finish_training())
365366

366367

368+
@pytest.mark.parametrize(
    "init_method, timeout_s", [("env", 5), ("tcp", 5), ("env", 0), ("tcp", 0)]
)
def test_torch_process_group_shutdown_timeout(
    ray_start_2_cpus, monkeypatch, init_method, timeout_s
):
    """Check that backend shutdown does not raise for any configured timeout.

    The test sets the ``TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S`` environment
    variable, runs a trivial training function on two gloo workers, and then
    calls the backend's ``on_shutdown`` hook directly.

    Args:
        ray_start_2_cpus: Fixture; presumably provides a 2-CPU Ray cluster
            (defined outside this view).
        monkeypatch: pytest fixture used to set the environment variable for
            the duration of the test.
        init_method: Process-group init method passed to ``TorchConfig``
            (``"env"`` or ``"tcp"``).
        timeout_s: Value exported as the shutdown timeout; ``0`` presumably
            forces the timeout path -- TODO confirm against the backend.
    """
    # Environment variable values must be strings. Passing the raw int makes
    # pytest emit a PytestWarning and convert implicitly, which turns into a
    # failure when warnings are treated as errors.
    monkeypatch.setenv(TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S, str(timeout_s))
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    _start_training(e, lambda: 1)
    assert e.finish_training() == [1, 1]

    # Verify that we do not raise an exception even if we time out
    e._backend.on_shutdown(e.worker_group, e._backend_config)
384+
385+
367386
@pytest.mark.parametrize(
368387
"worker_results",
369388
[

0 commit comments

Comments
 (0)