diff --git a/batchspawner/batchspawner.py b/batchspawner/batchspawner.py
index 7c2836c..4e81228 100644
--- a/batchspawner/batchspawner.py
+++ b/batchspawner/batchspawner.py
@@ -448,6 +448,13 @@ async def start(self):
# don't actually run the single-user server yet.
if hasattr(self, "mock_port"):
self.port = self.mock_port
+ # Check if job is still running
+ status = await self.poll()
+ if status:
+ raise RuntimeError(
+ "The Jupyter batch job started"
+ " but died before launching the single-user server."
+ )
self.db.commit()
self.log.info(
diff --git a/batchspawner/tests/test_spawners.py b/batchspawner/tests/test_spawners.py
index bd378c5..e5d43c0 100644
--- a/batchspawner/tests/test_spawners.py
+++ b/batchspawner/tests/test_spawners.py
@@ -349,20 +349,19 @@ async def test_torque(db, event_loop):
re.compile(r"ppn=5"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
+ poll_running = (
+ re.compile(r"sudo.*qstat"),
+ f"R{testhost}/1",
+ )
script = [
(re.compile(r"sudo.*qsub"), str(testjob)),
(
re.compile(r"sudo.*qstat"),
"Q",
), # pending
- (
- re.compile(r"sudo.*qstat"),
- f"R{testhost}/1",
- ), # running
- (
- re.compile(r"sudo.*qstat"),
- f"R{testhost}/1",
- ), # running
+ poll_running,
+ poll_running,
+ poll_running,
(re.compile(r"sudo.*qdel"), "STOP"),
(re.compile(r"sudo.*qstat"), ""),
]
@@ -394,17 +393,16 @@ async def test_moab(db, event_loop):
re.compile(r"ppn=5"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
+ poll_running = (
+ re.compile(r"sudo.*mdiag"),
+ f'State="Running" AllocNodeList="{testhost}"',
+ )
script = [
(re.compile(r"sudo.*msub"), str(testjob)),
(re.compile(r"sudo.*mdiag"), 'State="Idle"'), # pending
- (
- re.compile(r"sudo.*mdiag"),
- f'State="Running" AllocNodeList="{testhost}"',
- ), # running
- (
- re.compile(r"sudo.*mdiag"),
- f'State="Running" AllocNodeList="{testhost}"',
- ), # running
+ poll_running,
+ poll_running,
+ poll_running,
(re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
(re.compile(r"sudo.*mdiag"), ""),
]
@@ -436,17 +434,16 @@ async def test_pbs(db, event_loop):
re.compile(r"@some_pbs_admin_node"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
+ poll_running = (
+ re.compile(r"sudo.*qstat"),
+ f"job_state = R\nexec_host = {testhost}/2*1",
+ )
script = [
(re.compile(r"sudo.*qsub"), str(testjob)),
(re.compile(r"sudo.*qstat"), "job_state = Q"), # pending
- (
- re.compile(r"sudo.*qstat"),
- f"job_state = R\nexec_host = {testhost}/2*1",
- ), # running
- (
- re.compile(r"sudo.*qstat"),
- f"job_state = R\nexec_host = {testhost}/2*1",
- ), # running
+ poll_running,
+ poll_running,
+ poll_running,
(re.compile(r"sudo.*qdel"), "STOP"),
(re.compile(r"sudo.*qstat"), ""),
]
@@ -504,6 +501,7 @@ async def test_slurm(db, event_loop):
), # unknown
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost), # running
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
+ (re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
(re.compile(r"sudo.*scancel"), "STOP"),
(re.compile(r"sudo.*squeue"), ""),
]
@@ -573,6 +571,7 @@ async def test_condor(db, event_loop):
(re.compile(r"sudo.*condor_q"), "1,"), # pending
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"), # runing
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
+ (re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
(re.compile(r"sudo.*condor_rm"), "STOP"),
(re.compile(r"sudo.*condor_q"), ""),
]
@@ -611,6 +610,7 @@ async def test_lfs(db, event_loop):
(re.compile(r"sudo.*bjobs"), "PEND "), # pending
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"), # running
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
+ (re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
(re.compile(r"sudo.*bkill"), "STOP"),
(re.compile(r"sudo.*bjobs"), ""),
]
@@ -652,3 +652,19 @@ async def test_keepvars(db, event_loop):
spawner_kwargs=spawner_kwargs,
batch_script_re_list=batch_script_re_list,
)
+
+
+async def test_early_stop(db, event_loop):
+    """Spawning must fail with RuntimeError when the job exits early during start."""
+    squeue = re.compile(r"sudo.*squeue")
+    batch_replies = [
+        (re.compile(r"sudo.*sbatch"), str(testjob)),
+        (squeue, "PENDING "),  # pending
+        # controller unreachable, so the job state is unknown
+        (squeue, "slurm_load_jobs error: Unable to contact slurm controller"),
+        # empty squeue output: the job exits early during start
+        (squeue, ""),
+        (re.compile(r"sudo.*scancel"), "STOP"),
+    ]
+    with pytest.raises(RuntimeError, match="job has disappeared"):
+        await run_spawner_script(db, SlurmSpawner, batch_replies)