Skip to content

Commit 44abe60

Browse files
jorgeepditommasochristopher-hakkaartbentsherman
authored
Manage AWS Batch Unscheduled jobs (#5936)
--------- Signed-off-by: jorgee <[email protected]> Signed-off-by: Paolo Di Tommaso <[email protected]> Signed-off-by: Ben Sherman <[email protected]> Co-authored-by: Paolo Di Tommaso <[email protected]> Co-authored-by: Chris Hakkaart <[email protected]> Co-authored-by: Ben Sherman <[email protected]>
1 parent e3b8ca4 commit 44abe60

File tree

6 files changed

+72
-1
lines changed

6 files changed

+72
-1
lines changed

docs/reference/config.md

+5
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,11 @@ The following settings are available:
180180
:::
181181
: The share identifier for all tasks when using [fair-share scheduling for AWS Batch](https://aws.amazon.com/blogs/hpc/introducing-fair-share-scheduling-for-aws-batch/)
182182

183+
`aws.batch.terminateUnschedulableJobs`
184+
: :::{versionadded} 25.03.0-edge
185+
:::
186+
: When `true`, jobs that cannot be scheduled for lack of resources or misconfiguration are terminated automatically (default: `false`). The pipeline may complete with an error status depending on the error strategy defined for the corresponding jobs.
187+
183188
`aws.batch.volumes`
184189
: One or more container mounts. Mounts can be specified as simple e.g. `/some/path` or canonical format e.g. `/host/path:/mount/path[:ro|rw]`. Multiple mounts can be specified separating them with a comma or using a list object.
185190

modules/nf-lang/src/main/java/nextflow/config/scopes/AwsBatchConfig.java

+6
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ Max number of downloads attempts from S3 (default: `1`).
104104
""")
105105
public String shareIdentifier;
106106

107+
@ConfigOption
108+
@Description("""
109+
When true, jobs that cannot be scheduled for lack of resources or misconfiguration are terminated automatically (default: `false`).
110+
""")
111+
public boolean terminateUnschedulableJobs;
112+
107113
@ConfigOption
108114
@Description("""
109115
One or more container mounts. Mounts can be specified as simple e.g. `/some/path` or canonical format e.g. `/host/path:/mount/path[:ro|rw]`.

plugins/nf-amazon/src/main/nextflow/cloud/aws/batch/AwsBatchTaskHandler.groovy

+37
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
116116

117117
final static private Map<String,String> jobDefinitions = [:]
118118

119+
final static private List<String> MISCONFIGURATION_REASONS = List.of(
120+
"MISCONFIGURATION:JOB_RESOURCE_REQUIREMENT",
121+
"MISCONFIGURATION:COMPUTE_ENVIRONMENT_MAX_RESOURCE"
122+
)
123+
119124
/**
120125
* Batch context shared between multiple task handlers
121126
*/
@@ -232,12 +237,40 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
232237
final result = job?.status in ['RUNNING', 'SUCCEEDED', 'FAILED']
233238
if( result )
234239
this.status = TaskStatus.RUNNING
240+
else
241+
checkIfUnschedulable(job)
235242
// fetch the task arn
236243
if( !taskArn )
237244
taskArn = job?.getContainer()?.getTaskArn()
238245
return result
239246
}
240247

248+
protected void checkIfUnschedulable(JobDetail job) {
249+
if( job ) try {
250+
checkIfUnschedulable0(job)
251+
}
252+
catch (Throwable e) {
253+
log.warn "Unable to check if job is unschedulable - ${e.message}", e
254+
}
255+
}
256+
257+
private void checkIfUnschedulable0(JobDetail job) {
258+
final reason = errReason(job)
259+
if( MISCONFIGURATION_REASONS.any((it) -> reason.contains(it)) ) {
260+
final msg = "unschedulable AWS Batch job ${jobId} (${task.lazyName()}) - $reason"
261+
// If indicated in aws.batch config kill the job an produce a failure
262+
if( executor.awsOptions.terminateUnschedulableJobs() ){
263+
log.warn("Terminating ${jobId}")
264+
kill()
265+
task.error = new ProcessException("Unschedulable AWS Batch job ${jobId} - $reason")
266+
status = TaskStatus.COMPLETED
267+
}
268+
else {
269+
log.warn "Detected $msg"
270+
}
271+
}
272+
}
273+
241274
protected String errReason(JobDetail job){
242275
if(!job)
243276
return "(unknown)"
@@ -256,6 +289,10 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
256289
@Override
257290
boolean checkIfCompleted() {
258291
assert jobId
292+
if( isCompleted() ) {
293+
//Task can be marked as completed before running by unschedulable reason. Return true
294+
return true
295+
}
259296
if( !isRunning() )
260297
return false
261298
final job = describeJob(jobId)

plugins/nf-amazon/src/main/nextflow/cloud/aws/batch/AwsOptions.groovy

+4
Original file line numberDiff line numberDiff line change
@@ -161,4 +161,8 @@ class AwsOptions implements CloudTransferOptions {
161161
return awsConfig.batchConfig.getExecutionRole()
162162
}
163163

164+
boolean terminateUnschedulableJobs() {
165+
return awsConfig.batchConfig.terminateUnschedulableJobs
166+
}
167+
164168
}

plugins/nf-amazon/src/main/nextflow/cloud/aws/config/AwsBatchConfig.groovy

+6-1
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ class AwsBatchConfig implements CloudTransferOptions {
9292
*/
9393
boolean fargateMode
9494

95+
/**
96+
* Flag to fail and terminate unscheduled jobs.
97+
*/
98+
boolean terminateUnschedulableJobs
99+
95100
/*
96101
* only for testing
97102
*/
@@ -112,11 +117,11 @@ class AwsBatchConfig implements CloudTransferOptions {
112117
shareIdentifier = opts.shareIdentifier
113118
schedulingPriority = opts.schedulingPriority as Integer ?: 0
114119
executionRole = opts.executionRole
120+
terminateUnschedulableJobs = opts.terminateUnschedulableJobs as boolean
115121
if( retryMode == 'built-in' )
116122
retryMode = null // this force falling back on NF built-in retry mode instead of delegating to AWS CLI tool
117123
if( retryMode && retryMode !in AwsOptions.VALID_RETRY_MODES )
118124
log.warn "Unexpected value for 'aws.batch.retryMode' config setting - offending value: $retryMode - valid values: ${AwsOptions.VALID_RETRY_MODES.join(',')}"
119-
120125
}
121126

122127
// ==== getters =====

plugins/nf-amazon/src/test/nextflow/cloud/aws/config/AwsBatchConfigTest.groovy

+14
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class AwsBatchConfigTest extends Specification {
4646
!batch.isFargateMode()
4747
!batch.s5cmdPath
4848
batch.schedulingPriority == 0
49+
!batch.terminateUnschedulableJobs
4950
}
5051

5152
def 'should create config with options' () {
@@ -139,4 +140,17 @@ class AwsBatchConfigTest extends Specification {
139140
[platformType: 'fargate', cliPath: "/opt/s5cmd --foo"] | null | '/opt/s5cmd --foo'| true
140141
}
141142

143+
def 'should parse unschedulable flag' () {
144+
given:
145+
def opts = new AwsBatchConfig(OPTS)
146+
147+
expect:
148+
opts.terminateUnschedulableJobs == UNSCHEDULABLE
149+
150+
where:
151+
OPTS | UNSCHEDULABLE
152+
[:] | false
153+
[terminateUnschedulableJobs: false] | false
154+
[terminateUnschedulableJobs: true] | true
155+
}
142156
}

0 commit comments

Comments
 (0)