Skip to content
Merged
Changes from 11 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
b58dc97
feat: added the job resource 'gres' to be considered
cmeesters Nov 25, 2024
4f9035f
feat: added regex based validator for gres
cmeesters Nov 25, 2024
350577c
feat: supporting selection from gpu:int and gpu_module, too
cmeesters Jan 2, 2025
71eb4fe
feat: providing better gpu model validation and selection
cmeesters Jan 2, 2025
1a5c1b1
fix: bug associated with not reading my own last commit
cmeesters Jan 2, 2025
8d22386
fix: satisfying the linter
cmeesters Jan 3, 2025
97aca06
fix: resolved merge conflict
cmeesters Jan 8, 2025
00ab13c
feat: using , if gpu resources are selected
cmeesters Jan 8, 2025
cb94eb9
feat: auto distinction between ntasks and ntasks_per_gpu
cmeesters Jan 10, 2025
3f36f86
feat: first half of auto selection between cpus-per-task and cpus-per…
cmeesters Jan 10, 2025
2b50601
fix: missing colon
cmeesters Jan 10, 2025
0e18736
feat: moved gres/gpu support to utils file - else the executor gets u…
cmeesters Jan 13, 2025
8914b0f
fix: added missing WorkflowError import
cmeesters Jan 13, 2025
3bcbe05
fix: removed duplicate check
cmeesters Jan 13, 2025
df62a6f
docs: added docs for the new features
cmeesters Jan 13, 2025
1610f76
Merge branch 'main' into feat/gres
cmeesters Feb 10, 2025
e44bcff
Merge branch 'main' into feat/gres
cmeesters Feb 10, 2025
9f2427c
Update docs/further.md
cmeesters Feb 14, 2025
7ebf5a7
fix: resource is 'gpus' not 'gpu'
cmeesters Feb 18, 2025
2bf4fb1
feat: adding information about model selection, gpu selection and adj…
cmeesters Feb 18, 2025
5db9967
fix: merge conflict
cmeesters Feb 18, 2025
bc87815
Merge branch 'main' into feat/gres
cmeesters Feb 18, 2025
f29641e
fix: solved lingering merge conflict ...
cmeesters Mar 5, 2025
7be34f1
gnarf
cmeesters Mar 5, 2025
0f3bd41
fix: merge conflict
cmeesters Mar 5, 2025
d38a8ef
fix: updated toml to account for the current jobstep executor plugin
cmeesters Mar 5, 2025
78252d7
fix: allowing both 'gpu' and 'gpus' resource strings
cmeesters Mar 6, 2025
bf9da9a
fix: added missing check for non-gpu-jobs
cmeesters Mar 6, 2025
3b9aa2a
Update snakemake_executor_plugin_slurm/utils.py
cmeesters Mar 6, 2025
17b28ea
Update snakemake_executor_plugin_slurm/utils.py
cmeesters Mar 6, 2025
32c8a21
fix: syntax error introduced by using coderabbitai
cmeesters Mar 6, 2025
261f94d
fix: line was too long
cmeesters Mar 6, 2025
69eb078
fix: formatting
cmeesters Mar 6, 2025
c63d34f
feat: added mock tests
cmeesters Mar 6, 2025
f89e7c5
fix: added missing import
cmeesters Mar 6, 2025
bcad610
fix: logic
cmeesters Mar 6, 2025
7110b8e
fix: added missing error case
cmeesters Mar 6, 2025
9a06f8b
fix: logic
cmeesters Mar 6, 2025
89f507d
fix: removed resource 'gpus' - only 'gpu' shall be supported within S…
cmeesters Mar 10, 2025
3aaefa3
fix: test string
cmeesters Mar 10, 2025
67d7805
Update docs/further.md
cmeesters Mar 10, 2025
0d7073b
fix: settled for resource 'gpu' and 'gpu_model', only
cmeesters Mar 11, 2025
7524293
fix: linting done
cmeesters Mar 11, 2025
0bab217
fix: assuming 'gpu' resource validation is in snakemake
cmeesters Mar 11, 2025
3877405
Update docs/further.md
johanneskoester Mar 11, 2025
09d6f2b
Update snakemake_executor_plugin_slurm/__init__.py
johanneskoester Mar 11, 2025
9d71beb
Update snakemake_executor_plugin_slurm/utils.py
johanneskoester Mar 11, 2025
ae126ca
fix: linting
cmeesters Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions snakemake_executor_plugin_slurm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ class ExecutorSettings(ExecutorSettingsBase):
# Required:
# Implementation of your executor
class Executor(RemoteExecutor):
gres_re = re.compile(r"^[a-zA-Z0-9_]+:([a-zA-Z0-9_]+:)?\d+$")
gpu_model_re = re.compile(r"^[a-zA-Z0-9_]+$")

def __post_init__(self):
# run check whether we are running in a SLURM job context
self.warn_on_jobcontext()
Expand Down Expand Up @@ -219,6 +222,45 @@ def run_job(self, job: JobExecutorInterface):
if self.workflow.executor_settings.requeue:
call += " --requeue"

if job.resources.get("gres"):
# Validate GRES format (e.g., "gpu:1", "gpu:tesla:2")
gres = job.resources.gres
if not Executor.gres_re.match(gres):
raise WorkflowError(
f"Invalid GRES format: {gres}. Expected format: "
"'<name>:<number>' or '<name>:<type>:<number>'"
)
gres_string = f" --gres={job.resources.gres}"
if job.resources.get("gpu"):
# ensure that gres is not set, if gpu and gpu_model are set
if job.resources.get("gres"):
raise WorkflowError(
"GRES and GPU are set. Please only set one of them."
)
# ensure that 'gpu' is an integer
if not isinstance(job.resources.gpu, int):
raise WorkflowError(
"The 'gpu' resource must be an integer. "
f"Got: {job.resources.gpu} ({type(job.resources.gpu)})."
)
gres_string = f" --gpus={job.resources.gpu}"
if job.resources.get("gpu_model") and job.resources.get("gpu"):
# validate GPU model format
if not Executor.gpu_model_re.match(job.resources.gpu_model):
raise WorkflowError(
f"Invalid GPU model format: {job.resources.gpu_model}. "
"Expected format: '<name>'"
)
gres_string = f" --gpus:{job.resources.gpu_model}:{job.resources.gpu}"
elif job.resources.get("gpu_model") and not job.resources.get("gpu"):
raise WorkflowError(
"GPU model is set, but no GPU number is given. "
"Please set 'gpu' as well."
)
call += (
gres_string if job.resources.get("gres") or job.resources.get("gpu") else ""
)

if job.resources.get("clusters"):
call += f" --clusters {job.resources.clusters}"

Expand Down Expand Up @@ -249,7 +291,11 @@ def run_job(self, job: JobExecutorInterface):

# fixes #40 - set ntasks regardless of mpi, because
# SLURM v22.05 will require it for all jobs
call += f" --ntasks={job.resources.get('tasks', 1)}"
gpu_job = job.resources.get("gpus") or "gpu" in job.resources.get("gres", "")
if gpu_job:
call += f" --ntasks-per-gpu={job.resources.get('tasks', 1)}"
else:
call += f" --ntasks={job.resources.get('tasks', 1)}"
# MPI job
if job.resources.get("mpi", False):
if not job.resources.get("tasks_per_node") and not job.resources.get(
Expand All @@ -260,8 +306,9 @@ def run_job(self, job: JobExecutorInterface):
"specified. Assuming 'tasks_per_node=1'."
"Probably not what you want."
)

call += f" --cpus-per-task={get_cpus_per_task(job)}"
# we need to set cpus-per-task OR cpus-per-gpu, the function
# will return a string with the corresponding value
call += f" {get_cpus_per_task(job, gpu_job)}"

if job.resources.get("slurm_extra"):
self.check_slurm_extra(job)
Expand Down
Loading