
Commit 8943aeb

Refactor and simplify resources configuration

1 parent 53912b0 · commit 8943aeb
28 files changed: +259 −197 lines

config/config.yaml
Lines changed: 25 additions & 45 deletions
@@ -123,6 +123,24 @@ data:
 # See later in the "params" category for the parameters of each tool.
 settings:
 
+  # Computational resources.
+  # Next to this `config.yaml` file, we provide a system-independent `resources.yaml`, which
+  # specifies all computational resources (time, memory, CPUs, etc) to use. This is mostly relevant
+  # in cluster environments (such as when using slurm to submit individual jobs), as those systems
+  # need to know in advance how much of each resource a job will need. However, we do not want to
+  # clutter this config file with all this information - this file is meant to describe
+  # the data and tool settings, but should not be concerned with "practical" aspects such as how to
+  # run them. So instead, these are specified in the `resources.yaml`.
+  # We search for this file in three places, in this order: First, in the path specified here.
+  # Second, in the working directory (where you copy this `config.yaml` file to as well, and which
+  # is provided to snakemake as `--directory`). Third, in the `config` directory within grenepipe,
+  # which is where the default file lives.
+  # We hence recommend setting up the `resources.yaml` by copying it to your working directory
+  # (where you also copied this `config.yaml` to), and adapting it there as needed. However,
+  # if you have multiple runs of grenepipe with the same resource requirements, you can instead
+  # specify a path to a shared `resources.yaml` file here.
+  resources-yaml: ""
+
   # ----------------------------------------------------------------------
   # Basic Steps
   # ----------------------------------------------------------------------
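For instance, if several grenepipe runs share the same resource requirements, the new setting could point to one common file; the path below is hypothetical, and leaving the setting empty falls back to the search order described above:

    settings:
      resources-yaml: "/path/to/shared/resources.yaml"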
@@ -423,7 +441,6 @@ params:
   # See adapterremoval manual: https://adapterremoval.readthedocs.io/en/latest/
   # and https://adapterremoval.readthedocs.io/en/latest/manpage.html
   adapterremoval:
-    threads: 4
 
     # Extra parameters for single reads. Param `--gzip` is already set internally.
     se: ""
@@ -439,7 +456,6 @@ params:
   # Used only if settings:trimming-tool == cutadapt
   # See cutadapt manual: https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types
   cutadapt:
-    threads: 4
 
     # Set the adapters and any extra parameters.
     # For example, adapters: "-a AGAGCACACGTCTGAACTCCAGTCAC -g AGATCGGAAGAGCACACGT -A AGAGCACACGTCTGAACTCCAGTCAC -G AGATCGGAAGAGCACACGT"
@@ -462,7 +478,6 @@ params:
   # Used only if settings:trimming-tool == fastp
   # See fastp manual: https://github.com/OpenGene/fastp
   fastp:
-    threads: 4
 
     # Extra parameters for single reads.
     se: ""
@@ -490,7 +505,6 @@ params:
   # See skewer manual: https://github.com/relipmoc/skewer
   # By default, we internally already set the options `--format sanger --compress`
   skewer:
-    threads: 4
 
     # Extra parameters for single reads.
     se: "--mode any"
@@ -506,7 +520,8 @@ params:
   # See trimmomatic manual: http://www.usadellab.org/cms/?page=trimmomatic
   # Download adapters here: https://github.com/usadellab/Trimmomatic/tree/main/adapters
   trimmomatic:
-    threads: 6
+
+    # Extra parameters for single reads.
     se:
       extra: ""
       trimmer:
@@ -521,6 +536,8 @@ params:
         - "TRAILING:3"
         - "SLIDINGWINDOW:4:15"
         - "MINLEN:36"
+
+    # Extra parameters for paired end reads.
     pe:
       extra: ""
       trimmer:
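As a sketch, a paired-end trimmer list that also clips adapters could use trimmomatic's ILLUMINACLIP step with one of the adapter files linked above; the adapter path and the clip thresholds (2:30:10 is the trimmomatic manual's usual example) are illustrative only:

    pe:
      extra: ""
      trimmer:
        - "ILLUMINACLIP:path/to/TruSeq3-PE.fa:2:30:10"
        - "LEADING:3"
        - "TRAILING:3"
        - "SLIDINGWINDOW:4:15"
        - "MINLEN:36"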
@@ -538,7 +555,6 @@ params:
   # Used only if settings:mapping-tool == bowtie2
   # See bowtie2 manual: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
   bowtie2:
-    threads: 10
 
     # Extra parameters. We internally already set `--rg` and `--rg-id`, using read group ("@RG")
     # tags "ID" and "SM", and potentially "PL".
@@ -554,7 +570,6 @@ params:
   # Used only if settings:mapping-tool == bwaaln
   # See bwa manual: http://bio-bwa.sourceforge.net/
   bwaaln:
-    threads: 10
 
     # Extra parameters for bwa aln, which maps the reads and produces intermediate *.sai files.
     extra: ""
@@ -575,7 +590,6 @@ params:
   # Used only if settings:mapping-tool == bwamem
   # See bwa manual: http://bio-bwa.sourceforge.net/
   bwamem:
-    threads: 10
 
     # Extra parameters for bwa mem.
     # We internally already set `-R` to use read group ("@RG") tags "ID" and "SM",
@@ -592,7 +606,6 @@ params:
   # Used only if settings:mapping-tool == bwamem2
   # See bwa manual: https://github.com/bwa-mem2/bwa-mem2
   bwamem2:
-    threads: 10
 
     # Extra parameters for bwa mem.
     # We internally already set `-R` to use read group ("@RG") tags "ID" and "SM",
@@ -615,7 +628,6 @@ params:
     # in order to streamline the process, and to make sure that all tools understand that all units
     # of a sample belong to the same sample.
     merge: ""
-    merge-threads: 4
 
     # Extra parameters for samtools/view.
     # Used only if settings:filter-mapped-reads == true, in order to filter the mapped samples
@@ -702,22 +714,13 @@ params:
     # system-provided tmp dir is too small (which can happen on clusters).
     # Note that the Java memory options, such as `-Xmx10g` to increase the available memory within
     # the Java virtual machine are provided via the Snakemake memory management directly,
-    # and hence cannot be specified here. Instead, use the below `*-mem-mb` options,
-    # or, if you are running grenepipe via slurm, use the slurm job configuration.
+    # and hence cannot be specified here. Instead, use the `resources.yaml` config file for this.
     # The last option, SortVcf-java-opts, is used by bcftools when using contig-group-size > 0.
     MarkDuplicates-java-opts: ""
     CollectMultipleMetrics-java-opts: ""
     SortVcf-java-opts: ""
     MergeVcfs-java-opts: ""
 
-    # Memory for the Java virtual machine for the picard programs.
-    # Unfortunately, Java does not automatically use the available memory, and instead needs
-    # to be told that it is allowed to do that. Specify the memory here as needed, in MB.
-    MarkDuplicates-mem-mb: 5000
-    CollectMultipleMetrics-mem-mb: 1024
-    SortVcf-mem-mb: 1024
-    MergeVcfs-mem-mb: 1024
-
   # ----------------------------------------------------------------------
   # dedup
   # ----------------------------------------------------------------------
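Since memory flags now live in `resources.yaml`, the java-opts remain useful for non-memory JVM settings, such as redirecting the tmp dir mentioned above. A sketch, with a placeholder path and the standard JVM flag `-Djava.io.tmpdir`:

    MarkDuplicates-java-opts: "-Djava.io.tmpdir=/path/to/large/tmp"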
@@ -740,7 +743,6 @@ params:
   # Note that the bcftools filter step (if configured above via `settings: filter-variants`)
   # is configured below in the `bcftools-filter` setting, instead of here.
   bcftools:
-    threads: 8
 
     # We offer two ways to run bcftools call: Combined on all samples at the same time,
     # or on each sample individually, merging the calls later.
@@ -779,8 +781,6 @@ params:
     extra: ""
 
     # Settings for parallelization
-    threads: 8
-    compress-threads: 2
     chunksize: 100000
 
   # ----------------------------------------------------------------------
@@ -803,10 +803,6 @@ params:
     # Others might work as well, depending on GATK BaseRecalibrator.
     platform: ""
 
-    # Number of threads to use for the HaplotypeCaller. We recommend to keep this at 2,
-    # as GATK does not seem to do a great job of parallelizing anyway.
-    HaplotypeCaller-threads: 2
-
     # By default, starting in grenepipe v0.14.0, we are using GATK GenomicsDBImport instead of
     # GATK CombineGVCFs to prepare the singular GVCF for GATK GenotypeGVCFs. However, for full
     # compatibility, we also offer to use the old way with CombineGVCFs here, by setting
@@ -829,21 +825,12 @@ params:
     # For some specific error cases, it might be necessary to adjust java settings for the tools.
     # Note that the Java memory options, such as `-Xmx10g` to increase the available memory within
     # the Java virtual machine are provided via the Snakemake memory management directly,
-    # and hence cannot be specified here. Instead, use the below `*-mem-mb` options,
-    # or, if you are running grenepipe via slurm, use the slurm job configuration.
+    # and hence cannot be specified here. Instead, use the `resources.yaml` config file for this.
     HaplotypeCaller-java-opts: ""
     GenomicsDBImport-java-opts: ""
     CombineGVCFs-java-opts: ""
     GenotypeGVCFs-java-opts: ""
 
-    # Memory for the Java virtual machine for the GATK programs.
-    # Unfortunately, Java does not automatically use the available memory, and instead needs
-    # to be told that it is allowed to do that. Specify the memory here as needed, in MB.
-    HaplotypeCaller-mem-mb: 1024
-    GenomicsDBImport-mem-mb: 1024
-    CombineGVCFs-mem-mb: 1024
-    GenotypeGVCFs-mem-mb: 1024
-
   # ----------------------------------------------------------------------
   # GATK VariantFiltration
   # ----------------------------------------------------------------------
@@ -863,7 +850,6 @@ params:
     # We also offer extra settings that are used for both.
     extra: ""
     java-opts: ""
-    mem-mb: 1024
 
   # ----------------------------------------------------------------------
   # GATK VariantRecalibrator + ApplyVQSR
@@ -948,13 +934,11 @@ params:
     variantrecalibrator-extra-SNP: "--max-gaussians 1"
     variantrecalibrator-extra-INDEL: "--max-gaussians 1"
     variantrecalibrator-java-opts: ""
-    variantrecalibrator-mem-mb: 1024
 
     # Extra command line params, and optional Java runtime options to provide to GATK ApplyVQSR
     applyvqsr-extra-SNP: "--truth-sensitivity-filter-level 99.0"
     applyvqsr-extra-INDEL: "--truth-sensitivity-filter-level 99.0"
     applyvqsr-java-opts: ""
-    applyvqsr-mem-mb: 1024
 
   # ----------------------------------------------------------------------
   # bcftools filter
@@ -1003,9 +987,6 @@ params:
     # this local path is used, which is expected to contain a valid snpEff database.
     custom-db-dir: ""
 
-    # Memory (in MB) to be given to SnpEFF. Increase this if the command fails.
-    mem: 4000
-
     # Additional parameters for snpeff, see https://pcingola.github.io/SnpEff/se_commandline/
     extra: ""
 
@@ -1112,8 +1093,7 @@ params:
     bams: "processed"
 
     # Additional parameters for qualimap, see http://qualimap.conesalab.org/
-    extra: "--java-mem-size=10G"
-    threads: 2
+    extra: ""
 
   # ----------------------------------------------------------------------
   # SeqKit

config/resources.yaml
Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+# resources.yaml
+
+# This file defines the computational resources used by grenepipe. These are
+# mostly relevant on computer clusters, such as when using slurm to execute
+# individual rule jobs. For these, we need to specify memory and runtime,
+# and the number of CPU cores to use. The latter is also relevant when running
+# grenepipe locally on a single machine - in that case, only the CPUs specified
+# below will be used.
+
+# Note that there is a bug in the snakemake slurm submission, where the number
+# of cores on the login node (where typically snakemake is being run, and from which
+# the slurm jobs are hence submitted) is used as a limitation check for the number
+# of cores a job can request, in order to avoid over-allocation.
+# Of course, on many clusters, the login node might have way fewer cores than the
+# compute nodes, and so this prevents us from submitting jobs that need more cores
+# than the login node has. Silly.
+# See: https://github.com/snakemake/snakemake/issues/2997
+# The workaround for this is to run snakemake with `--cores 1024` or some other
+# large number - those cores might then be used for some local rules, which
+# however should not lead to issues on the login node, as there are few such rules.
+
+# =================================================================================================
+# Default Resources
+# =================================================================================================
+
+# Default resources, used for all job submissions. All values here are inherited
+# by each rule, unless explicitly overwritten for that rule (in the second half
+# of this file).
+# Resources need to be specified in MB and minutes. The resources are computed
+# based on the input file sizes of each rule, using a fixed offset plus the
+# scaled file sizes, up to a maximum value (to avoid breaking computer clusters).
+# They are then further scaled up for each subsequent attempt of running
+# the rule, should it have failed previously. Note that sometimes snakemake/slurm
+# can fail running a job for reasons other than too little resources; the attempt
+# will still be increased and give the rule more resources, as we cannot determine
+# automatically why the job failed, and so assume it was due to resources.
+default:
+
+    # Fixed offset of the required memory, in MB. This accounts for basic memory
+    # needed by the rule (independent of input sizes), and is needed for small files
+    # where the scaling might be too little.
+    mem-offset: 1000
+
+    # Scaling of the memory needed for the computation. That is, for each MB of combined
+    # input file sizes, how many MB of memory are needed for the processing?
+    # The default of 1 should work in most cases, but can be adapted as needed
+    # if rules run out of memory.
+    mem-scaler: 1
+
+    # Maximum memory, in MB, defaulting to 1TB. This is provided as a safety mechanism,
+    # so as to not overwhelm cluster environments with invalid resource requests.
+    # If a rule wants more memory based on the above scaling, a warning is triggered,
+    # and it is submitted with this maximum instead, to give it a try at succeeding.
+    # Set this to the memory limit available on your cluster.
+    mem-max: 1000000
+
+    # Fixed offset of the required runtime, in minutes. Similar to above.
+    time-offset: 60
+
+    # Scaling of the runtime, from MB to minutes. That is, how many minutes are needed
+    # for each additional MB of input files? The default of 0.1 for instance would give
+    # 100 minutes per GB of input data.
+    time-scaler: 0.1
+
+    # Maximum runtime, in minutes. Defaults to one week. If you have large datasets
+    # and your cluster allows longer wall times, adjust this accordingly.
+    # Generally, set this to the maximum allowed wall time on your cluster.
+    time-max: 10080
+
+    # Default number of CPU cores per task/job. Most programs and scripts in
+    # bioinformatics are unfortunately not parallelized well, so 1 is the default.
+    cpus: 1
+
+    # If a rule job failed (for whatever reason), snakemake can automatically re-try.
+    # A typical reason for failure is a lack of resources. In that case, re-running
+    # with exactly the same resources would lead to the same issue though.
+    # So we want to give any subsequent attempt at running a job more resources.
+    # This factor here determines how much extra we give it each time, as an exponential
+    # base multiplied with the overall resulting resources as determined by the above
+    # specifications. That is, by default we give it 1x, 2x, 4x... the resources for
+    # the first to third attempt. When setting this factor to, e.g., 1.5, we would instead
+    # give it 1x, 1.5x, 2.25x... of the resources for the attempts.
+    # Note that if your jobs often fail due to lack of resources, it is better to
+    # adjust the above settings instead, such that the jobs are more likely to
+    # succeed on their first attempt, and thus avoid wasting computation.
+    attempt-factor: 2.0
+
+# =================================================================================================
+# Per-Rule Resources
+# =================================================================================================
+
+# Here, you can specify the resources for individual job rules in grenepipe.
+# All values not given here are simply using the above defaults instead.
+# You might need to have a look at the snakemake log or even the workflow code
+# in order to figure out the rule names needed. We provide some defaults here
+# for rules that we know already benefit from different resources.
+
+# Read trimming tools can typically use multiple CPUs, so let's give them more!
+trim_reads_se:
+    cpus: 4
+trim_reads_pe:
+    cpus: 4
+trim_reads_pe_merged:
+    cpus: 4
+
+# Even better for the mapping! More cores! Note that we internally
+# might assign an additional cpu thread for sorting the bam files.
+map_reads:
+    cpus: 10
+
+# The usable cpu threads for the actual variant calling step depends on the tool.
+# Freebayes and bcftools can make efficient use of threads, so if you are using those,
+# increase the value here to, e.g., 10. However, the GATK HaplotypeCaller is
+# notoriously bad and inefficient, and seems to not benefit from more than 2 threads.
+call_variants:
+    cpus: 2
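To see how these values interact, consider a hypothetical per-rule override (the rule name some_rule and all numbers are made up for illustration), with the resulting requests worked out in comments:

    # With mem-offset 1000 and mem-scaler 2, a some_rule job with 1500 MB of
    # combined input would request 1000 + 2 * 1500 = 4000 MB on its first attempt;
    # with attempt-factor 2.0, a second attempt requests 8000 MB, a third 16000 MB,
    # each capped at mem-max. Runtime works the same way: time-offset 60 and
    # time-scaler 0.1 give 60 + 0.1 * 1500 = 210 minutes on the first attempt.
    some_rule:
        mem-offset: 1000
        mem-scaler: 2

    # And following the note above, when freebayes or bcftools is the configured
    # calling tool, the variant calling step can be given more threads:
    call_variants:
        cpus: 10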
