Add GATK GenomicsDBImport sharding for speedup

lczech · lczech · commit fbf30f6efb8c · 2025-05-05T18:36:53.000-03:00
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -69,6 +69,8 @@ jobs:
           - calling-haplotypecaller-5
           - calling-haplotypecaller-6
           - calling-haplotypecaller-7
+          - calling-haplotypecaller-8
+          - calling-haplotypecaller-9
           - download
           - duplicates-dedup-1
           - duplicates-dedup-2
diff --git a/config/config.yaml b/config/config.yaml
@@ -804,11 +804,28 @@ params:
     platform: ""
 
     # By default, starting in grenepipe v0.14.0, we are using GATK GenomicsDBImport instead of
-    # GATK CombineGVCFs to prepare the singular GVCF for GATK GenotypeGVCFs. However, for full
+    # GATK CombineGVCFs to prepare the combined GVCF for GATK GenotypeGVCFs. However, for full
     # compatibility, we also offer to use the old way with CombineGVCFs here, by setting
     # the option to `false`.
     use-GenomicsDBImport: true
 
+    # EXPERIMENTAL: When working with thousands of samples, GATK GenomicsDBImport is ridiculously
+    # inefficient, even when optimizing the settings. Hence, we implement a workaround for their
+    # slow code, by breaking up the step into even smaller intervals. This is on top of the more
+    # general `contig-group-size` from above, and is unfortunately needed to get GATK to work on
+    # large datasets. Here, set the interval size (in nucleotides) for each broken-down chromosome
+    # or contig. The interval size needed to get an acceptable runtime depends on how many samples
+    # your dataset has, but might be in the range of 1 Mbp, for instance. We recommend testing this
+    # for your dataset to see which size works best. Furthermore, we recommend padding each interval,
+    # so that variants at interval boundaries can be called properly, which GATK otherwise would
+    # not do correctly. The padding should match the expected max read length.
+    # By default, we do not use this feature, i.e., leave the interval size at 0.
+    # Note: Unfortunately, GATK seems to have a bug and does not use the padding here at all,
+    # see https://github.com/broadinstitute/gatk/issues/9165 - you might have to set padding
+    # as an extra option in `GenotypeGVCFs-extra`, for instance via `--interval-padding` instead.
+    GenomicsDBImport-interval-size: 0
+    GenomicsDBImport-interval-padding: 250
+
     # Temporary directory for GenomicsDBImport.
     # If the default system temp dir does not have enough space, which can happen on cluster systems,
     # use this to set to a directory with more space, or, with slurm, specify the temp dir there.
diff --git a/test/cases/calling-haplotypecaller-8.txt b/test/cases/calling-haplotypecaller-8.txt
@@ -0,0 +1,3 @@
+GenomicsDBImport-interval-size: 0	GenomicsDBImport-interval-size: 10000000
+snpeff: false	snpeff: true
+vep: false	vep: true
diff --git a/test/cases/calling-haplotypecaller-9.txt b/test/cases/calling-haplotypecaller-9.txt
@@ -0,0 +1,4 @@
+contig-group-size: 0	contig-group-size: 38283346
+GenomicsDBImport-interval-size: 0	GenomicsDBImport-interval-size: 10000000
+snpeff: false	snpeff: true
+vep: false	vep: true
diff --git a/workflow/rules/calling-bcftools-combined.smk b/workflow/rules/calling-bcftools-combined.smk
@@ -24,7 +24,7 @@ rule call_variants:
             if (config["settings"].get("restrict-regions"))
             else (
                 "calling/contig-groups/{contig}.bed"
-                if (config["settings"].get("contig-group-size"))
+                if (config["settings"].get("contig-group-size", 0))
                 else []
             )
         ),
@@ -178,12 +178,12 @@ rule merge_variants:
         # If we do not use small contigs, we directly output the final file.
         vcf=(
             temp("calling/merged/merged-all.vcf.gz")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else "calling/genotyped-all.vcf.gz"
         ),
         done=(
             touch("calling/merged/merged-all.vcf.gz.done")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else touch("calling/genotyped-all.vcf.gz.done")
         ),
     log:
diff --git a/workflow/rules/calling-bcftools-individual.smk b/workflow/rules/calling-bcftools-individual.smk
@@ -23,7 +23,7 @@ rule call_variants:
             if (config["settings"].get("restrict-regions"))
             else (
                 "calling/contig-groups/{contig}.bed"
-                if (config["settings"].get("contig-group-size"))
+                if (config["settings"].get("contig-group-size", 0))
                 else []
             )
         ),
@@ -171,22 +171,22 @@ rule combine_all:
         # done="calling/genotyped-all.done"
         vcf=(
             temp("calling/merged/merged-all.vcf.gz")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else "calling/genotyped-all.vcf.gz"
         ),
         tbi=(
             temp("calling/merged/merged-all.vcf.gz.tbi")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else "calling/genotyped-all.vcf.gz.tbi"
         ),
         lst=(
             temp("calling/merged/merged-all.txt")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else "calling/genotyped-all.txt"
         ),
         done=(
             touch("calling/merged/merged-all.vcf.gz.done")
-            if (config["settings"].get("contig-group-size"))
+            if (config["settings"].get("contig-group-size", 0))
             else touch("calling/genotyped-all.vcf.gz.done")
         ),
     params:
diff --git a/workflow/rules/calling-bcftools.smk b/workflow/rules/calling-bcftools.smk
@@ -59,7 +59,7 @@ if not bcftools_mode_good:
 
 # When using small contigs, we further need to sort the output, as
 # this won't be done for us. Let's only do this extra work though if needed.
-if config["settings"].get("contig-group-size"):
+if config["settings"].get("contig-group-size", 0):
 
     rule sort_variants:
         input:
diff --git a/workflow/rules/calling-contig-groups.smk b/workflow/rules/calling-contig-groups.smk
@@ -153,6 +153,17 @@ def optimize_contig_group(contigs, max_contig_group_size, max_contigs_per_group)
     return large_groups + small_groups
 
 
+# This is an alternative to the above two functions that does not bin the contigs in any way,
+# but simply wraps each contig in a single-element list of its own, in the same format as the
+# above, such that we can build a list of individual contigs if no binning is needed.
+def get_contig_tuple_list(contigs):
+    bins = []
+    for cont in contigs:
+        bins.append([])
+        bins[-1].append(cont)
+    return bins
+
+
 # =================================================================================================
 #     Checkpoints
 # =================================================================================================
@@ -183,15 +194,20 @@ checkpoint contig_groups:
         # Solve the bin packing for the contigs, to get a close to optimal solution
         # for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger
         # than the bin size will simply get their own (overflowing...) bin.
+        # If we do not want to group contigs, simply create bed files for each of them.
         contig_list = read_contigs_from_fai(input.fai, params.min_contig_size)
         if params.max_contigs_per_group == 0:
             print("Running bin packing solver")
             contig_groups = solve_bin_packing(contig_list, params.contig_group_size)
-        else:
+        elif params.contig_group_size > 0:
             print("Running heuristic optimizer")
             contig_groups = optimize_contig_group(
                 contig_list, params.contig_group_size, params.max_contigs_per_group
             )
+        else:
+            # print("Gathering individual contigs")
+            # contig_groups = get_contig_tuple_list(contig_list)
+            raise Exception("Invalid context for checkpoint contig_groups")
 
             # Now turn the contig bins into groups for the result of this function.
             # We store our resulting list of contigs containing all contigs,
@@ -205,6 +221,7 @@ checkpoint contig_groups:
             # We need to store the result in a file, so that the rule that creates the per-contig
             # files can access it.
         json.dump(contigs, open(output[0], "w"))
+        print("Total number of contig groups:", len(contigs))
 
 
 # Rule is not submitted as a job to the cluster.
diff --git a/workflow/rules/calling-freebayes.smk b/workflow/rules/calling-freebayes.smk
@@ -43,7 +43,7 @@ rule call_variants:
             if (config["settings"].get("restrict-regions"))
             else (
                 "calling/contig-groups/{contig}.bed"
-                if (config["settings"].get("contig-group-size"))
+                if (config["settings"].get("contig-group-size", 0))
                 else []
             )
         ),
diff --git a/workflow/rules/calling-gatk-genotype-db-all.smk b/workflow/rules/calling-gatk-genotype-db-all.smk
@@ -0,0 +1,109 @@
+# =================================================================================================
+#     Combining Calls
+# =================================================================================================
+
+
+# GATK's currently recommended way to combine GVCFs.
+rule genomics_db_import:
+    input:
+        # Get the reference genome and its indices. Not sure if the indices are needed
+        # for this particular rule, but doesn't hurt to include them as an input anyway.
+        ref=config["data"]["reference-genome"],
+        refidcs=expand(
+            config["data"]["reference-genome"] + ".{ext}",
+            ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
+        ),
+        refdict=genome_dict(),
+        # Get the sample data, including indices.
+        gvcfs=expand(
+            "calling/called/{sample}-{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"]
+        ),
+        indices=expand(
+            "calling/called/{sample}-{{contig}}.g.vcf.gz.tbi",
+            sample=config["global"]["sample-names"],
+        ),
+        done=expand(
+            "calling/called/{sample}-{{contig}}.g.vcf.gz.done",
+            sample=config["global"]["sample-names"],
+        ),
+        # Same as in the haplotype caller, we need a dummy for the intervals
+        # to ensure the files are present.
+        intervals_dummy=get_gatk_interval_files,
+    output:
+        db=directory("calling/genomics_db/{contig}"),
+        done=touch("calling/genomics_db/{contig}.done"),
+    params:
+        # Here, we actually use the intervals to provide them to the wrapper.
+        intervals=get_gatk_intervals,
+        db_action="create",
+        extra=" --reference "
+        + config["data"]["reference-genome"]
+        + " --sequence-dictionary "
+        + genome_dict()
+        + " "
+        + config["params"]["gatk"].get("GenomicsDBImport-extra", ""),
+        java_opts=config["params"]["gatk"].get("GenomicsDBImport-java-opts", ""),
+    threads: get_rule_threads("genomics_db_import")
+    log:
+        "logs/calling/gatk-genomicsdbimport/{contig}.log",
+    benchmark:
+        "benchmarks/calling/gatk-genomicsdbimport/{contig}.log"
+    resources:
+        tmpdir=config["params"]["gatk"].get("GenomicsDBImport-temp-dir", ""),
+    conda:
+        "../envs/gatk.yaml"
+    wrapper:
+        "v5.7.0/bio/gatk/genomicsdbimport"
+
+
+# =================================================================================================
+#     Genotype Variants
+# =================================================================================================
+
+
+rule genotype_variants:
+    input:
+        # Get the reference genome and its indices. Not sure if the indices are needed
+        # for this particular rule, but doesn't hurt to include them as an input anyway.
+        ref=config["data"]["reference-genome"],
+        refidcs=expand(
+            config["data"]["reference-genome"] + ".{ext}",
+            ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
+        ),
+        refdict=genome_dict(),
+        # Get the GenomicsDB input
+        genomicsdb="calling/genomics_db/{contig}",
+        genomicsdb_done="calling/genomics_db/{contig}.done",
+        # If known variants are set in the config, use them, and require the index file as well.
+        known=config["data"]["known-variants"],
+        knownidx=(
+            config["data"]["known-variants"] + ".tbi" if config["data"]["known-variants"] else []
+        ),
+        # Same as above, we need a dummy for the intervals to ensure the files are present.
+        intervals_dummy=get_gatk_interval_files,
+    output:
+        vcf=(
+            "calling/genotyped/all.{contig}.vcf.gz"
+            if config["settings"]["keep-intermediate"]["calling"]
+            else temp("calling/genotyped/all.{contig}.vcf.gz")
+        ),
+        done=touch("calling/genotyped/all.{contig}.vcf.gz.done"),
+    params:
+        # Again, we here use the intervals to provide them to the wrapper.
+        intervals=get_gatk_intervals,
+        extra=" --sequence-dictionary "
+        + genome_dict()
+        + " "
+        + config["params"]["gatk"]["GenotypeGVCFs-extra"],
+        java_opts=config["params"]["gatk"]["GenotypeGVCFs-java-opts"],
+    threads: get_rule_threads("genotype_variants")
+    log:
+        "logs/calling/gatk-genotype-gvcfs/{contig}.log",
+    benchmark:
+        "benchmarks/calling/gatk-genotype-gvcfs/{contig}.log"
+    # group:
+    #     "gatk_calls_combine"
+    conda:
+        "../envs/gatk.yaml"
+    wrapper:
+        "v5.7.0/bio/gatk/genotypegvcfs"
diff --git a/workflow/rules/calling-gatk-genotype-db-split.smk b/workflow/rules/calling-gatk-genotype-db-split.smk
diff --git a/workflow/rules/calling-gatk-genotype-gvcfs.smk b/workflow/rules/calling-gatk-genotype-gvcfs.smk
diff --git a/workflow/rules/calling-gatk.smk b/workflow/rules/calling-gatk.smk
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+GenomicsDBImport-interval-size: 0 GenomicsDBImport-interval-size: 10000000`
	`2`	`+snpeff: false snpeff: true`
	`3`	`+vep: false vep: true`
Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ rule call_variants:`
`43`	`43`	`if (config["settings"].get("restrict-regions"))`
`44`	`44`	`else (`
`45`	`45`	`"calling/contig-groups/{contig}.bed"`
`46`		`- if (config["settings"].get("contig-group-size"))`
	`46`	`+ if (config["settings"].get("contig-group-size", 0))`
`47`	`47`	`else []`
`48`	`48`	`)`
`49`	`49`	`),`