Skip to content

Commit fbf30f6

Browse files
committed
Add GATK GenomicsDBImport sharding for speedup
1 parent 351dad2 commit fbf30f6

14 files changed

+628
-193
lines changed

.github/workflows/ci.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ jobs:
6969
- calling-haplotypecaller-5
7070
- calling-haplotypecaller-6
7171
- calling-haplotypecaller-7
72+
- calling-haplotypecaller-8
73+
- calling-haplotypecaller-9
7274
- download
7375
- duplicates-dedup-1
7476
- duplicates-dedup-2

config/config.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -804,11 +804,28 @@ params:
804804
platform: ""
805805

806806
# By default, starting in grenepipe v0.14.0, we are using GATK GenomicsDBImport instead of
807-
# GATK CombineGVCFs to prepare the singular GVCF for GATK GenotypeGVCFs. However, for full
807+
# GATK CombineGVCFs to prepare the per-sample GVCF for GATK GenotypeGVCFs. However, for full
808808
# compatibility, we also offer to use the old way with CombineGVCFs here, by setting
809809
# the option to `false`.
810810
use-GenomicsDBImport: true
811811

812+
# EXPERIMENTAL: When working with thousands of samples, GATK GenomicsDBImport is ridiculously
813+
# inefficient, even when optimizing the settings. Hence, we implement a workaround for their
814+
# slow code, by breaking up the step into even smaller intervals. This is on top of the more
815+
# general `contig-group-size` from above, and is unfortunately needed to get GATK to work on
816+
# large datasets. Here, set the interval size (in nucleotides) for each broken-down chromosome
817+
# or contig. The interval size needed to get an acceptable runtime depends on how many samples
818+
# your dataset has, but might be in the range of 1MB for instance. We recommend to test this for
819+
# your dataset and see which size works best. Furthermore, we recommend to pad each interval,
820+
# so that variants at interval boundaries can be called properly, which GATK otherwise would
821+
# not do correctly. The padding should match the expected max read length.
822+
# By default, we do not use this feature, i.e., leave the interval size at 0.
823+
# Note: Unfortunately, GATK seems to have a bug and does not use the padding here at all,
824+
# see https://github.com/broadinstitute/gatk/issues/9165 - you might have to set padding
825+
# as an extra option in `GenotypeGVCFs-extra`, for instance via `--interval-padding` instead.
826+
GenomicsDBImport-interval-size: 0
827+
GenomicsDBImport-interval-padding: 250
828+
812829
# Temporary directory for GenomicsDBImport.
813830
# If the default system temp dir does not have enough space, which can happen on cluster systems,
814831
# use this to set to a directory with more space, or, with slurm, specify the temp dir there.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
GenomicsDBImport-interval-size: 0 GenomicsDBImport-interval-size: 10000000
2+
snpeff: false snpeff: true
3+
vep: false vep: true
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
contig-group-size: 0 contig-group-size: 38283346
2+
GenomicsDBImport-interval-size: 0 GenomicsDBImport-interval-size: 10000000
3+
snpeff: false snpeff: true
4+
vep: false vep: true

workflow/rules/calling-bcftools-combined.smk

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ rule call_variants:
2424
if (config["settings"].get("restrict-regions"))
2525
else (
2626
"calling/contig-groups/{contig}.bed"
27-
if (config["settings"].get("contig-group-size"))
27+
if (config["settings"].get("contig-group-size", 0))
2828
else []
2929
)
3030
),
@@ -178,12 +178,12 @@ rule merge_variants:
178178
# If we do not use small contigs, we directly output the final file.
179179
vcf=(
180180
temp("calling/merged/merged-all.vcf.gz")
181-
if (config["settings"].get("contig-group-size"))
181+
if (config["settings"].get("contig-group-size", 0))
182182
else "calling/genotyped-all.vcf.gz"
183183
),
184184
done=(
185185
touch("calling/merged/merged-all.vcf.gz.done")
186-
if (config["settings"].get("contig-group-size"))
186+
if (config["settings"].get("contig-group-size", 0))
187187
else touch("calling/genotyped-all.vcf.gz.done")
188188
),
189189
log:

workflow/rules/calling-bcftools-individual.smk

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ rule call_variants:
2323
if (config["settings"].get("restrict-regions"))
2424
else (
2525
"calling/contig-groups/{contig}.bed"
26-
if (config["settings"].get("contig-group-size"))
26+
if (config["settings"].get("contig-group-size", 0))
2727
else []
2828
)
2929
),
@@ -171,22 +171,22 @@ rule combine_all:
171171
# done="calling/genotyped-all.done"
172172
vcf=(
173173
temp("calling/merged/merged-all.vcf.gz")
174-
if (config["settings"].get("contig-group-size"))
174+
if (config["settings"].get("contig-group-size", 0))
175175
else "calling/genotyped-all.vcf.gz"
176176
),
177177
tbi=(
178178
temp("calling/merged/merged-all.vcf.gz.tbi")
179-
if (config["settings"].get("contig-group-size"))
179+
if (config["settings"].get("contig-group-size", 0))
180180
else "calling/genotyped-all.vcf.gz.tbi"
181181
),
182182
lst=(
183183
temp("calling/merged/merged-all.txt")
184-
if (config["settings"].get("contig-group-size"))
184+
if (config["settings"].get("contig-group-size", 0))
185185
else "calling/genotyped-all.txt"
186186
),
187187
done=(
188188
touch("calling/merged/merged-all.vcf.gz.done")
189-
if (config["settings"].get("contig-group-size"))
189+
if (config["settings"].get("contig-group-size", 0))
190190
else touch("calling/genotyped-all.vcf.gz.done")
191191
),
192192
params:

workflow/rules/calling-bcftools.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ if not bcftools_mode_good:
5959

6060
# When using small contigs, we further need to sort the output, as
6161
# this won't be done for us. Let's only do this extra work though if needed.
62-
if config["settings"].get("contig-group-size"):
62+
if config["settings"].get("contig-group-size", 0):
6363

6464
rule sort_variants:
6565
input:

workflow/rules/calling-contig-groups.smk

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,17 @@ def optimize_contig_group(contigs, max_contig_group_size, max_contigs_per_group)
153153
return large_groups + small_groups
154154

155155

156+
# This is an alternative function to the above two that does not bin the contigs in any way,
157+
# but just returns their list in the same format as the above, such that we can build a list
158+
# of individual contigs if no binning is needed.
159+
def get_contig_tuple_list(contigs):
160+
bins = []
161+
for cont in contigs:
162+
bins.append([])
163+
bins[-1].append(cont)
164+
return bins
165+
166+
156167
# =================================================================================================
157168
# Checkpoints
158169
# =================================================================================================
@@ -183,15 +194,20 @@ checkpoint contig_groups:
183194
# Solve the bin packing for the contigs, to get a close to optimal solution
184195
# for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger
185196
# than the bin size will simply get their own (overflowing...) bin.
197+
# If we do not want to group contigs, simply create bed files for each of them.
186198
contig_list = read_contigs_from_fai(input.fai, params.min_contig_size)
187199
if params.max_contigs_per_group == 0:
188200
print("Running bin packing solver")
189201
contig_groups = solve_bin_packing(contig_list, params.contig_group_size)
190-
else:
202+
elif params.contig_group_size > 0:
191203
print("Running heuristic optimizer")
192204
contig_groups = optimize_contig_group(
193205
contig_list, params.contig_group_size, params.max_contigs_per_group
194206
)
207+
else:
208+
# print("Gathering individual contigs")
209+
# contig_groups = get_contig_tuple_list(contig_list)
210+
raise Exception("Invalid context for checkpoint contig_groups")
195211

196212
# Now turn the contig bins into groups for the result of this function.
197213
# We store our resulting list of contigs containing all contigs,
@@ -205,6 +221,7 @@ checkpoint contig_groups:
205221
# We need to store the result in a file, so that the rule that creates the per-contig
206222
# files can access it.
207223
json.dump(contigs, open(output[0], "w"))
224+
print("Total number of contig groups:", len(contigs))
208225

209226

210227
# Rule is not submitted as a job to the cluster.

workflow/rules/calling-freebayes.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ rule call_variants:
4343
if (config["settings"].get("restrict-regions"))
4444
else (
4545
"calling/contig-groups/{contig}.bed"
46-
if (config["settings"].get("contig-group-size"))
46+
if (config["settings"].get("contig-group-size", 0))
4747
else []
4848
)
4949
),
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# =================================================================================================
2+
# Combining Calls
3+
# =================================================================================================
4+
5+
6+
# Recommended way of GATK to combine GVCFs these days.
7+
rule genomics_db_import:
8+
input:
9+
# Get the reference genome and its indices. Not sure if the indices are needed
10+
# for this particular rule, but doesn't hurt to include them as an input anyway.
11+
ref=config["data"]["reference-genome"],
12+
refidcs=expand(
13+
config["data"]["reference-genome"] + ".{ext}",
14+
ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
15+
),
16+
refdict=genome_dict(),
17+
# Get the sample data, including indices.
18+
gvcfs=expand(
19+
"calling/called/{sample}-{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"]
20+
),
21+
indices=expand(
22+
"calling/called/{sample}-{{contig}}.g.vcf.gz.tbi",
23+
sample=config["global"]["sample-names"],
24+
),
25+
done=expand(
26+
"calling/called/{sample}-{{contig}}.g.vcf.gz.done",
27+
sample=config["global"]["sample-names"],
28+
),
29+
# Same as in the haplotype caller, we need a dummy for the intervals
30+
# to ensure the files are present.
31+
intervals_dummy=get_gatk_interval_files,
32+
output:
33+
db=directory("calling/genomics_db/{contig}"),
34+
done=touch("calling/genomics_db/{contig}.done"),
35+
params:
36+
# Here, we actually use the intervals to provide them to the wrapper.
37+
intervals=get_gatk_intervals,
38+
db_action="create",
39+
extra=" --reference "
40+
+ config["data"]["reference-genome"]
41+
+ " --sequence-dictionary "
42+
+ genome_dict()
43+
+ " "
44+
+ config["params"]["gatk"].get("GenomicsDBImport-extra", ""),
45+
java_opts=config["params"]["gatk"].get("GenomicsDBImport-java-opts", ""),
46+
threads: get_rule_threads("genomics_db_import")
47+
log:
48+
"logs/calling/gatk-genomicsdbimport/{contig}.log",
49+
benchmark:
50+
"benchmarks/calling/gatk-genomicsdbimport/{contig}.log"
51+
resources:
52+
tmpdir=config["params"]["gatk"].get("GenomicsDBImport-temp-dir", ""),
53+
conda:
54+
"../envs/gatk.yaml"
55+
wrapper:
56+
"v5.7.0/bio/gatk/genomicsdbimport"
57+
58+
59+
# =================================================================================================
60+
# Genotype Variants
61+
# =================================================================================================
62+
63+
64+
rule genotype_variants:
65+
input:
66+
# Get the reference genome and its indices. Not sure if the indices are needed
67+
# for this particular rule, but doesn't hurt to include them as an input anyway.
68+
ref=config["data"]["reference-genome"],
69+
refidcs=expand(
70+
config["data"]["reference-genome"] + ".{ext}",
71+
ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
72+
),
73+
refdict=genome_dict(),
74+
# Get the GenomicsDB input
75+
genomicsdb="calling/genomics_db/{contig}",
76+
genomicsdb_done="calling/genomics_db/{contig}.done",
77+
# If known variants are set in the config, use them, and require the index file as well.
78+
known=config["data"]["known-variants"],
79+
knownidx=(
80+
config["data"]["known-variants"] + ".tbi" if config["data"]["known-variants"] else []
81+
),
82+
# Same as above, we need a dummy for the intervals to ensure the files are present.
83+
intervals_dummy=get_gatk_interval_files,
84+
output:
85+
vcf=(
86+
"calling/genotyped/all.{contig}.vcf.gz"
87+
if config["settings"]["keep-intermediate"]["calling"]
88+
else temp("calling/genotyped/all.{contig}.vcf.gz")
89+
),
90+
done=touch("calling/genotyped/all.{contig}.vcf.gz.done"),
91+
params:
92+
# Again, we here use the intervals to provide them to the wrapper.
93+
intervals=get_gatk_intervals,
94+
extra=" --sequence-dictionary "
95+
+ genome_dict()
96+
+ " "
97+
+ config["params"]["gatk"]["GenotypeGVCFs-extra"],
98+
java_opts=config["params"]["gatk"]["GenotypeGVCFs-java-opts"],
99+
threads: get_rule_threads("genotype_variants")
100+
log:
101+
"logs/calling/gatk-genotype-gvcfs/{contig}.log",
102+
benchmark:
103+
"benchmarks/calling/gatk-genotype-gvcfs/{contig}.log"
104+
# group:
105+
# "gatk_calls_combine"
106+
conda:
107+
"../envs/gatk.yaml"
108+
wrapper:
109+
"v5.7.0/bio/gatk/genotypegvcfs"

0 commit comments

Comments
 (0)