22# Make Contig Shards
33# =================================================================================================
44
5+
56# Special rule for the case that we want to split the contigs for the genomics db import,
67# but are not using contig groups. In that case, we could pretend to have contig groups,
78# and create fake ones that each contain a single chromosome or contig of the reference.
@@ -13,9 +14,9 @@ rule make_contig_bed:
1314 input :
1415 fai = get_fai ,
1516 output :
16- bed = "calling/contig-shards/{contig}/contig.bed"
17+ bed = "calling/contig-shards/{contig}/contig.bed" ,
1718 params :
18- contig = "{contig}"
19+ contig = "{contig}" ,
1920 run :
2021 # Produce only the matching contig and write 0–length interval
2122 contig_lengths = get_contig_lengths (input .fai )
@@ -37,20 +38,20 @@ localrules:
3738# Why that then is an option at all is beyond my comprehension. GATK, WTF.
3839checkpoint preprocess_contig_shard :
3940 input :
40- dict = genome_dict (),
41- ref = config ["data" ]["reference-genome" ],
41+ dict = genome_dict (),
42+ ref = config ["data" ]["reference-genome" ],
4243 # contigs=contigs_groups_input,
43- contig = (
44+ contig = (
4445 "calling/contig-groups/{contig}.bed"
4546 if config ["settings" ].get ("contig-group-size" , 0 ) > 0
4647 else "calling/contig-shards/{contig}/contig.bed"
47- )
48+ ),
4849 output :
49- interval_list = "calling/contig-shards/{contig}/contig.interval_list" ,
50- shard_list = "calling/contig-shards/{contig}/shards.interval_list" ,
50+ interval_list = "calling/contig-shards/{contig}/contig.interval_list" ,
51+ shard_list = "calling/contig-shards/{contig}/shards.interval_list" ,
5152 params :
52- bin_length = config ["params" ]["gatk" ]["GenomicsDBImport-interval-size" ],
53- padding = config ["params" ]["gatk" ]["GenomicsDBImport-interval-padding" ],
53+ bin_length = config ["params" ]["gatk" ]["GenomicsDBImport-interval-size" ],
54+ padding = config ["params" ]["gatk" ]["GenomicsDBImport-interval-padding" ],
5455 log :
5556 "logs/calling/preprocess-contig-shard/{contig}.log" ,
5657 conda :
@@ -85,18 +86,18 @@ localrules:
8586rule extract_contig_shard :
8687 input :
8788 # shard_list=checkpoints.preprocess_contig_shard.output.shard_list
88- shard_list = "calling/contig-shards/{contig}/shards.interval_list"
89+ shard_list = "calling/contig-shards/{contig}/shards.interval_list" ,
8990 output :
90- shard = "calling/contig-shards/{contig}/shard-{shard}.interval_list"
91+ shard = "calling/contig-shards/{contig}/shard-{shard}.interval_list" ,
9192 run :
9293 header , data = [], []
9394 for line in open (input .shard_list ):
94- if line .startswith ('@' ):
95+ if line .startswith ("@" ):
9596 header .append (line )
9697 else :
9798 data .append (line )
9899 i = int (wildcards .shard )
99- with open (output .shard , 'w' ) as out :
100+ with open (output .shard , "w" ) as out :
100101 out .writelines (header + [data [i ]])
101102
102103
@@ -108,7 +109,7 @@ localrules:
108109def get_shard_indices (wc ):
109110 r = checkpoints .preprocess_contig_shard .get (contig = wc .contig )
110111 # skip header lines
111- shards = [l for l in open (r .output .shard_list ) if not l .startswith ('@' )]
112+ shards = [l for l in open (r .output .shard_list ) if not l .startswith ("@" )]
112113 return [i for i in range (len (shards ))]
113114 # return [{"shard": i} for i in range(len(shards))]
114115
@@ -208,8 +209,7 @@ rule genotype_variants:
208209 + " "
209210 + config ["params" ]["gatk" ]["GenotypeGVCFs-extra" ],
210211 java_opts = config ["params" ]["gatk" ]["GenotypeGVCFs-java-opts" ],
211- threads :
212- get_rule_threads ("genotype_variants" )
212+ threads : get_rule_threads ("genotype_variants" )
213213 log :
214214 "logs/calling/gatk-genotype-gvcfs/{contig}/shard-{shard}.log" ,
215215 benchmark :
@@ -241,6 +241,7 @@ rule genotype_variants:
241241# shard=get_shard_indices(wildcards)
242242# )
243243
244+
244245def merge_vcfs_vcfs_input (wc ):
245246 cp = checkpoints .preprocess_contig_shard .get (** wc )
246247 # cp = checkpoints.preprocess_contig_shard.get(contig=wc.contig)
@@ -250,9 +251,10 @@ def merge_vcfs_vcfs_input(wc):
250251 return expand (
251252 "calling/genotyped/{contig}/shard-{shard}.vcf.gz" ,
252253 contig = wc .contig ,
253- shard = list (range (len (shards )))
254+ shard = list (range (len (shards ))),
254255 )
255256
257+
256258def merge_vcfs_done_input (wc ):
257259 cp = checkpoints .preprocess_contig_shard .get (** wc )
258260 # cp = checkpoints.preprocess_contig_shard.get(contig=wc.contig)
@@ -261,13 +263,14 @@ def merge_vcfs_done_input(wc):
261263 return expand (
262264 "calling/genotyped/{contig}/shard-{shard}.vcf.gz.done" ,
263265 contig = wc .contig ,
264- shard = list (range (len (shards )))
266+ shard = list (range (len (shards ))),
265267 )
266268
269+
267270rule merge_shard_vcfs :
268271 input :
269- dict = genome_dict (),
270- ref = config ["data" ]["reference-genome" ],
272+ dict = genome_dict (),
273+ ref = config ["data" ]["reference-genome" ],
271274 contigs = contigs_groups_input ,
272275 # vcfs = expand("calling/genotyped/{contig}/shard-{shard}.vcf.gz", get_shard_indices),
273276 # done = expand("calling/genotyped/{contig}/shard-{shard}.vcf.gz.done", get_shard_indices),
@@ -288,8 +291,7 @@ rule merge_shard_vcfs:
288291 if platform .system () == "Darwin"
289292 else ""
290293 ),
291- threads :
292- get_rule_threads ("merge_shard_vcfs" )
294+ threads : get_rule_threads ("merge_shard_vcfs" )
293295 log :
294296 "logs/calling/picard-merge-vcfs/{contig}.log" ,
295297 benchmark :
0 commit comments