Skip to content

Commit 6366da6

Browse files
authored
Merge pull request #7 from harvardinformatics/gatk_bug
GATK rework
2 parents 4ae2168 + 35f505c commit 6366da6

File tree

5 files changed

+132
-58
lines changed

5 files changed

+132
-58
lines changed

envs/bam2vcf.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ dependencies:
1111
- bedtools==2.29.2
1212
- pyyaml==5.3.1
1313
- htslib==1.11
14+
- bzip2==1.0.8

helperFun.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -332,19 +332,14 @@ def createListsGetIndices(intDir, maxIntervalLen, maxBpPerList, maxIntervalsPerL
332332
# does adding the next interval put us over the maximum values for our two thresholds?
333333
if (runningSumBp + intervalLen) >= maxBpPerList or (runningSum_intervals + 1) >= maxIntervalsPerList:
334334
if len(current_intervals) == 0:
335-
# if you decide lower maxBpPerList below maxIntervalLength, it's possible tohave an uninitialized
336-
# current_intervals that doesn't make it to subsequent else statement
335+
printIntervalsToListFile(intDir, listFile_index, [ (scaff, start, stop) ], refFileName)
336+
runningSumBp = 0
337+
runningSum_intervals = 0
338+
else:
339+
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
337340
current_intervals = [ (scaff, start, stop) ]
338-
# flush out current_intervals into a list file
339-
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
340-
#out = open(f"{intDir}list{listFile_index}.list", 'w')
341-
#for i in current_intervals:
342-
# print(f"{i[0]}:{i[1]}-{i[2]}", file=out)
343-
#out.close()
344-
# re-initialize data for next list file
345-
current_intervals = [ (scaff, start, stop) ]
346-
runningSumBp = intervalLen
347-
runningSum_intervals = 1
341+
runningSumBp = intervalLen
342+
runningSum_intervals = 1
348343
listFile_index += 1
349344
else:
350345
current_intervals.append( (scaff, start, stop) )

resources.yaml

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
###
2-
# fastq -> BAM workflow
2+
# fastq2bam workflow
33
##
44

55
# fastq download
@@ -32,7 +32,7 @@ merge_bams:
3232
mem: 9000
3333

3434
###
35-
# BAM -> VCF workflow
35+
# Intervals workflow
3636
###
3737

3838
# preprocess genome, create intervals
@@ -43,9 +43,9 @@ process_ref:
4343
create_intervals:
4444
mem: 5000
4545

46-
47-
48-
# gatk workflow
46+
###
47+
# bam2vcf workflows
48+
###
4949

5050
# gatk HaplotypeCaller
5151
bam2gvcf:
@@ -56,22 +56,21 @@ gvcf2DB:
5656
# gatk GenotypeGVCFs
5757
DB2vcf:
5858
mem: 30000
59-
# gatk GatherVcfs
60-
gatherVcfs:
61-
mem: 30000
62-
# vcftools program
63-
vcftools:
64-
mem: 30000
65-
66-
67-
# freebayes workflow
68-
69-
# freebayes program
59+
## freebayes program only! ##
7060
bam2vcf:
7161
mem: 30000
72-
# gatk GatherVcfs, same as above!
62+
# gatk filterVcfs
63+
filterVcfs:
64+
mem: 30000
65+
# gatk GatherVcfs
7366
gatherVcfs:
7467
mem: 30000
68+
# picard SortVcf
69+
sortVcf:
70+
mem: 30000
7571
# vcftools program
7672
vcftools:
7773
mem: 30000
74+
# bedtools program
75+
bedtools:
76+
mem: 30000

rules/bam2vcf_fb.smk

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,27 +28,26 @@ rule bam2vcf:
2828
"{input.bams} "
2929
"> {output.vcf}\n"
3030

31-
rule gatherVcfs:
31+
rule filterVcfs:
3232
"""
33-
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
33+
This rule filters all of the VCFs
3434
"""
3535
input:
36-
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
36+
vcf = vcfDir + "L{list}.vcf",
3737
ref = config['ref']
3838
output:
39-
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
40-
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
39+
vcf = vcfDir + "L{list}_filter.vcf"
4140
params:
4241
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
4342
conda:
4443
"../envs/bam2vcf.yml"
4544
resources:
46-
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
45+
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
4746
shell:
4847
"gatk VariantFiltration "
4948
"-R {input.ref} "
50-
"-V {input.vcfs} "
51-
"--output {output.vcfs} "
49+
"-V {input.vcf} "
50+
"--output {output.vcf} "
5251
"--filter-name \"RPRS_filter\" "
5352
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
5453
"--filter-name \"FS_SOR_filter\" "
@@ -58,23 +57,63 @@ rule gatherVcfs:
5857
"--filter-name \"QUAL_filter\" "
5958
"--filter-expression \"QUAL < 30.0\" "
6059
"--invalidate-previous-filters true\n"
61-
60+
61+
rule gatherVcfs:
62+
"""
63+
This rule gathers all the filtered VCFs, one per list, into one final VCF
64+
"""
65+
input:
66+
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
67+
output:
68+
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
69+
params:
70+
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
71+
conda:
72+
"../envs/bam2vcf.yml"
73+
resources:
74+
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
75+
shell:
6276
"gatk GatherVcfs "
6377
"{params.gatherVcfsInput} "
64-
"-O {output.vcfFinal}"
78+
"-O {output.vcf}"
79+
80+
rule sortVcf:
81+
"""
82+
Sort VCF for more efficient processing of vcftools and bedtools
83+
"""
84+
input:
85+
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
86+
output:
87+
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
88+
conda:
89+
"../envs/bam2vcf.yml"
90+
resources:
91+
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
92+
shell:
93+
"picard SortVcf -I {input.vcf} -O {output.vcf}"
6594

6695
rule vcftools:
6796
input:
6897
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
69-
int = intDir + "intervals_fb.bed"
98+
int = intDir + config["genome"] + "_intervals_fb.bed"
7099
output:
71-
missing = gatkDir + "missing_data_per_ind.txt",
72-
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
100+
missing = gatkDir + "missing_data_per_ind.txt"
73101
conda:
74102
"../envs/bam2vcf.yml"
75103
resources:
76104
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
77105
shell:
78-
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
79-
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
106+
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"
80107

108+
rule bedtools:
109+
input:
110+
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
111+
int = intDir + config["genome"] + "_intervals_fb.bed"
112+
output:
113+
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
114+
conda:
115+
"../envs/bam2vcf.yml"
116+
resources:
117+
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
118+
shell:
119+
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"

rules/bam2vcf_gatk.smk

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -110,27 +110,26 @@ rule DB2vcf:
110110
"-O {output.vcf} "
111111
"--tmp-dir {vcfDir}tmp"
112112

113-
rule gatherVcfs:
113+
rule filterVcfs:
114114
"""
115-
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
115+
This rule filters all of the VCFs
116116
"""
117117
input:
118-
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
118+
vcf = vcfDir + "L{list}.vcf",
119119
ref = config['ref']
120120
output:
121-
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
122-
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
121+
vcf = vcfDir + "L{list}_filter.vcf"
123122
params:
124123
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
125124
conda:
126125
"../envs/bam2vcf.yml"
127126
resources:
128-
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
127+
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
129128
shell:
130129
"gatk VariantFiltration "
131130
"-R {input.ref} "
132-
"-V {input.vcfs} "
133-
"--output {output.vcfs} "
131+
"-V {input.vcf} "
132+
"--output {output.vcf} "
134133
"--filter-name \"RPRS_filter\" "
135134
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
136135
"--filter-name \"FS_SOR_filter\" "
@@ -140,22 +139,63 @@ rule gatherVcfs:
140139
"--filter-name \"QUAL_filter\" "
141140
"--filter-expression \"QUAL < 30.0\" "
142141
"--invalidate-previous-filters true\n"
143-
142+
143+
rule gatherVcfs:
144+
"""
145+
This rule gathers all the filtered VCFs, one per list, into one final VCF
146+
"""
147+
input:
148+
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
149+
output:
150+
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
151+
params:
152+
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
153+
conda:
154+
"../envs/bam2vcf.yml"
155+
resources:
156+
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
157+
shell:
144158
"gatk GatherVcfs "
145159
"{params.gatherVcfsInput} "
146-
"-O {output.vcfFinal}"
160+
"-O {output.vcf}"
161+
162+
rule sortVcf:
163+
"""
164+
Sort VCF for more efficient processing of vcftools and bedtools
165+
"""
166+
input:
167+
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
168+
output:
169+
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
170+
conda:
171+
"../envs/bam2vcf.yml"
172+
resources:
173+
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
174+
shell:
175+
"picard SortVcf -I {input.vcf} -O {output.vcf}"
147176

148177
rule vcftools:
149178
input:
150179
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
151180
int = intDir + config["genome"] + "_intervals_fb.bed"
152181
output:
153-
missing = gatkDir + "missing_data_per_ind.txt",
154-
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
182+
missing = gatkDir + "missing_data_per_ind.txt"
155183
conda:
156184
"../envs/bam2vcf.yml"
157185
resources:
158186
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
159187
shell:
160-
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
161-
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
188+
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"
189+
190+
rule bedtools:
191+
input:
192+
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
193+
int = intDir + config["genome"] + "_intervals_fb.bed"
194+
output:
195+
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
196+
conda:
197+
"../envs/bam2vcf.yml"
198+
resources:
199+
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
200+
shell:
201+
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"

0 commit comments

Comments
 (0)