Merge pull request #7 from harvardinformatics/gatk_bug
GATK rework
tsackton authored Aug 3, 2021
2 parents 4ae2168 + 35f505c commit 6366da6
Showing 5 changed files with 132 additions and 58 deletions.
1 change: 1 addition & 0 deletions envs/bam2vcf.yml
@@ -11,3 +11,4 @@ dependencies:
- bedtools==2.29.2
- pyyaml==5.3.1
- htslib==1.11
- bzip2==1.0.8
19 changes: 7 additions & 12 deletions helperFun.py
@@ -332,19 +332,14 @@ def createListsGetIndices(intDir, maxIntervalLen, maxBpPerList, maxIntervalsPerL
# does adding the next interval put us over the maximum values for our two thresholds?
if (runningSumBp + intervalLen) >= maxBpPerList or (runningSum_intervals + 1) >= maxIntervalsPerList:
if len(current_intervals) == 0:
# if you decide to lower maxBpPerList below maxIntervalLen, it's possible to have an uninitialized
# current_intervals that doesn't make it to the subsequent else statement
printIntervalsToListFile(intDir, listFile_index, [ (scaff, start, stop) ], refFileName)
runningSumBp = 0
runningSum_intervals = 0
else:
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
current_intervals = [ (scaff, start, stop) ]
# flush out current_intervals into a list file
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
#out = open(f"{intDir}list{listFile_index}.list", 'w')
#for i in current_intervals:
# print(f"{i[0]}:{i[1]}-{i[2]}", file=out)
#out.close()
# re-initialize data for next list file
current_intervals = [ (scaff, start, stop) ]
runningSumBp = intervalLen
runningSum_intervals = 1
runningSumBp = intervalLen
runningSum_intervals = 1
listFile_index += 1
else:
current_intervals.append( (scaff, start, stop) )
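
For readers tracing the hunk above outside the diff: a minimal, self-contained sketch of the list-packing logic as it reads after this change (pack_intervals and print_list_file are illustrative stand-ins, not names from helperFun.py):

def pack_intervals(intervals, maxBpPerList, maxIntervalsPerList, print_list_file):
    # intervals: iterable of (scaff, start, stop); print_list_file writes one .list file
    current_intervals, runningSumBp, runningSum_intervals, listFile_index = [], 0, 0, 0
    for scaff, start, stop in intervals:
        intervalLen = stop - start
        if (runningSumBp + intervalLen) >= maxBpPerList or (runningSum_intervals + 1) >= maxIntervalsPerList:
            if len(current_intervals) == 0:
                # a single interval can exceed maxBpPerList on its own; write it alone
                print_list_file(listFile_index, [(scaff, start, stop)])
                runningSumBp, runningSum_intervals = 0, 0
            else:
                # flush the accumulated intervals, then start the next list with this interval
                print_list_file(listFile_index, current_intervals)
                current_intervals = [(scaff, start, stop)]
                runningSumBp, runningSum_intervals = intervalLen, 1
            listFile_index += 1
        else:
            current_intervals.append((scaff, start, stop))
            runningSumBp += intervalLen
            runningSum_intervals += 1
    if current_intervals:
        # write any remaining intervals to a final list file
        print_list_file(listFile_index, current_intervals)
        listFile_index += 1
    return listFile_index
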
33 changes: 16 additions & 17 deletions resources.yaml
@@ -1,5 +1,5 @@
###
# fastq -> BAM workflow
# fastq2bam workflow
##

# fastq download
@@ -32,7 +32,7 @@ merge_bams:
mem: 9000

###
# BAM -> VCF workflow
# Intervals workflow
###

# preprocess genome, create intervals
@@ -43,9 +43,9 @@ process_ref:
create_intervals:
mem: 5000



# gatk workflow
###
# bam2vcf workflows
###

# gatk HaplotypeCaller
bam2gvcf:
@@ -56,22 +56,21 @@ gvcf2DB:
# gatk GenotypeGVCFs
DB2vcf:
mem: 30000
# gatk GatherVcfs
gatherVcfs:
mem: 30000
# vcftools program
vcftools:
mem: 30000


# freebayes workflow

# freebayes program
## freebayes program only! ##
bam2vcf:
mem: 30000
# gatk GatherVcfs, same as above!
# gatk filterVcfs
filterVcfs:
mem: 30000
# gatk GatherVcfs
gatherVcfs:
mem: 30000
# picard SortVcf
sortVcf:
mem: 30000
# vcftools program
vcftools:
mem: 30000
# bedtools program
bedtools:
mem: 30000
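
The entries above are consumed by the resources blocks of the rules that follow. A hedged sketch of the pattern (assuming the workflow loads resources.yaml with PyYAML, which is listed in envs/bam2vcf.yml; the loader shown here is illustrative, not the repository's actual code):

import yaml

with open("resources.yaml") as fh:
    res_config = yaml.safe_load(fh)

def mem_mb(rule_name):
    # Mirrors the pattern used in the rules below: each Snakemake retry
    # attempt multiplies the base memory taken from resources.yaml.
    return lambda wildcards, attempt: attempt * res_config[rule_name]["mem"]

# e.g. resources: mem_mb = mem_mb("filterVcfs") would request 30000 MB on the
# first attempt and 60000 MB on a retry.
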
69 changes: 54 additions & 15 deletions rules/bam2vcf_fb.smk
@@ -28,27 +28,26 @@ rule bam2vcf:
"{input.bams} "
"> {output.vcf}\n"

rule gatherVcfs:
rule filterVcfs:
"""
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
This rule filters all of the VCFs
"""
input:
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
vcf = vcfDir + "L{list}.vcf",
ref = config['ref']
output:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
vcf = vcfDir + "L{list}_filter.vcf"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
shell:
"gatk VariantFiltration "
"-R {input.ref} "
"-V {input.vcfs} "
"--output {output.vcfs} "
"-V {input.vcf} "
"--output {output.vcf} "
"--filter-name \"RPRS_filter\" "
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
"--filter-name \"FS_SOR_filter\" "
@@ -58,23 +57,63 @@ rule gatherVcfs:
"--filter-name \"QUAL_filter\" "
"--filter-expression \"QUAL < 30.0\" "
"--invalidate-previous-filters true\n"


rule gatherVcfs:
"""
This rule gathers all the filtered VCFs, one per list, into one final VCF
"""
input:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
output:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
shell:
"gatk GatherVcfs "
"{params.gatherVcfsInput} "
"-O {output.vcfFinal}"
"-O {output.vcf}"

rule sortVcf:
"""
Sort VCF for more efficient processing by vcftools and bedtools
"""
input:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
output:
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
shell:
"picard SortVcf -I {input.vcf} -O {output.vcf}"

rule vcftools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + "intervals_fb.bed"
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
missing = gatkDir + "missing_data_per_ind.txt",
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
missing = gatkDir + "missing_data_per_ind.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
shell:
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"

rule bedtools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
shell:
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"
68 changes: 54 additions & 14 deletions rules/bam2vcf_gatk.smk
@@ -110,27 +110,26 @@ rule DB2vcf:
"-O {output.vcf} "
"--tmp-dir {vcfDir}tmp"

rule gatherVcfs:
rule filterVcfs:
"""
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
This rule filters all of the VCFs
"""
input:
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
vcf = vcfDir + "L{list}.vcf",
ref = config['ref']
output:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
vcf = vcfDir + "L{list}_filter.vcf"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
shell:
"gatk VariantFiltration "
"-R {input.ref} "
"-V {input.vcfs} "
"--output {output.vcfs} "
"-V {input.vcf} "
"--output {output.vcf} "
"--filter-name \"RPRS_filter\" "
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
"--filter-name \"FS_SOR_filter\" "
@@ -140,22 +139,63 @@ rule gatherVcfs:
"--filter-name \"QUAL_filter\" "
"--filter-expression \"QUAL < 30.0\" "
"--invalidate-previous-filters true\n"


rule gatherVcfs:
"""
This rule gathers all the filtered VCFs, one per list, into one final VCF
"""
input:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
output:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
shell:
"gatk GatherVcfs "
"{params.gatherVcfsInput} "
"-O {output.vcfFinal}"
"-O {output.vcf}"

rule sortVcf:
"""
Sort VCF for more efficient processing by vcftools and bedtools
"""
input:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
output:
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
shell:
"picard SortVcf -I {input.vcf} -O {output.vcf}"

rule vcftools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
missing = gatkDir + "missing_data_per_ind.txt",
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
missing = gatkDir + "missing_data_per_ind.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
shell:
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"

rule bedtools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
shell:
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"
