Merge pull request #7 from harvardinformatics/gatk_bug
GATK rework
tsackton authored Aug 3, 2021
2 parents 4ae2168 + 35f505c commit 6366da6
Showing 5 changed files with 132 additions and 58 deletions.
1 change: 1 addition & 0 deletions envs/bam2vcf.yml
@@ -11,3 +11,4 @@ dependencies:
- bedtools==2.29.2
- pyyaml==5.3.1
- htslib==1.11
- bzip2==1.0.8
19 changes: 7 additions & 12 deletions helperFun.py
@@ -332,19 +332,14 @@ def createListsGetIndices(intDir, maxIntervalLen, maxBpPerList, maxIntervalsPerL
# does adding the next interval put us over the maximum values for our two thresholds?
if (runningSumBp + intervalLen) >= maxBpPerList or (runningSum_intervals + 1) >= maxIntervalsPerList:
if len(current_intervals) == 0:
# if you decide to lower maxBpPerList below maxIntervalLen, it's possible to have an uninitialized
# current_intervals that doesn't make it to the subsequent else statement
printIntervalsToListFile(intDir, listFile_index, [ (scaff, start, stop) ], refFileName)
runningSumBp = 0
runningSum_intervals = 0
else:
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
current_intervals = [ (scaff, start, stop) ]
# flush out current_intervals into a list file
printIntervalsToListFile(intDir, listFile_index, current_intervals, refFileName)
#out = open(f"{intDir}list{listFile_index}.list", 'w')
#for i in current_intervals:
# print(f"{i[0]}:{i[1]}-{i[2]}", file=out)
#out.close()
# re-initialize data for next list file
current_intervals = [ (scaff, start, stop) ]
runningSumBp = intervalLen
runningSum_intervals = 1
runningSumBp = intervalLen
runningSum_intervals = 1
listFile_index += 1
else:
current_intervals.append( (scaff, start, stop) )
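
For readers tracing the hunk above outside the diff: a minimal, self-contained sketch of the list-packing logic as it reads after this change (pack_intervals and print_list_file are illustrative stand-ins, not names from helperFun.py):

def pack_intervals(intervals, maxBpPerList, maxIntervalsPerList, print_list_file):
    # intervals: iterable of (scaff, start, stop); print_list_file writes one .list file
    current_intervals, runningSumBp, runningSum_intervals, listFile_index = [], 0, 0, 0
    for scaff, start, stop in intervals:
        intervalLen = stop - start
        if (runningSumBp + intervalLen) >= maxBpPerList or (runningSum_intervals + 1) >= maxIntervalsPerList:
            if len(current_intervals) == 0:
                # a single interval can exceed maxBpPerList on its own; write it alone
                print_list_file(listFile_index, [(scaff, start, stop)])
                runningSumBp, runningSum_intervals = 0, 0
            else:
                # flush the accumulated intervals, then start the next list with this interval
                print_list_file(listFile_index, current_intervals)
                current_intervals = [(scaff, start, stop)]
                runningSumBp, runningSum_intervals = intervalLen, 1
            listFile_index += 1
        else:
            current_intervals.append((scaff, start, stop))
            runningSumBp += intervalLen
            runningSum_intervals += 1
    if current_intervals:
        # write any remaining intervals to a final list file
        print_list_file(listFile_index, current_intervals)
        listFile_index += 1
    return listFile_index
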
33 changes: 16 additions & 17 deletions resources.yaml
@@ -1,5 +1,5 @@
###
# fastq -> BAM workflow
# fastq2bam workflow
##

# fastq download
@@ -32,7 +32,7 @@ merge_bams:
mem: 9000

###
# BAM -> VCF workflow
# Intervals workflow
###

# preprocess genome, create intervals
@@ -43,9 +43,9 @@ process_ref:
create_intervals:
mem: 5000



# gatk workflow
###
# bam2vcf workflows
###

# gatk HaplotypeCaller
bam2gvcf:
@@ -56,22 +56,21 @@ gvcf2DB:
# gatk GenotypeGVCFs
DB2vcf:
mem: 30000
# gatk GatherVcfs
gatherVcfs:
mem: 30000
# vcftools program
vcftools:
mem: 30000


# freebayes workflow

# freebayes program
## freebayes program only! ##
bam2vcf:
mem: 30000
# gatk GatherVcfs, same as above!
# gatk filterVcfs
filterVcfs:
mem: 30000
# gatk GatherVcfs
gatherVcfs:
mem: 30000
# picard SortVcf
sortVcf:
mem: 30000
# vcftools program
vcftools:
mem: 30000
# bedtools program
bedtools:
mem: 30000
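
The entries above are consumed by the resources blocks of the rules that follow. A hedged sketch of the pattern (assuming the workflow loads resources.yaml with PyYAML, which is listed in envs/bam2vcf.yml; the loader shown here is illustrative, not the repository's actual code):

import yaml

with open("resources.yaml") as fh:
    res_config = yaml.safe_load(fh)

def mem_mb(rule_name):
    # Mirrors the pattern used in the rules below: each Snakemake retry
    # attempt multiplies the base memory taken from resources.yaml.
    return lambda wildcards, attempt: attempt * res_config[rule_name]["mem"]

# e.g. resources: mem_mb = mem_mb("filterVcfs") would request 30000 MB on the
# first attempt and 60000 MB on a retry.
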
69 changes: 54 additions & 15 deletions rules/bam2vcf_fb.smk
@@ -28,27 +28,26 @@ rule bam2vcf:
"{input.bams} "
"> {output.vcf}\n"

rule gatherVcfs:
rule filterVcfs:
"""
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
This rule filters all of the VCFs
"""
input:
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
vcf = vcfDir + "L{list}.vcf",
ref = config['ref']
output:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
vcf = vcfDir + "L{list}_filter.vcf"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
shell:
"gatk VariantFiltration "
"-R {input.ref} "
"-V {input.vcfs} "
"--output {output.vcfs} "
"-V {input.vcf} "
"--output {output.vcf} "
"--filter-name \"RPRS_filter\" "
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
"--filter-name \"FS_SOR_filter\" "
@@ -58,23 +57,63 @@ rule gatherVcfs:
"--filter-name \"QUAL_filter\" "
"--filter-expression \"QUAL < 30.0\" "
"--invalidate-previous-filters true\n"


rule gatherVcfs:
"""
This rule gathers all the filtered VCFs, one per list, into one final VCF
"""
input:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
output:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
shell:
"gatk GatherVcfs "
"{params.gatherVcfsInput} "
"-O {output.vcfFinal}"
"-O {output.vcf}"

rule sortVcf:
"""
Sort VCF for more efficient processing by vcftools and bedtools
"""
input:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
output:
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
shell:
"picard SortVcf -I {input.vcf} -O {output.vcf}"

rule vcftools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + "intervals_fb.bed"
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
missing = gatkDir + "missing_data_per_ind.txt",
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
missing = gatkDir + "missing_data_per_ind.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
shell:
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"

rule bedtools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
shell:
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"
68 changes: 54 additions & 14 deletions rules/bam2vcf_gatk.smk
@@ -110,27 +110,26 @@ rule DB2vcf:
"-O {output.vcf} "
"--tmp-dir {vcfDir}tmp"

rule gatherVcfs:
rule filterVcfs:
"""
This rule filters all of the VCFs, then gathers, one per list, into one final VCF
This rule filters all of the VCFs
"""
input:
vcfs = expand(vcfDir + "L{list}.vcf", list=LISTS),
vcf = vcfDir + "L{list}.vcf",
ref = config['ref']
output:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS),
vcfFinal = config["gatkDir"] + config['spp'] + "_final.vcf.gz"
vcf = vcfDir + "L{list}_filter.vcf"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
mem_mb = lambda wildcards, attempt: attempt * res_config['filterVcfs']['mem'] # this is the overall memory requested
shell:
"gatk VariantFiltration "
"-R {input.ref} "
"-V {input.vcfs} "
"--output {output.vcfs} "
"-V {input.vcf} "
"--output {output.vcf} "
"--filter-name \"RPRS_filter\" "
"--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
"--filter-name \"FS_SOR_filter\" "
@@ -140,22 +139,63 @@ rule gatherVcfs:
"--filter-name \"QUAL_filter\" "
"--filter-expression \"QUAL < 30.0\" "
"--invalidate-previous-filters true\n"


rule gatherVcfs:
"""
This rule gathers all the filtered VCFs, one per list, into one final VCF
"""
input:
vcfs = expand(vcfDir + "L{list}_filter.vcf", list=LISTS)
output:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
params:
gatherVcfsInput = helperFun.getVcfs_gatk(LISTS, vcfDir)
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['gatherVcfs']['mem'] # this is the overall memory requested
shell:
"gatk GatherVcfs "
"{params.gatherVcfsInput} "
"-O {output.vcfFinal}"
"-O {output.vcf}"

rule sortVcf:
"""
Sort VCF for more efficient processing by vcftools and bedtools
"""
input:
vcf = config["gatkDir"] + config["spp"] + ".vcf.gz"
output:
vcf = config["gatkDir"] + config["spp"] + "_final.vcf.gz"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['sortVcf']['mem'] # this is the overall memory requested
shell:
"picard SortVcf -I {input.vcf} -O {output.vcf}"

rule vcftools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
missing = gatkDir + "missing_data_per_ind.txt",
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
missing = gatkDir + "missing_data_per_ind.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['vcftools']['mem'] # this is the overall memory requested
shell:
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}\n"
"bedtools intersect -a {input.int} -b {input.vcf} -c > {output.SNPsPerInt}"
"vcftools --gzvcf {input.vcf} --remove-filtered-all --minDP 1 --stdout --missing-indv > {output.missing}"

rule bedtools:
input:
vcf = config["gatkDir"] + config['spp'] + "_final.vcf.gz",
int = intDir + config["genome"] + "_intervals_fb.bed"
output:
SNPsPerInt = gatkDir + "SNP_per_interval.txt"
conda:
"../envs/bam2vcf.yml"
resources:
mem_mb = lambda wildcards, attempt: attempt * res_config['bedtools']['mem'] # this is the overall memory requested
shell:
"bedtools coverage -a {input.int} -b {input.vcf} -counts -sorted > {output.SNPsPerInt}"
