"""
Processes input fastq files to create consensus genomes and their associated
summary statistics for the Seattle Flu Study.
Basic steps:
1. Trim raw fastq's with Trimmomatic
2. Map trimmed reads to each reference genomes in the reference panel using
bowtie2 # This step may change with time
~3. Remove duplicate reads using Picard~
4. Call consensus(SNPs/indels) using varscan
5. Use consensus to generate full consensus genomes for each sample x reference
virus combination
6. Compute summary statistics for each sample x refernce virus combination
Adapted from Louise Moncla's illumina pipeline for influenza snp calling:
https://github.com/lmoncla/illumina_pipeline
"""
import glob
from datetime import datetime
from itertools import product
#### Helper functions and variable definitions
def generate_sample_ids(config: dict) -> set:
"""
    Find all fastq.gz files within the fastq_directory in the *config* and
parse the sample ids from these files to return as a set.
Expects the file names to have the format of <sample_id>_*.fastq.gz
Will ignore all sample ids listed under "ignored_samples" of *config*
"""
all_ids = set()
for filename in glob.glob("{}/*".format(config['fastq_directory'])):
if filename.endswith('.fastq.gz'):
sample_id = filename.split('.')[0].split('/')[-1].split('_')[0]
if sample_id in config['ignored_samples']:
continue
all_ids.add(sample_id)
return all_ids
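# generate_sample_ids example (hypothetical filename):
# "fastq/ABC123_S1_L001_R1_001.fastq.gz" yields the sample id "ABC123"
# via the split chain above.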
def generate_all_files(sample: str, config: dict) -> tuple:
"""
For a given sample, determine all fastqs as a tuple of forward
and reverse within the fastq_directory in the *config*.
    The fastqs are expected to have *sample* in the filename and
R1/R2 in the filename to differentiate forward and reverse reads.
"""
r1 = glob.glob('{}/*{}*R1*'.format(config['fastq_directory'], sample))
r2 = glob.glob('{}/*{}*R2*'.format(config['fastq_directory'], sample))
return (r1, r2)
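# generate_all_files example (hypothetical files): generate_all_files("ABC123", config)
# would return (["fastq/ABC123_S1_R1_001.fastq.gz"], ["fastq/ABC123_S1_R2_001.fastq.gz"]),
# i.e. one list of lane files per read direction.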
def filter_combinator(combinator, config):
"""
Custom combinatoric function to be used with :py:func:`snakemake.io.expand`
Only generates combination of sample and reference wildcards that are
listed under `sample_reference_pairs` of the *config*.
If no references are listed for a sample, will generate all possible
combinations with all references.
"""
def filtered_combinator(*args, **kwargs):
for wc_comb in combinator(*args, **kwargs):
if wc_comb[0][1] not in config["sample_reference_pairs"]:
yield wc_comb
elif len(config["sample_reference_pairs"][wc_comb[0][1]]) == 0:
yield wc_comb
elif wc_comb[1][1] in config["sample_reference_pairs"][wc_comb[0][1]]:
yield wc_comb
return filtered_combinator
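# filter_combinator example (hypothetical config): with
#   sample_reference_pairs: {"ABC123": ["h3n2"], "XYZ789": []}
# sample ABC123 is paired only with the h3n2 reference, while XYZ789 (empty
# list) and any sample not listed at all are paired with every reference.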
# Build static lists of reference genomes, sample IDs, and a map of sample ID -> input fastqs
all_references = list(config['reference_viruses'].keys())
all_ids = generate_sample_ids(config)
mapped = {id: generate_all_files(id, config) for id in all_ids}
filtered_product = filter_combinator(product, config)
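# `filtered_product` replaces the default combinator in expand(); a sketch of
# the intended call in a downstream target rule (note sample must come before
# reference, matching the wc_comb indexing above):
#   expand("consensus_genomes/{reference}/{sample}.consensus.fasta",
#          filtered_product, sample=all_ids, reference=all_references)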
#### Main pipeline
rule index_reference_genome:
input:
raw_reference = "references/{reference}.fasta"
output:
indexed_reference = "references/{reference}.1.bt2"
shell:
"""
bowtie2-build {input.raw_reference} references/{wildcards.reference}
"""
rule merge_lanes:
input:
all_r1 = lambda wildcards: mapped[wildcards.sample][0],
all_r2 = lambda wildcards: mapped[wildcards.sample][1]
output:
merged_r1 = "process/merged/{sample}_R1.fastq.gz",
merged_r2 = "process/merged/{sample}_R2.fastq.gz"
shell:
"""
cat {input.all_r1} >> {output.merged_r1}
cat {input.all_r2} >> {output.merged_r2}
"""
rule trim_fastqs:
input:
fastq_f = "process/merged/{sample}_R1.fastq.gz",
fastq_r = "process/merged/{sample}_R2.fastq.gz"
output:
trimmed_fastq_1p = "process/trimmed/{sample}.trimmed_1P.fastq",
trimmed_fastq_2p = "process/trimmed/{sample}.trimmed_2P.fastq",
trimmed_fastq_1u = "process/trimmed/{sample}.trimmed_1U.fastq",
trimmed_fastq_2u = "process/trimmed/{sample}.trimmed_2U.fastq"
params:
paired_end = config["params"]["trimmomatic"]["paired_end"],
adapters = config["params"]["trimmomatic"]["adapters"],
illumina_clip = config["params"]["trimmomatic"]["illumina_clip"],
window_size = config["params"]["trimmomatic"]["window_size"],
trim_qscore = config["params"]["trimmomatic"]["trim_qscore"],
minimum_length = config["params"]["trimmomatic"]["minimum_length"]
benchmark:
"benchmarks/{sample}.trimmo"
shell:
"""
trimmomatic \
{params.paired_end} \
-phred33 \
{input.fastq_f} {input.fastq_r} \
-baseout process/trimmed/{wildcards.sample}.trimmed.fastq \
ILLUMINACLIP:{params.adapters}:{params.illumina_clip} \
SLIDINGWINDOW:{params.window_size}:{params.trim_qscore} \
MINLEN:{params.minimum_length}
"""
rule post_trim_fastqc:
input:
p1 = rules.trim_fastqs.output.trimmed_fastq_1p,
p2 = rules.trim_fastqs.output.trimmed_fastq_2p,
u1 = rules.trim_fastqs.output.trimmed_fastq_1u,
u2 = rules.trim_fastqs.output.trimmed_fastq_2u
output:
qc_1p_html = "summary/post_trim_fastqc/{sample}.trimmed_1P_fastqc.html",
qc_2p_html = "summary/post_trim_fastqc/{sample}.trimmed_2P_fastqc.html",
qc_1u_html = "summary/post_trim_fastqc/{sample}.trimmed_1U_fastqc.html",
qc_2u_html = "summary/post_trim_fastqc/{sample}.trimmed_2U_fastqc.html",
qc_1p_zip = "summary/post_trim_fastqc/{sample}.trimmed_1P_fastqc.zip",
qc_2p_zip = "summary/post_trim_fastqc/{sample}.trimmed_2P_fastqc.zip",
qc_1u_zip = "summary/post_trim_fastqc/{sample}.trimmed_1U_fastqc.zip",
qc_2u_zip = "summary/post_trim_fastqc/{sample}.trimmed_2U_fastqc.zip"
shell:
"""
fastqc {input.p1} -o summary/post_trim_fastqc
fastqc {input.p2} -o summary/post_trim_fastqc
fastqc {input.u1} -o summary/post_trim_fastqc
fastqc {input.u2} -o summary/post_trim_fastqc
"""
rule map:
input:
p1 = rules.trim_fastqs.output.trimmed_fastq_1p,
p2 = rules.trim_fastqs.output.trimmed_fastq_2p,
u1 = rules.trim_fastqs.output.trimmed_fastq_1u,
u2 = rules.trim_fastqs.output.trimmed_fastq_2u,
ref_file = rules.index_reference_genome.output.indexed_reference
output:
mapped_bam_file = "process/mapped/{reference}/{sample}.bam",
bt2_log = "summary/bowtie2/{reference}/{sample}.log"
params:
threads = config["params"]["bowtie2"]["threads"],
map_all = config["params"]["bowtie2"]["all"]
benchmark:
"benchmarks/{sample}_{reference}.bowtie2"
shell:
"""
bowtie2 \
-x references/{wildcards.reference} \
-1 {input.p1} \
-2 {input.p2} \
-U {input.u1},{input.u2} \
            -p {params.threads} \
{params.map_all} \
--local 2> {output.bt2_log} | \
samtools view -bSF4 - > {output.mapped_bam_file}
"""
rule sort:
input:
mapped_bam = rules.map.output.mapped_bam_file
output:
sorted_bam_file = "process/sorted/{reference}/{sample}.sorted.bam"
benchmark:
"benchmarks/{sample}_{reference}.sort"
shell:
"""
        samtools sort {input.mapped_bam} -o {output.sorted_bam_file}
"""
rule bamstats:
input:
sorted_bam = rules.sort.output.sorted_bam_file
output:
bamstats_file = "summary/bamstats/{reference}/{sample}.coverage_stats.txt"
benchmark:
"benchmarks/{sample}_{reference}.bamstats"
shell:
"""
BAMStats -i {input.sorted_bam} > {output.bamstats_file}
"""
checkpoint mapped_reads:
input:
bt2_log = rules.map.output.bt2_log,
bamstats = rules.bamstats.output.bamstats_file,
reference = "references/{reference}.fasta"
output:
"summary/checkpoint/{reference}/{sample}.json"
params:
min_cov = config["params"]["varscan"]["min_cov"],
raw_read_length = config["raw_read_length"]
shell:
"""
python scripts/checkpoint_mapped_reads.py \
--bamstats {input.bamstats} \
--bowtie2 {input.bt2_log} \
--min-cov {params.min_cov} \
--raw-read-length {params.raw_read_length} \
--reference {input.reference} \
            --output {output}
"""
rule not_mapped:
input:
sorted_bam = rules.sort.output.sorted_bam_file,
reference = "references/{reference}.fasta"
output:
not_mapped = "summary/not_mapped/{reference}/{sample}.txt"
shell:
"""
touch {output.not_mapped}
"""
rule pileup:
input:
sorted_bam = rules.sort.output.sorted_bam_file,
reference = "references/{reference}.fasta"
output:
pileup = "process/mpileup/{reference}/{sample}.pileup"
params:
depth = config["params"]["mpileup"]["depth"],
min_base_qual = config["params"]["varscan"]["snp_qual_threshold"]
benchmark:
"benchmarks/{sample}_{reference}.mpileup"
shell:
"""
        samtools mpileup -a -A -B \
            -Q {params.min_base_qual} \
            -d {params.depth} \
            -f {input.reference} \
            {input.sorted_bam} > {output.pileup}
"""
rule create_bed_file:
input:
pileup = rules.pileup.output.pileup
output:
bed_file = "summary/low_coverage/{reference}/{sample}.bed"
params:
min_cov = config["params"]["varscan"]["min_cov"],
min_freq = config["params"]["varscan"]["snp_frequency"]
shell:
"""
python scripts/create_bed_file_for_masking.py \
--pileup {input.pileup} \
--min-cov {params.min_cov} \
--min-freq {params.min_freq} \
--bed-file {output.bed_file}
"""
rule call_consensus:
input:
pileup = rules.pileup.output.pileup
output:
vcf = "process/vcfs/{reference}/{sample}.vcf"
params:
min_cov = config["params"]["varscan"]["min_cov"],
snp_qual_threshold = config["params"]["varscan"]["snp_qual_threshold"],
snp_frequency = config["params"]["varscan"]["snp_frequency"]
benchmark:
"benchmarks/{sample}_{reference}.varscan"
shell:
"""
varscan mpileup2cns \
{input.pileup} \
--min-coverage {params.min_cov} \
--min-avg-qual {params.snp_qual_threshold} \
--min-var-freq {params.snp_frequency} \
--strand-filter 0 \
--variants 1 \
--output-vcf 1 > {output.vcf}
"""
rule zip_vcf:
input:
vcf = rules.call_consensus.output.vcf
output:
bcf = "process/vcfs/{reference}/{sample}.vcf.gz"
shell:
"""
bgzip {input.vcf}
"""
rule index_bcf:
input:
bcf = rules.zip_vcf.output.bcf
output:
index = "process/vcfs/{reference}/{sample}.vcf.gz.csi"
shell:
"""
bcftools index {input}
"""
rule vcf_to_consensus:
input:
bcf = rules.zip_vcf.output.bcf,
index = rules.index_bcf.output.index,
mask = rules.create_bed_file.output.bed_file,
ref = "references/{reference}.fasta"
output:
consensus_genome = "consensus_genomes/{reference}/{sample}.consensus.fasta"
benchmark:
"benchmarks/{sample}_{reference}.consensus"
shell:
"""
cat {input.ref} | \
bcftools consensus {input.bcf} \
--mask {input.mask} > \
{output.consensus_genome}
"""
rule clean:
message: "Removing directories: {params}"
params:
"summary "
"process "
"benchmarks "
"references/*.bt2 "
"references/*.fai"
shell:
"""
rm -rfv {params}
"""