# Snakefiles/Snakefile-counts

import glob
import warnings
from pathlib import Path

from snakemake.utils import min_version

##### set minimum snakemake version #####
min_version("5.18.0")


# check existence of the SAMP3 variable. it may have already been defined if
# this Snakefile is being included from somewhere else
if 'SAMP3' not in globals():
    def read_samples():
        """Function to get sample names and bam paths from the sample file
        specified in the configuration (config['sample_file']). Each line of
        the file is expected to have 4 tab-separated columns:
        <vcf_sample_id> <unique_sample_id> <dna_bam_path> <rna_bam_path>. If
        <dna_bam_path> is left out (i.e. a line has only 3 columns), the
        pipeline will default to an rna-only analysis, which doesn't require
        dna reads but is more conservative. Blank lines are ignored.

        Returns a dictionary of <unique_sample_id> keys and tuple values:
        (rna_bam,) in an rna-only analysis or (dna_bam, rna_bam) otherwise.
        As a side effect, config['rna_only'] is set to True if any line has
        only 3 columns."""
        rows = []
        # use a context manager so that the file handle is always closed
        with open(config['sample_file'], "r") as f:
            for line in f:
                words = line.strip().split("\t")
                if words == ['']:
                    # skip blank lines (e.g. a trailing newline at the end)
                    continue
                rows.append(words)
        # sanity check to make sure that rna_only is set to true whenever a
        # dna bam is missing. this is decided before any tuple is built so
        # that the result doesn't depend on the order of the lines
        if any(len(words) == 3 for words in rows):
            config['rna_only'] = True
        samp_dict = {}
        for words in rows:
            if len(words) == 3:
                # no dna bam: the third column is the rna bam
                samp_dict[words[1]] = (words[2],)
            elif config['rna_only'] and len(words) == 4:
                # a dna bam was provided but isn't needed; keep only the rna
                samp_dict[words[1]] = (words[3],)
            else:
                samp_dict[words[1]] = (words[2], words[3])
        return samp_dict
    SAMP3 = read_samples()

# the user can define config['SAMP_NAMES'] to contain whichever sample names
# they'd like to run the pipeline on. otherwise, every sample in SAMP3 is used
if 'SAMP_NAMES' not in config:
    config['SAMP_NAMES'] = list(SAMP3)
else:
    # double check that the user isn't asking for samples they haven't provided
    requested = config['SAMP_NAMES']
    # keep the user's ordering (and drop duplicates) instead of going through
    # a set, whose iteration order can differ from one run to the next
    config['SAMP_NAMES'] = list(
        dict.fromkeys(samp for samp in requested if samp in SAMP3)
    )
    if len(config['SAMP_NAMES']) != len(requested):
        warnings.warn("Not all of the samples requested have provided input. Proceeding with as many samples as is possible...")

# check existence of SAMP_TO_VCF_ID. it may have already been defined if this
# Snakefile is being included from somewhere else
if 'SAMP_TO_VCF_ID' not in globals():
    def read_vcf_samples():
        """Function to map sample names to their IDs in the VCF. Reads the
        tab-separated sample file specified in the configuration
        (config['sample_file']), whose first column is the <vcf_sample_id>
        and whose second column is the <unique_sample_id>. Blank lines are
        ignored.

        Returns a dictionary of <unique_sample_id> keys and <vcf_sample_id>
        values, restricted to the samples in config['SAMP_NAMES']."""
        # build the lookup once instead of scanning the list for every line
        wanted = set(config['SAMP_NAMES'])
        samp_dict = {}
        # use a context manager so that the file handle is always closed
        with open(config['sample_file'], "r") as f:
            for line in f:
                words = line.strip().split("\t")
                if words == ['']:
                    # skip blank lines (e.g. a trailing newline at the end)
                    continue
                if words[1] in wanted:
                    samp_dict[words[1]] = words[0]
        return samp_dict
    SAMP_TO_VCF_ID = read_vcf_samples()


# only define a default target when no rule named 'all' exists yet (e.g. one
# declared by a Snakefile that includes this one)
if not hasattr(rules, 'all'):
    rule all:
        # request the final result.csv.gz of every sample in
        # config['SAMP_NAMES']. if you'd like to run the pipeline on only a
        # subset of the samples, you should specify them in the
        # config['SAMP_NAMES'] variable
        input:
            expand(config['output_dir'] + "/final/{sample}/result.csv.gz",
                   sample=config['SAMP_NAMES'])

rule extract_gq_scores:
    """Query the input vcf with bcftools for a single sample and write a
    gzipped table of its GQ scores.
    The table will have columns: CHROM, POS, REF, ALT, and GQ"""
    input:
        vcf = config['vcf_file']
    output:
        config['output_dir'] + "/extract_gq/{vcf_sample}.tsv.gz"
    resources:
        mem_mb = 100
    benchmark: config['output_dir'] + "/benchmark/counts/extract_gq/{vcf_sample}.tsv"
    conda: "../envs/default.yaml"
    shell:
        "bcftools query "
        "-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t[%GQ]\\n' "
        "-s {wildcards.vcf_sample} "
        "{input.vcf} "
        "| gzip >{output}"

# include snp2h5_rules.smk unless a rule named 'vcf2h5' has already been
# defined (e.g. by a Snakefile that includes this one). presumably that file
# provides the vcf2h5 and get_WASP rules referenced by get_as_counts -- verify
if not hasattr(rules, 'vcf2h5'):
    include: "snp2h5_rules.smk"

rule get_as_counts:
    """Run the bam2h5 script from the get_WASP rule on a sample's bam file to
    get allele-specific read counts for SNPs. The HDF5 count files are marked
    as temp(); only the gzipped text counts are kept."""
    # each SAMP3 entry is indexed by position: index 1 is used only for rna
    # reads when the analysis is not rna-only, and index 0 in every other case
    # (with read_samples in this file that is (dna_bam, rna_bam) or (rna_bam,))
    input:
        bam = lambda wildcards: SAMP3[wildcards.sample][
            1 if (wildcards.type == "rna" and not config['rna_only']) else 0
        ],
        snp_index = rules.vcf2h5.output.snp_index,
        snp_tab = rules.vcf2h5.output.snp_tab,
        haplotype = rules.vcf2h5.output.haplotype,
        chrom = config['chrom_info'],
        bam2h5_script = rules.get_WASP.output.bam2h5_script
    params:
        sample_name = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample]
    output:
        ref_as_counts = temp(
            config['output_dir'] + "/as_counts/{sample}/{type}.ref_as_counts.h5"
        ),
        alt_as_counts = temp(
            config['output_dir'] + "/as_counts/{sample}/{type}.alt_as_counts.h5"
        ),
        other_as_counts = temp(
            config['output_dir'] + "/as_counts/{sample}/{type}.other_as_counts.h5"
        ),
        read_counts = temp(
            config['output_dir'] + "/as_counts/{sample}/{type}.read_counts.h5"
        ),
        txt_counts = (
            config['output_dir'] + "/as_counts/{sample}/{type}.as_counts.txt.gz"
        )
    resources:
        mem_mb = 6300
    benchmark: config['output_dir'] + "/benchmark/counts/get_as_counts/{sample}_{type}.tsv"
    conda: "../envs/default.yaml"
    shell:
        "python {input.bam2h5_script} "
        "--chrom {input.chrom} "
        "--snp_index {input.snp_index} "
        "--snp_tab {input.snp_tab} "
        "--haplotype {input.haplotype} "
        "--individual {params.sample_name} "
        "--ref_as_counts {output.ref_as_counts} "
        "--alt_as_counts {output.alt_as_counts} "
        "--other_as_counts {output.other_as_counts} "
        "--read_counts {output.read_counts} "
        "--txt_counts {output.txt_counts} "
        "--data_type uint16 "
        "{input.bam}"

rule extract_hets:
    """Keep only the lines of the counts file that contain a heterozygous
    genotype (' 0|1 ' or ' 1|0 '), to make sure there aren't any non
    heterozygous variants in the counts"""
    input:
        rules.get_as_counts.output.txt_counts
    output:
        config['output_dir'] +
        "/as_counts/{sample}/{type}.as_counts.hets.txt.gz"
    conda: "../envs/default.yaml"
    benchmark: config['output_dir'] + "/benchmark/counts/extract_hets/{sample}_{type}.tsv"
    resources:
        mem_mb = 50
    # the backslashes are doubled so that python passes a literal '\|' on to
    # grep -E (which treats it as a plain '|' character). a single backslash
    # would be an invalid escape sequence in a regular python string literal
    shell:
        "zcat {input} | grep -E ' 0\\|1 | 1\\|0 ' | gzip >{output}"

rule remove_indel_counts:
    """Pipe the heterozygous counts through scripts/remove_indels.r to remove
    indels, in case the input VCF contained any, and gzip the result"""
    # sep is a single space wrapped in single quotes so that the shell hands
    # it to the R script as one argument (presumably the field separator of
    # the counts file -- confirm against remove_indels.r)
    input:
        rules.extract_hets.output
    output:
        config['output_dir'] +
        "/as_counts/{sample}/{type}.as_counts.hets.snps.txt.gz"
    params:
        sep = "' '"
    resources:
        mem_mb = 750
    benchmark: config['output_dir'] + "/benchmark/counts/remove_indel_counts/{sample}_{type}.tsv"
    conda: "../envs/default.yaml"
    shell:
        "Rscript --vanilla scripts/remove_indels.r "
        "{input} {params.sep} "
        "| gzip >{output}"

if config['rna_only']:
    rule prepare_counts:
        """Run scripts/prepare_counts-rna.r on a sample's rna counts to
        prepare them for detecting imbalance, adding gene info and
        calculating genotype error (rna-only analysis)"""
        # the per-sample GQ table is only requested as input when
        # config['default_gq'] is missing or falsy. otherwise, the configured
        # value is handed to the script in place of the table's path
        input:
            rna_counts = lambda wildcards: (
                rules.remove_indel_counts.output[0].format(
                    sample=wildcards.sample, type="rna"
                )
            ),
            gq_file = lambda wildcards: (
                [] if config.get('default_gq')
                else rules.extract_gq_scores.output[0].format(
                    vcf_sample=SAMP_TO_VCF_ID[wildcards.sample]
                )
            ),
            gene_info = config['gene_info']
        output:
            rna = config['output_dir'] + "/final/{sample}/rna.csv.gz"
        params:
            output_dir = lambda wildcards, output: (
                str(Path(output[0]).parent) + "/"
            ),
            gq = lambda wildcards, input: (
                config.get('default_gq') or input.gq_file
            )
        resources:
            mem_mb = 12500
        benchmark: config['output_dir'] + "/benchmark/counts/prepare_counts/{sample}.tsv"
        conda: "../envs/default.yaml"
        shell:
            "Rscript --no-save --no-restore "
            "scripts/prepare_counts-rna.r "
            "{input.rna_counts} {params.gq} "
            "{input.gene_info} {params.output_dir}"

    rule detect_imbalance:
        """Run scripts/find_imbalance-rna.r on a sample's prepared rna counts
        to quantify allelic imbalance in genes, gzipping whatever the script
        writes to stdout (rna-only analysis)"""
        input:
            rna = rules.prepare_counts.output.rna,
            gene_info = config['gene_info']
        output:
            config['output_dir'] + "/final/{sample}/result.csv.gz"
        params:
            imbalance_script_path = "scripts/allele_imbalance-rna.r"
        resources:
            mem_mb = 1950
        benchmark: config['output_dir'] + "/benchmark/counts/detect_imbalance/{sample}.tsv"
        conda: "../envs/default.yaml"
        shell:
            "Rscript --no-save --no-restore "
            "scripts/find_imbalance-rna.r "
            "{params.imbalance_script_path} "
            "{input.rna} {input.gene_info} "
            "| gzip >{output}"
else:
    rule prepare_counts:
        """Run scripts/prepare_counts.r on a sample's dna and rna counts to
        prepare them for detecting imbalance, adding gene info and
        calculating genotype error"""
        # the per-sample GQ table is only requested as input when
        # config['default_gq'] is missing or falsy. otherwise, the configured
        # value is handed to the script in place of the table's path
        input:
            dna_counts = lambda wildcards: (
                rules.remove_indel_counts.output[0].format(
                    sample=wildcards.sample, type="dna"
                )
            ),
            rna_counts = lambda wildcards: (
                rules.remove_indel_counts.output[0].format(
                    sample=wildcards.sample, type="rna"
                )
            ),
            gq_file = lambda wildcards: (
                [] if config.get('default_gq')
                else rules.extract_gq_scores.output[0].format(
                    vcf_sample=SAMP_TO_VCF_ID[wildcards.sample]
                )
            ),
            gene_info = config['gene_info']
        output:
            dna = config['output_dir'] + "/final/{sample}/dna.csv.gz",
            rna = config['output_dir'] + "/final/{sample}/rna.csv.gz"
        params:
            output_dir = lambda wildcards, output: (
                str(Path(output[0]).parent) + "/"
            ),
            gq = lambda wildcards, input: (
                config.get('default_gq') or input.gq_file
            )
        resources:
            mem_mb = 12500
        benchmark: config['output_dir'] + "/benchmark/counts/prepare_counts/{sample}.tsv"
        conda: "../envs/default.yaml"
        shell:
            "Rscript --no-save --no-restore "
            "scripts/prepare_counts.r "
            "{input.dna_counts} {input.rna_counts} "
            "{params.gq} "
            "{input.gene_info} {params.output_dir}"

    rule detect_imbalance:
        """Run scripts/find_imbalance.r on a sample's prepared dna and rna
        counts to quantify allelic imbalance in genes, gzipping whatever the
        script writes to stdout"""
        input:
            dna = rules.prepare_counts.output.dna,
            rna = rules.prepare_counts.output.rna,
            gene_info = config['gene_info']
        output:
            config['output_dir'] + "/final/{sample}/result.csv.gz"
        params:
            imbalance_script_path = "scripts/allele_imbalance.r"
        resources:
            mem_mb = 1950
        benchmark: config['output_dir'] + "/benchmark/counts/detect_imbalance/{sample}.tsv"
        conda: "../envs/default.yaml"
        shell:
            "Rscript --no-save --no-restore "
            "scripts/find_imbalance.r "
            "{params.imbalance_script_path} "
            "{input.dna} {input.rna} {input.gene_info} "
            "| gzip >{output}"