diff --git a/.gitignore b/.gitignore
index 122be1b..e7fc954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
\ No newline at end of file
+.snakemake-workflow-catalog.yml
+.test/results/*
\ No newline at end of file
diff --git a/.test/config/config.yml b/.test/config/config.yml
index cbca87a..c66fa7d 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 7969058..ead9156 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz
diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
new file mode 100644
index 0000000..7d42cb0
--- /dev/null
+++ b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
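Note on the new test shift table: every offset in the rows above follows the same linear rule, offset = -(read length - 16). A minimal sketch that would regenerate the file — the linear rule is simply read off the rows above, and the file path matches the diff; neither is documented workflow behavior:

```python
# Hypothetical generator for the test shift table: each read length
# ("fraction") maps to a 5'-end offset of -(length - 16).
import pandas as pd

lengths = list(range(27, 46))  # 27..45, matching the rows above
table = pd.DataFrame(
    {"fraction": lengths, "offsets_start": [-(n - 16) for n in lengths]}
)
table.to_csv(".test/config/shift_table/shift_table.csv", index=False)
```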
diff --git a/config/config.yml b/config/config.yml
index eec7b8a..c66fa7d 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -1,9 +1,3 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
 samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
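Note: both config files drop the `output: null` key, so a user-defined results folder can no longer be configured — all outputs go to `results/` relative to the working directory. A short sketch of how a script could consume the trimmed config (plain PyYAML; variable names are illustrative):

```python
# Sketch: reading the simplified config. With "output" removed, only the
# samplesheet location and tool parameters remain configurable.
import yaml

with open("config/config.yml") as handle:
    config = yaml.safe_load(handle)

samplesheet = config["samplesheet"]                 # "config/samples.tsv"
star_defaults = " ".join(config["star"]["default"])  # flat STAR argument string
print(samplesheet)
print(star_defaults)
```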
diff --git a/workflow/Snakefile b/workflow/Snakefile
index a5236ce..7252d3b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -13,10 +13,10 @@ import pandas as pd
 from datetime import date
 from snakemake.utils import min_version
 
-#min_version("7.0")
+# min_version("7.0")
 
 __author__ = "Rina Ahmed-Begrich, Michael Jahn"
-__year__ = str(date.today()).split('-')[0]
+__year__ = str(date.today()).split("-")[0]
 
 bold = "\033[1m"
 green = "\033[92m"
@@ -27,13 +27,14 @@ msg = f"""{cyan}Bacterial-Riboseq: A Snakemake workflow for the analysis of
 riboseq data in bacteria.{end}
 """
 
-epilog=f"""
+epilog = f"""
 {cyan}Written by {__author__}.
 Max Planck Unit for the Science of Pathogens.
 Copyright (c) {__year__}.
 Copyright Holder All Rights Reserved.{end}
 """
 
+
 # load configuration
 # -----------------------------------------------------
 configfile: "config/config.yml"
@@ -51,27 +52,29 @@ include: "rules/postprocessing.smk"
 shell.executable("bash")
 shell.prefix(f"set -eo pipefail; ")
 
-if config.get('output') is None:
-    config['output'] = os.path.join(os.getcwd(), "./results")
 
 onstart:
     print("\n--- Analysis started...\n")
     print()
     print("--- Analysis parameters --------------------------------------------\n")
-    print(f"Current working directory: {os.path.join(os.getcwd())}")
-    print(f"Output directory:", {config['output']})
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Output directory: {os.path.join(os.getcwd(), 'results')}")
     print()
     print(f"Riboseq samples: {list(samples.index)}")
     print()
 
+
 onsuccess:
     print()
     print(msg)
     print(epilog)
     print("--- Workflow finished, no error! -----------------------------------")
     print()
-    debug = os.path.join(config['output'], "workflow.log")
-    shell("cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}")
+    debug = os.path.join(os.getcwd(), "results/workflow.log")
+    shell(
+        "cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}"
+    )
+
 
 onerror:
     print()
@@ -79,7 +82,7 @@ onerror:
     print(msg)
     print(epilog)
     print("--- An error occurred! ---------------------------------------------")
     print()
-    error = os.path.join(config['output'], "error.log")
+    error = os.path.join(os.getcwd(), "results/error.log")
     shell("cat {log} > {error} && echo -e '\nAn error occurred!' >> {error}")
 
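Note: the handlers above now hard-code `os.getcwd()/results` where `config['output']` used to be interpolated. As plain Python, the log-copy step they perform is roughly the following (a sketch; `copy_run_log` and its arguments are hypothetical helpers, not workflow code):

```python
# Rough equivalent of the onsuccess/onerror bodies: copy Snakemake's own
# run log into the fixed results directory and append a status note.
import os
import shutil

def copy_run_log(snakemake_log: str, note: str, filename: str) -> None:
    dest = os.path.join(os.getcwd(), "results", filename)
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.copyfile(snakemake_log, dest)
    with open(dest, "a") as handle:
        handle.write(f"\n{note}\n")

# onsuccess: copy_run_log(log, "Workflow finished, no error!", "workflow.log")
# onerror:   copy_run_log(log, "An error occurred!", "error.log")
```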
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 532c25e..59255fe 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -10,7 +10,7 @@ samples = (
     .sort_index()
 )
 
-#TODO: write validation schema
+# TODO: write validation schema
 # validate(SAMPLES, schema="../config/schemas/samples.schema.yml")
 
 
@@ -18,39 +18,44 @@ def get_final_output():
     targets = []
     targets.append("results/multiqc/multiqc_report.html")
     targets.append(
-        expand("results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
+        expand(
+            "results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
             mapping_status=["mapped", "deduplicated", "filtered_bam"],
-            sample=samples.index)
+            sample=samples.index,
+        )
     )
     targets.append("results/get_genome/mRNA_features.gff")
     targets.append(
-        expand("results/shift_reads/{sample}_shift.csv",
-               sample=samples.index)
+        expand("results/shift_reads/{sample}_shift.csv", sample=samples.index)
     )
     return targets
 
 
 # get fastq files
 def get_fastq(wildcards):
-    if wildcards.status == 'raw':
+    if wildcards.status == "raw":
         return expand(
             "{input_dir}/{sample}",
             input_dir=samples.loc[wildcards.sample]["data_folder"],
-            sample=samples.loc[wildcards.sample]["fq1"])
-    if wildcards.status == 'clipped':
-        return expand(
-            "results/clipped/{sample}.fastq.gz",
-            sample=samples.index)
+            sample=samples.loc[wildcards.sample]["fq1"],
+        )
+    if wildcards.status == "clipped":
+        return expand("results/clipped/{sample}.fastq.gz", sample=samples.index)
 
 
 # get bam files
 def get_bam(wildcards):
-    if wildcards.mapping_status == 'mapped':
-        return expand(os.path.join("results", "mapped", "{sample}.bam"),
-                      sample=wildcards.sample)
-    if wildcards.mapping_status == 'deduplicated':
-        return expand(os.path.join("results", "deduplicated", "{sample}.bam"),
-                      sample=wildcards.sample)
-    if wildcards.mapping_status == 'filtered_bam':
-        return expand(os.path.join("results", "filtered_bam", "{sample}.bam"),
-                      sample=wildcards.sample)
+    if wildcards.mapping_status == "mapped":
+        return expand(
+            os.path.join("results", "mapped", "{sample}.bam"), sample=wildcards.sample
+        )
+    if wildcards.mapping_status == "deduplicated":
+        return expand(
+            os.path.join("results", "deduplicated", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
+    if wildcards.mapping_status == "filtered_bam":
+        return expand(
+            os.path.join("results", "filtered_bam", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
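Note: the reformatted `get_bam` branches differ only in the directory component, so the function could in principle collapse to a single path join — a hypothetical simplification, not part of this diff:

```python
# Hypothetical refactor of get_bam: one lookup instead of three branches.
# expand() is unnecessary here because both wildcards are single values.
import os

VALID_STATUS = {"mapped", "deduplicated", "filtered_bam"}

def get_bam(wildcards):
    assert wildcards.mapping_status in VALID_STATUS
    return os.path.join(
        "results", wildcards.mapping_status, f"{wildcards.sample}.bam"
    )
```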
diff --git a/workflow/rules/postprocessing.smk b/workflow/rules/postprocessing.smk
index a33986a..35d1c9d 100644
--- a/workflow/rules/postprocessing.smk
+++ b/workflow/rules/postprocessing.smk
@@ -2,6 +2,7 @@
 # Riboseq postprocessing:                               #
 # ----------------------------------------------------- #
 
+
 # module to extract selected biotypes from gff file
 # -----------------------------------------------------
 rule extract_mRNA_features:
@@ -60,7 +61,7 @@ rule shift_reads:
         """--- Shifting Ribo-Seq reads."""
     params:
         config["shift_reads"],
-    threads: workflow.cores,
+    threads: workflow.cores
     log:
         path="results/shift_reads/log/{sample}.log",
     script:
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
index 536ec52..36f9270 100644
--- a/workflow/rules/preprocessing.smk
+++ b/workflow/rules/preprocessing.smk
@@ -2,6 +2,7 @@
 # Riboseq preprocessing:                                #
 # ----------------------------------------------------- #
 
+
 # module to make QC report
 # -----------------------------------------------------
 rule fastqc:
@@ -10,12 +11,12 @@ rule fastqc:
     output:
         report=directory("results/fastqc_{status}/{sample}"),
     conda:
-        "../envs/fastqc.yml",
+        "../envs/fastqc.yml"
     message:
         """--- Checking fastq files with FastQC."""
     log:
         "results/fastqc_{status}/log/{sample}.log",
-    threads: int(workflow.cores * 0.2),  # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "mkdir -p {output.report};"
         "fastqc --nogroup -extract --threads {threads} -o {output.report} {input} > {log}"
@@ -25,9 +26,11 @@
 # -----------------------------------------------------
 rule cutadapt:
     input:
-        fastq=lambda wc: expand("{input_dir}/{sample}",
+        fastq=lambda wc: expand(
+            "{input_dir}/{sample}",
             input_dir=samples.loc[wc.sample]["data_folder"],
-            sample=samples.loc[wc.sample]["fq1"]),
+            sample=samples.loc[wc.sample]["fq1"],
+        ),
     output:
         fastq="results/clipped/{sample}.fastq.gz",
     conda:
@@ -41,7 +44,7 @@ rule cutadapt:
     log:
         stdout="results/clipped/log/{sample}.log",
         stderr="results/clipped/log/{sample}.stderr",
-    threads: int(workflow.cores * 0.4),  # assign 40% of max cores
+    threads: int(workflow.cores * 0.4)  # assign 40% of max cores
     shell:
         "if [ {params.fivep_adapter} != None ]; then "
         "fivep=`echo -g {params.fivep_adapter}`; "
@@ -152,7 +155,7 @@ rule star_mapping:
         outprefix=lambda w, output: f"{os.path.splitext(output.bam)[0]}_",
     log:
         path="results/mapped/log/{sample}.log",
-    threads: int(workflow.cores * 0.2),  # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "STAR "
         "--runThreadN {threads} "
@@ -177,12 +180,12 @@ rule mapping_sorted_bam:
     conda:
         "../envs/samtools.yml"
     log:
-        "results/mapped/log/samtools_{sample}.log"
-        # os.path.join(output_dir, "mapping/log/samtools_sort_{sample}.stderr")
-    message: """--- Samtools sort and index bam files."""
+        "results/mapped/log/samtools_{sample}.log",
+    message:
+        """--- Samtools sort and index bam files."""
     params:
         tmp="results/mapped/sort_{sample}_tmp",
-    threads: int(workflow.cores * 0.2),  # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "samtools sort -@ {threads} -O bam -T {params.tmp} -o {output.bam} {input} 2> {log}; "
         "samtools index -@ {threads} {output.bam} 2>> {log}"
@@ -204,7 +207,7 @@ rule umi_dedup:
     params:
         tmp="results/deduplicated/sort_{sample}_tmp",
         default=config["umi_dedup"],
-    threads: int(workflow.cores * 0.2),  # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     log:
         path="results/deduplicated/log/{sample}.log",
         stderr="results/deduplicated/log/{sample}.stderr",
@@ -253,7 +256,7 @@ rule filter_bam:
         "../envs/filter_bam.yml"
     params:
         defaults=config["bedtools_intersect"]["defaults"],
-    threads: int(workflow.cores * 0.2),  # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "intersectBed -abam {input.bam} -b {input.gff} {params.defaults} | "
         "samtools sort -@ {threads} > {output.bam} 2> {log.path}; "
@@ -265,30 +268,26 @@
 # -----------------------------------------------------
 rule extract_mapping_length:
     input:
-        get_bam,
+        bam=get_bam,
     output:
-        "results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
-    message: """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam"""
+        tsv="results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
+        pdf="results/{mapping_status}/length_dist/{sample}_length_dist.pdf",
+    message:
+        """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam"""
     conda:
-        "../envs/plot_mapping_length.yml",
+        "../envs/plot_mapping_length.yml"
     log:
-        os.path.join("results", "{mapping_status}", "length_dist", "log", "{sample}.log"),
-    params:
-        outdir=os.path.join("results","{mapping_status}", "length_dist"),
-    shell:
-        "workflow/scripts/plot_mapping_length.py -b {input} -o {params.outdir} -s {wildcards.sample} 2> {log}"
+        "results/{mapping_status}/length_dist/{sample}_length_dist.log",
+    script:
+        "../scripts/plot_mapping_length.py"
 
 
 # module to run multiQC on input + processed files
 # -----------------------------------------------------
 rule multiqc:
     input:
-        expand(
-            "results/fastqc_clipped/{sample}_fastqc.html",
-            sample=samples.index),
-        expand(
-            "results/clipped/{sample}.fastq.gz",
-            sample=samples.index),
+        expand("results/fastqc_clipped/{sample}_fastqc.html", sample=samples.index),
+        expand("results/clipped/{sample}.fastq.gz", sample=samples.index),
         expand(
             "results/umi_extraction/{sample}.fastq.gz",
             sample=samples.index,
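Note: one caveat with the `threads: int(workflow.cores * fraction)` pattern kept throughout this file is that with fewer than five available cores the 20% expression evaluates to 0. A guarded helper along these lines might be safer (hypothetical; not part of the diff):

```python
# Hypothetical guard against a zero thread count on small machines.
def fraction_of_cores(cores: int, fraction: float) -> int:
    """At least 1 thread, never more than the available cores."""
    return max(1, min(cores, int(cores * fraction)))

assert fraction_of_cores(4, 0.2) == 1   # int(4 * 0.2) alone would be 0
assert fraction_of_cores(20, 0.2) == 4
```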
diff --git a/workflow/scripts/plot_mapping_length.py b/workflow/scripts/plot_mapping_length.py
index 9d3d639..dffddb6 100755
--- a/workflow/scripts/plot_mapping_length.py
+++ b/workflow/scripts/plot_mapping_length.py
@@ -17,7 +17,7 @@
 __version__ = '.'.join(__version_info__)
 
 
-def get_aln_length(bamfile, outfolder, sample):
+def get_aln_length(bamfile, sample):
     lengths = []
     for aln in bamfile.fetch():
         lengths.append(len(aln.query_sequence))
@@ -43,29 +43,19 @@
 
     """ % (__authors__, __mail__),
         formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__))
-    parser.add_argument('-b', dest='bam', required=True, help='bam file')
-    parser.add_argument('-o', dest='output_folder', required=True, help='output folder')
-    parser.add_argument('-s', dest='sample_name', required=True, help='sample name')
-
-    args = vars(parser.parse_args())
-    file = args['bam']
-    sample_name = args['sample_name']
-    outfolder=args['output_folder']
-
+    input = str(snakemake.input["bam"])
+    sample_name = snakemake.wildcards.sample
+    output_tsv = snakemake.output["tsv"]
+    output_pdf = snakemake.output["pdf"]
 
     try:
-        if not file.endswith('bam'):
-            sys.stderr.write("Input file needs to be in bam format. " +
-                             "Exit forced\n.")
-            sys.exit(1)
-        elif not os.path.exists(file):
-            sys.stderr.write("Input file %s does not exist. " +
-                             "Exit forced\n." % file)
+        if not os.path.exists(input):
+            sys.stderr.write(
+                "Input file %s does not exist. Exit forced.\n" % input
+            )
             sys.exit(1)
-        bamfile = pysam.AlignmentFile(file, "rb")
-        df = get_aln_length(bamfile=bamfile, outfolder=outfolder,
-                            sample=sample_name)
+        bamfile = pysam.AlignmentFile(input, "rb")
+        df = get_aln_length(bamfile=bamfile, sample=sample_name)
 
         # plot pdf
         matplotlib.rcParams['axes.grid'] = True
@@ -84,15 +74,15 @@
         ax.set_xlabel('Read length')
         ax.get_yaxis().get_major_formatter().set_scientific(False)
         plt.tight_layout()
-        plt.savefig(os.path.join(outfolder, f'{sample_name}_mapped_length.pdf'), bbox_inches='tight')
+        plt.savefig(output_pdf, bbox_inches='tight')
 
         # export count table
         df.index.set_names(['Length'], inplace=True)
         df.reset_index(inplace=True)
-        df.to_csv(os.path.join(outfolder, f'{sample_name}_length_dist.tsv'), sep='\t', index=False)
+        df.to_csv(output_tsv, sep='\t', index=False)
 
     except:
         sys.stderr.write("Error occurred when processing bam file: %s.\n"
-                         % file)
+                         % input)
         raise RuntimeError
         sys.exit(1)
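Note: with the switch from argparse to the `script:` directive, the file is now driven entirely by the `snakemake` object that Snakemake injects into scripts. The counting core, condensed into a standalone function for clarity (a sketch; the function name and column labels are illustrative, not workflow code):

```python
# Sketch of the length-count logic now fed by snakemake.input["bam"].
import pandas as pd
import pysam

def length_distribution(bam_path: str) -> pd.DataFrame:
    # fetch() requires a BAM index, which the upstream samtools rules create.
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        lengths = [len(aln.query_sequence) for aln in bam.fetch()]
    counts = pd.Series(lengths).value_counts().sort_index()
    counts.index.name = "Length"
    return counts.rename("Count").reset_index()
```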