From d35162222fc4e3008a366d7d1b8df16cdc35c168 Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Tue, 23 Jul 2024 11:48:58 +0200
Subject: [PATCH] feat: prepared wf to run test data; removed unused conf
 options; snakefmt + linting

---
 .gitignore                               |  3 +-
 .test/config/config.yml                  | 69 +++++++++++-------------
 .test/config/multiqc_config.yml          |  2 +
 .test/config/samples.tsv                 |  5 +-
 .test/config/shift_table/shift_table.csv | 20 +++++++
 config/config.yml                        | 67 +++++++++++------------
 workflow/Snakefile                       | 23 ++++----
 workflow/rules/common.smk                | 45 +++++++++-------
 workflow/rules/postprocessing.smk        |  3 +-
 workflow/rules/preprocessing.smk         | 53 +++++++++---------
 workflow/scripts/plot_mapping_length.py  | 38 +++++--------
 11 files changed, 169 insertions(+), 159 deletions(-)
 create mode 100644 .test/config/multiqc_config.yml
 create mode 100644 .test/config/shift_table/shift_table.csv

diff --git a/.gitignore b/.gitignore
index 122be1b..e7fc954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
\ No newline at end of file
+.snakemake-workflow-catalog.yml
+.test/results/*
\ No newline at end of file
diff --git a/.test/config/config.yml b/.test/config/config.yml
index cbca87a..c66fa7d 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 7969058..ead9156 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep 	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz
diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
new file mode 100644
index 0000000..7d42cb0
--- /dev/null
+++ b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
diff --git a/config/config.yml b/config/config.yml
index eec7b8a..c66fa7d 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -1,9 +1,3 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
 samplesheet: "config/samples.tsv"
 
 get_genome:
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index a5236ce..7252d3b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -13,10 +13,10 @@ import pandas as pd
 from datetime import date
 from snakemake.utils import min_version
 
-#min_version("7.0")
+# min_version("7.0")
 
 __author__ = "Rina Ahmed-Begrich, Michael Jahn"
-__year__ = str(date.today()).split('-')[0]
+__year__ = str(date.today()).split("-")[0]
 
 bold = "\033[1m"
 green = "\033[92m"
@@ -27,13 +27,14 @@ msg = f"""{cyan}Bacterial-Riboseq: A Snakemake workflow
 for the analysis of riboseq data in bacteria.{end}
 """
 
-epilog=f"""
+epilog = f"""
 {cyan}Written by {__author__}.
 Max Planck Unit for the Science of Pathogens. Copyright (c) {__year__}.
 Copyright Holder All Rights Reserved.{end}
 
 """
 
+
 # load configuration
 # -----------------------------------------------------
 configfile: "config/config.yml"
@@ -51,27 +52,29 @@ include: "rules/postprocessing.smk"
 shell.executable("bash")
 shell.prefix(f"set -eo pipefail; ")
 
-if config.get('output') is None:
-    config['output'] =  os.path.join(os.getcwd(), "./results")
 
 onstart:
     print("\n--- Analysis started...\n")
     print()
     print("--- Analysis parameters --------------------------------------------\n")
-    print(f"Current working directory: {os.path.join(os.getcwd())}")
-    print(f"Output directory:", {config['output']})
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Output directory: {os.path.join(os.getcwd(), 'results')}")
     print()
     print(f"Riboseq samples: {list(samples.index)}")
     print()
 
+
 onsuccess:
     print()
     print(msg)
     print(epilog)
     print("--- Workflow finished, no error! -----------------------------------")
     print()
-    debug = os.path.join(config['output'], "workflow.log")
-    shell("cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}")
+    debug = os.path.join(os.getcwd(), "results/workflow.log")
+    shell(
+        "cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}"
+    )
+
 
 onerror:
     print()
@@ -79,7 +82,7 @@ onerror:
     print(epilog)
     print("--- An error occurred! ---------------------------------------------")
     print()
-    error = os.path.join(config['output'], "error.log")
+    error = os.path.join(os.getcwd(), "results/error.log")
     shell("cat {log} > {error} && echo -e '\nAn error occurred!' >> {error}")
 
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 532c25e..59255fe 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -10,7 +10,7 @@ samples = (
     .sort_index()
 )
 
-#TODO: write validation schema
+# TODO: write validation schema
 # validate(SAMPLES, schema="../config/schemas/samples.schema.yml")
 
 
@@ -18,39 +18,44 @@ def get_final_output():
     targets = []
     targets.append("results/multiqc/multiqc_report.html")
     targets.append(
-        expand("results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
+        expand(
+            "results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
             mapping_status=["mapped", "deduplicated", "filtered_bam"],
-            sample=samples.index)
+            sample=samples.index,
+        )
     )
     targets.append("results/get_genome/mRNA_features.gff")
     targets.append(
-        expand("results/shift_reads/{sample}_shift.csv",
-            sample=samples.index)
+        expand("results/shift_reads/{sample}_shift.csv", sample=samples.index)
     )
     return targets
 
 
 # get fastq files
 def get_fastq(wildcards):
-    if wildcards.status == 'raw':
+    if wildcards.status == "raw":
         return expand(
             "{input_dir}/{sample}",
             input_dir=samples.loc[wildcards.sample]["data_folder"],
-            sample=samples.loc[wildcards.sample]["fq1"])
-    if wildcards.status == 'clipped':
-        return expand(
-            "results/clipped/{sample}.fastq.gz",
-            sample=samples.index)
+            sample=samples.loc[wildcards.sample]["fq1"],
+        )
+    if wildcards.status == "clipped":
+        return expand("results/clipped/{sample}.fastq.gz", sample=samples.index)
 
 
 # get bam files
 def get_bam(wildcards):
-    if wildcards.mapping_status == 'mapped':
-        return expand(os.path.join("results",  "mapped", "{sample}.bam"),
-            sample=wildcards.sample)
-    if wildcards.mapping_status == 'deduplicated':
-        return expand(os.path.join("results",  "deduplicated", "{sample}.bam"),
-            sample=wildcards.sample)
-    if wildcards.mapping_status == 'filtered_bam':
-        return expand(os.path.join("results",  "filtered_bam", "{sample}.bam"),
-            sample=wildcards.sample)
+    if wildcards.mapping_status == "mapped":
+        return expand(
+            os.path.join("results", "mapped", "{sample}.bam"), sample=wildcards.sample
+        )
+    if wildcards.mapping_status == "deduplicated":
+        return expand(
+            os.path.join("results", "deduplicated", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
+    if wildcards.mapping_status == "filtered_bam":
+        return expand(
+            os.path.join("results", "filtered_bam", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
diff --git a/workflow/rules/postprocessing.smk b/workflow/rules/postprocessing.smk
index a33986a..35d1c9d 100644
--- a/workflow/rules/postprocessing.smk
+++ b/workflow/rules/postprocessing.smk
@@ -2,6 +2,7 @@
 # Riboseq postprocessing:                               #
 # ----------------------------------------------------- #
 
+
 # module to extract selected biotypes from gff file
 # -----------------------------------------------------
 rule extract_mRNA_features:
@@ -60,7 +61,7 @@ rule shift_reads:
         """--- Shifting Ribo-Seq reads."""
     params:
         config["shift_reads"],
-    threads: workflow.cores,
+    threads: workflow.cores
     log:
         path="results/shift_reads/log/{sample}.log",
     script:
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
index 536ec52..36f9270 100644
--- a/workflow/rules/preprocessing.smk
+++ b/workflow/rules/preprocessing.smk
@@ -2,6 +2,7 @@
 # Riboseq preprocessing:                                #
 # ----------------------------------------------------- #
 
+
 # module to make QC report
 # -----------------------------------------------------
 rule fastqc:
@@ -10,12 +11,12 @@ rule fastqc:
     output:
         report=directory("results/fastqc_{status}/{sample}"),
     conda:
-        "../envs/fastqc.yml",
+        "../envs/fastqc.yml"
     message:
         """--- Checking fastq files with FastQC."""
     log:
         "results/fastqc_{status}/log/{sample}.log",
-    threads: int(workflow.cores * 0.2), # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "mkdir -p {output.report};"
         "fastqc --nogroup -extract --threads {threads} -o {output.report} {input} > {log}"
@@ -25,9 +26,11 @@ rule fastqc:
 # -----------------------------------------------------
 rule cutadapt:
     input:
-        fastq=lambda wc: expand("{input_dir}/{sample}",
+        fastq=lambda wc: expand(
+            "{input_dir}/{sample}",
             input_dir=samples.loc[wc.sample]["data_folder"],
-            sample=samples.loc[wc.sample]["fq1"]),
+            sample=samples.loc[wc.sample]["fq1"],
+        ),
     output:
         fastq="results/clipped/{sample}.fastq.gz",
     conda:
@@ -41,7 +44,7 @@ rule cutadapt:
     log:
         stdout="results/clipped/log/{sample}.log",
         stderr="results/clipped/log/{sample}.stderr",
-    threads: int(workflow.cores * 0.4), # assign 40% of max cores
+    threads: int(workflow.cores * 0.4)  # assign 40% of max cores
     shell:
         "if [ {params.fivep_adapter} != None ]; then "
         "fivep=`echo -g {params.fivep_adapter}`; "
@@ -152,7 +155,7 @@ rule star_mapping:
         outprefix=lambda w, output: f"{os.path.splitext(output.bam)[0]}_",
     log:
         path="results/mapped/log/{sample}.log",
-    threads: int(workflow.cores * 0.2), # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "STAR "
         "--runThreadN {threads} "
@@ -177,12 +180,12 @@ rule mapping_sorted_bam:
     conda:
         "../envs/samtools.yml"
     log:
-        "results/mapped/log/samtools_{sample}.log"
-        # os.path.join(output_dir, "mapping/log/samtools_sort_{sample}.stderr")
-    message: """--- Samtools sort and index bam files."""
+        "results/mapped/log/samtools_{sample}.log",
+    message:
+        """--- Samtools sort and index bam files."""
     params:
         tmp="results/mapped/sort_{sample}_tmp",
-    threads: int(workflow.cores * 0.2), # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "samtools sort -@ {threads} -O bam -T {params.tmp} -o {output.bam} {input} 2> {log}; "
         "samtools index -@ {threads} {output.bam} 2>> {log}"
@@ -204,7 +207,7 @@ rule umi_dedup:
     params:
         tmp="results/deduplicated/sort_{sample}_tmp",
         default=config["umi_dedup"],
-    threads: int(workflow.cores * 0.2), # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     log:
         path="results/deduplicated/log/{sample}.log",
         stderr="results/deduplicated/log/{sample}.stderr",
@@ -253,7 +256,7 @@ rule filter_bam:
         "../envs/filter_bam.yml"
     params:
         defaults=config["bedtools_intersect"]["defaults"],
-    threads: int(workflow.cores * 0.2), # assign 20% of max cores
+    threads: int(workflow.cores * 0.2)  # assign 20% of max cores
     shell:
         "intersectBed -abam {input.bam} -b {input.gff} {params.defaults} | "
         "samtools sort -@ {threads} > {output.bam} 2> {log.path}; "
@@ -265,30 +268,26 @@ rule filter_bam:
 # -----------------------------------------------------
 rule extract_mapping_length:
     input:
-        get_bam,
+        bam=get_bam,
     output:
-        "results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
-    message: """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam"""
+        tsv="results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
+        pdf="results/{mapping_status}/length_dist/{sample}_length_dist.pdf",
+    message:
+        """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam"""
     conda:
-        "../envs/plot_mapping_length.yml",
+        "../envs/plot_mapping_length.yml"
     log:
-        os.path.join("results", "{mapping_status}", "length_dist", "log", "{sample}.log"),
-    params:
-        outdir=os.path.join("results","{mapping_status}", "length_dist"),
-    shell:
-        "workflow/scripts/plot_mapping_length.py -b {input} -o {params.outdir} -s {wildcards.sample} 2> {log}"
+        "results/{mapping_status}/length_dist/{sample}_length_dist.log",
+    script:
+        "../scripts/plot_mapping_length.py"
 
 
 # module to run multiQC on input + processed files
 # -----------------------------------------------------
 rule multiqc:
     input:
-        expand(
-            "results/fastqc_clipped/{sample}_fastqc.html",
-            sample=samples.index),
-        expand(
-            "results/clipped/{sample}.fastq.gz",
-            sample=samples.index),
+        expand("results/fastqc_clipped/{sample}_fastqc.html", sample=samples.index),
+        expand("results/clipped/{sample}.fastq.gz", sample=samples.index),
         expand(
             "results/umi_extraction/{sample}.fastq.gz",
             sample=samples.index,
diff --git a/workflow/scripts/plot_mapping_length.py b/workflow/scripts/plot_mapping_length.py
index 9d3d639..dffddb6 100755
--- a/workflow/scripts/plot_mapping_length.py
+++ b/workflow/scripts/plot_mapping_length.py
@@ -17,7 +17,7 @@
 __version__ = '.'.join(__version_info__)
 
 
-def get_aln_length(bamfile, outfolder, sample):
+def get_aln_length(bamfile, sample):
     lengths = []
     for aln in bamfile.fetch():
         lengths.append(len(aln.query_sequence))
@@ -43,29 +43,19 @@ def get_aln_length(bamfile, outfolder, sample):
 """ % (__authors__, __mail__),
         formatter_class=argparse.RawDescriptionHelpFormatter)
 
-    parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__))
-    parser.add_argument('-b', dest='bam', required=True, help='bam file')
-    parser.add_argument('-o', dest='output_folder', required=True, help='output folder')
-    parser.add_argument('-s', dest='sample_name', required=True, help='sample name')
-
-    args = vars(parser.parse_args())
-    file = args['bam']
-    sample_name = args['sample_name']
-    outfolder=args['output_folder']
-
+    input = str(snakemake.input["bam"])
+    sample_name = snakemake.wildcards.sample
+    output_tsv = snakemake.output["tsv"]
+    output_pdf = snakemake.output["pdf"]
 
     try:
-        if not file.endswith('bam'):
-            sys.stderr.write("Input file needs to be in bam format. " +
-                             "Exit forced\n.")
-            sys.exit(1)
-        elif not os.path.exists(file):
-            sys.stderr.write("Input file %s does not exist. " +
-                             "Exit forced\n." % file)
+        if not os.path.exists(input):
+            sys.stderr.write(
+                "Input file %s does not exist. " + "Exit forced\n." % input
+            )
             sys.exit(1)
-        bamfile = pysam.AlignmentFile(file, "rb")
-        df = get_aln_length(bamfile=bamfile, outfolder=outfolder,
-                            sample=sample_name)
+        bamfile = pysam.AlignmentFile(input, "rb")
+        df = get_aln_length(bamfile=bamfile, sample=sample_name)
 
         # plot pdf
         matplotlib.rcParams['axes.grid'] = True
@@ -84,15 +74,15 @@ def get_aln_length(bamfile, outfolder, sample):
         ax.set_xlabel('Read length')
         ax.get_yaxis().get_major_formatter().set_scientific(False)
         plt.tight_layout()
-        plt.savefig(os.path.join(outfolder, f'{sample_name}_mapped_length.pdf'), bbox_inches='tight')
+        plt.savefig(output_pdf, bbox_inches='tight')
 
         # export coount table
         df.index.set_names(['Length'], inplace=True)
         df.reset_index(inplace=True)
-        df.to_csv(os.path.join(outfolder, f'{sample_name}_length_dist.tsv'), sep='\t', index=False)
+        df.to_csv(output_tsv, sep='\t', index=False)
 
     except:
         sys.stderr.write("Error ocurred when processing bam file: %s.\n"
-                         % file)
+                         % input)
         raise RuntimeError
         sys.exit(1)