From 051de2c9e9d9b4b0062b427b6e17bca3224353d8 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 11:38:57 +0200 Subject: [PATCH 1/5] fix: added dev branch to gh actions workflow --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5d28ca0..7c1532f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,9 +2,9 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: @@ -44,7 +44,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp" + args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp --dryrun" - name: Test report uses: snakemake/snakemake-github-action@v1.24.0 From 5184b3819a58be808308635357d0da63d29f9da9 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 11:40:02 +0200 Subject: [PATCH 2/5] fix: added missing channels for env definitions --- workflow/envs/cutadapt.yml | 1 + workflow/envs/fastqc.yml | 1 + workflow/envs/filter_bam.yml | 3 ++- workflow/envs/r_orfik.yml | 2 +- workflow/envs/samtools.yml | 2 +- workflow/envs/star.yml | 2 +- workflow/envs/umitools.yml | 3 ++- 7 files changed, 9 insertions(+), 5 deletions(-) diff --git a/workflow/envs/cutadapt.yml b/workflow/envs/cutadapt.yml index 8664c55..180148c 100644 --- a/workflow/envs/cutadapt.yml +++ b/workflow/envs/cutadapt.yml @@ -2,5 +2,6 @@ name: cutadapt channels: - conda-forge - defaults + - bioconda dependencies: - cutadapt=4.9 diff --git a/workflow/envs/fastqc.yml b/workflow/envs/fastqc.yml index 151235b..338a604 100644 --- a/workflow/envs/fastqc.yml +++ b/workflow/envs/fastqc.yml @@ -2,5 +2,6 @@ name: fastqc channels: - conda-forge - defaults + - bioconda dependencies: - fastqc=0.12.1 diff --git a/workflow/envs/filter_bam.yml b/workflow/envs/filter_bam.yml index 8019466..696d070 100644 --- a/workflow/envs/filter_bam.yml +++ b/workflow/envs/filter_bam.yml @@ -1,7 +1,8 @@ name: filter_bam channels: - - bioconda + - conda-forge - defaults + - bioconda dependencies: - bedtools=2.31.1 - samtools=1.20 diff --git a/workflow/envs/r_orfik.yml b/workflow/envs/r_orfik.yml index df85df2..6ed7913 100644 --- a/workflow/envs/r_orfik.yml +++ b/workflow/envs/r_orfik.yml @@ -1,8 +1,8 @@ name: r_orfik channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - r-base=4.3.1 - r-remotes=2.5.0 diff --git a/workflow/envs/samtools.yml b/workflow/envs/samtools.yml index fca5cac..30159c6 100644 --- a/workflow/envs/samtools.yml +++ b/workflow/envs/samtools.yml @@ -1,7 +1,7 @@ name: samtools channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - samtools=1.20 diff --git a/workflow/envs/star.yml b/workflow/envs/star.yml index 669e69a..df39fdc 100644 --- a/workflow/envs/star.yml +++ b/workflow/envs/star.yml @@ -1,8 +1,8 @@ name: star channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - star=2.7.11b - samtools=1.20 \ No newline at end of file diff --git a/workflow/envs/umitools.yml b/workflow/envs/umitools.yml index 2430ef1..6806afb 100644 --- a/workflow/envs/umitools.yml +++ b/workflow/envs/umitools.yml @@ -1,7 +1,8 @@ name: umitools channels: - - bioconda + - conda-forge - defaults + - bioconda dependencies: - bioconda::umi_tools #=1.1.4 - pysam #=0.21.0 From 4b31486ee45b98c2d79ea67c81929b54e7dd5118 Mon Sep 17 00:00:00 2001 From: jahn 
Date: Tue, 23 Jul 2024 11:45:27 +0200
Subject: [PATCH 3/5] feat: added description to run examples

---
 README.md | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 204678a..e22307b 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,13 @@ A Snakemake workflow for the analysis of bacterial riboseq data.
 - [Usage](#usage)
   - [Workflow overview](#workflow-overview)
   - [Installation](#installation)
+    - [Additional tools](#additional-tools)
+  - [Running the workflow](#running-the-workflow)
+    - [Input data](#input-data)
+      - [Reference genome](#reference-genome)
+      - [Read data](#read-data)
+    - [Execution](#execution)
+    - [Parameters](#parameters)
 - [Authors](#authors)
 - [References](#references)
 
@@ -56,7 +63,7 @@ This step creates a new conda environment called `snakemake-bacterial-riboseq`.
 
 ```bash
 # create new environment with dependencies & activate it
-mamba env create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
+mamba create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
 conda activate snakemake-bacterial-riboseq
 ```
 
@@ -67,6 +74,59 @@ conda activate snakemake-bacterial-riboseq
 
 All other dependencies for the workflow are **automatically pulled as `conda` environments** by snakemake when running the workflow with the `--use-conda` parameter (recommended).
 
+## Running the workflow
+
+### Input data
+
+#### Reference genome
+
+An NCBI RefSeq ID, e.g. `GCF_000006945.2`. Find your genome assembly and corresponding ID on [NCBI genomes](https://www.ncbi.nlm.nih.gov/data-hub/genome/). Alternatively, use a custom pair of `*.fasta` and `*.gff` files that describe the genome of choice.
+
+Important requirements when using custom `*.fasta` and `*.gff` files (see the example below):
+
+- the `*.gff` genome annotation must use the same chromosome/region names as the `*.fasta` file (example: `NC_003197.2`)
+- the `*.gff` genome annotation must contain `gene` and `CDS` type annotations, which are automatically parsed to extract transcripts
+- all chromosomes/regions in the `*.gff` genome annotation must be present in the `*.fasta` sequence
+- however, not all sequences in the `*.fasta` file need to have annotated genes in the `*.gff` file
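+
+For illustration, a matching (hypothetical) pair could look like this; the sequence ID in the `*.gff` lines must equal the `*.fasta` header up to the first whitespace, while coordinates and feature IDs here are invented:
+
+```
+# genome.fasta                       # genome.gff
+>NC_003197.2 Salmonella enterica    NC_003197.2  RefSeq  gene  190  255  .  +  .  ID=gene-thrL
+ATGAACCGCATTAGCACCACCATT...         NC_003197.2  RefSeq  CDS   190  255  .  +  .  Parent=gene-thrL
+```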
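+
+  For orientation, a stand-alone `umi_tools` call implementing this 2 + 5 nt layout could look like the following sketch. The workflow builds the actual command from `config.yml`, and the file names here are placeholders:
+
+  ```bash
+  # 2 nt UMI on the 5'-end, 5 nt UMI on the 3'-end; both are moved to the read header
+  umi_tools extract \
+      --extract-method=regex \
+      --bc-pattern='^(?P<umi_1>.{2}).+(?P<umi_2>.{5})$' \
+      --stdin sample.fastq.gz \
+      --stdout sample_umi.fastq.gz
+  ```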
+
+### Execution
+
+To run the workflow from the command line, change into the working directory:
+
+```bash
+cd path/to/snakemake-bacterial-riboseq
+```
+
+Adjust the global and module-specific options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **`conda`**, execute the following command. Specifying the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 10 --use-conda --directory .test
+```
+
+### Parameters
+
+
 ## Authors
 
 - Dr. Rina Ahmed-Begrich

From d35162222fc4e3008a366d7d1b8df16cdc35c168 Mon Sep 17 00:00:00 2001
From: jahn
Date: Tue, 23 Jul 2024 11:48:58 +0200
Subject: [PATCH 4/5] feat: prepared wf to run test data; removed unused conf
 options; snakefmt + linting

---
 .gitignore                               |  3 +-
 .test/config/config.yml                  | 69 +++++++++++-------------
 .test/config/multiqc_config.yml          |  2 +
 .test/config/samples.tsv                 |  5 +-
 .test/config/shift_table/shift_table.csv | 20 +++++++
 config/config.yml                        | 67 +++++++++++------------
 workflow/Snakefile                       | 23 ++++----
 workflow/rules/common.smk                | 45 +++++++++-------
 workflow/rules/postprocessing.smk        |  3 +-
 workflow/rules/preprocessing.smk         | 53 +++++++++---------
 workflow/scripts/plot_mapping_length.py  | 38 +++++--------
 11 files changed, 169 insertions(+), 159 deletions(-)
 create mode 100644 .test/config/multiqc_config.yml
 create mode 100644 .test/config/shift_table/shift_table.csv

diff --git a/.gitignore b/.gitignore
index 122be1b..e7fc954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
\ No newline at end of file
+.snakemake-workflow-catalog.yml
+.test/results/*
\ No newline at end of file

diff --git a/.test/config/config.yml b/.test/config/config.yml
index cbca87a..c66fa7d 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"

diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file

diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 7969058..ead9156 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz

diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
new file mode 100644
index 0000000..7d42cb0
--- /dev/null
+++ b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
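A note on the new test shift table: the offsets decrease strictly linearly with read length, i.e. `offsets_start = 16 - fraction`. Under that assumption (inferred from the values above, not from any workflow documentation), the file can be regenerated with a throwaway script:

```python
# regenerate .test/config/shift_table/shift_table.csv
# assumes the linear rule evident in the values above: offset = 16 - read length
import csv

with open("shift_table.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(["fraction", "offsets_start"])
    for length in range(27, 46):  # read lengths 27..45 nt
        writer.writerow([length, 16 - length])
```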
>> {error}") diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 532c25e..59255fe 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,7 +10,7 @@ samples = ( .sort_index() ) -#TODO: write validation schema +# TODO: write validation schema # validate(SAMPLES, schema="../config/schemas/samples.schema.yml") @@ -18,39 +18,44 @@ def get_final_output(): targets = [] targets.append("results/multiqc/multiqc_report.html") targets.append( - expand("results/{mapping_status}/length_dist/{sample}_length_dist.tsv", + expand( + "results/{mapping_status}/length_dist/{sample}_length_dist.tsv", mapping_status=["mapped", "deduplicated", "filtered_bam"], - sample=samples.index) + sample=samples.index, + ) ) targets.append("results/get_genome/mRNA_features.gff") targets.append( - expand("results/shift_reads/{sample}_shift.csv", - sample=samples.index) + expand("results/shift_reads/{sample}_shift.csv", sample=samples.index) ) return targets # get fastq files def get_fastq(wildcards): - if wildcards.status == 'raw': + if wildcards.status == "raw": return expand( "{input_dir}/{sample}", input_dir=samples.loc[wildcards.sample]["data_folder"], - sample=samples.loc[wildcards.sample]["fq1"]) - if wildcards.status == 'clipped': - return expand( - "results/clipped/{sample}.fastq.gz", - sample=samples.index) + sample=samples.loc[wildcards.sample]["fq1"], + ) + if wildcards.status == "clipped": + return expand("results/clipped/{sample}.fastq.gz", sample=samples.index) # get bam files def get_bam(wildcards): - if wildcards.mapping_status == 'mapped': - return expand(os.path.join("results", "mapped", "{sample}.bam"), - sample=wildcards.sample) - if wildcards.mapping_status == 'deduplicated': - return expand(os.path.join("results", "deduplicated", "{sample}.bam"), - sample=wildcards.sample) - if wildcards.mapping_status == 'filtered_bam': - return expand(os.path.join("results", "filtered_bam", "{sample}.bam"), - sample=wildcards.sample) + if wildcards.mapping_status == "mapped": + return expand( + os.path.join("results", "mapped", "{sample}.bam"), sample=wildcards.sample + ) + if wildcards.mapping_status == "deduplicated": + return expand( + os.path.join("results", "deduplicated", "{sample}.bam"), + sample=wildcards.sample, + ) + if wildcards.mapping_status == "filtered_bam": + return expand( + os.path.join("results", "filtered_bam", "{sample}.bam"), + sample=wildcards.sample, + ) diff --git a/workflow/rules/postprocessing.smk b/workflow/rules/postprocessing.smk index a33986a..35d1c9d 100644 --- a/workflow/rules/postprocessing.smk +++ b/workflow/rules/postprocessing.smk @@ -2,6 +2,7 @@ # Riboseq postprocessing: # # ----------------------------------------------------- # + # module to extract selected biotypes from gff file # ----------------------------------------------------- rule extract_mRNA_features: @@ -60,7 +61,7 @@ rule shift_reads: """--- Shifting Ribo-Seq reads.""" params: config["shift_reads"], - threads: workflow.cores, + threads: workflow.cores log: path="results/shift_reads/log/{sample}.log", script: diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index 536ec52..36f9270 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -2,6 +2,7 @@ # Riboseq preprocessing: # # ----------------------------------------------------- # + # module to make QC report # ----------------------------------------------------- rule fastqc: @@ -10,12 +11,12 @@ rule fastqc: output: 
report=directory("results/fastqc_{status}/{sample}"), conda: - "../envs/fastqc.yml", + "../envs/fastqc.yml" message: """--- Checking fastq files with FastQC.""" log: "results/fastqc_{status}/log/{sample}.log", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "mkdir -p {output.report};" "fastqc --nogroup -extract --threads {threads} -o {output.report} {input} > {log}" @@ -25,9 +26,11 @@ rule fastqc: # ----------------------------------------------------- rule cutadapt: input: - fastq=lambda wc: expand("{input_dir}/{sample}", + fastq=lambda wc: expand( + "{input_dir}/{sample}", input_dir=samples.loc[wc.sample]["data_folder"], - sample=samples.loc[wc.sample]["fq1"]), + sample=samples.loc[wc.sample]["fq1"], + ), output: fastq="results/clipped/{sample}.fastq.gz", conda: @@ -41,7 +44,7 @@ rule cutadapt: log: stdout="results/clipped/log/{sample}.log", stderr="results/clipped/log/{sample}.stderr", - threads: int(workflow.cores * 0.4), # assign 40% of max cores + threads: int(workflow.cores * 0.4) # assign 40% of max cores shell: "if [ {params.fivep_adapter} != None ]; then " "fivep=`echo -g {params.fivep_adapter}`; " @@ -152,7 +155,7 @@ rule star_mapping: outprefix=lambda w, output: f"{os.path.splitext(output.bam)[0]}_", log: path="results/mapped/log/{sample}.log", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "STAR " "--runThreadN {threads} " @@ -177,12 +180,12 @@ rule mapping_sorted_bam: conda: "../envs/samtools.yml" log: - "results/mapped/log/samtools_{sample}.log" - # os.path.join(output_dir, "mapping/log/samtools_sort_{sample}.stderr") - message: """--- Samtools sort and index bam files.""" + "results/mapped/log/samtools_{sample}.log", + message: + """--- Samtools sort and index bam files.""" params: tmp="results/mapped/sort_{sample}_tmp", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "samtools sort -@ {threads} -O bam -T {params.tmp} -o {output.bam} {input} 2> {log}; " "samtools index -@ {threads} {output.bam} 2>> {log}" @@ -204,7 +207,7 @@ rule umi_dedup: params: tmp="results/deduplicated/sort_{sample}_tmp", default=config["umi_dedup"], - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores log: path="results/deduplicated/log/{sample}.log", stderr="results/deduplicated/log/{sample}.stderr", @@ -253,7 +256,7 @@ rule filter_bam: "../envs/filter_bam.yml" params: defaults=config["bedtools_intersect"]["defaults"], - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "intersectBed -abam {input.bam} -b {input.gff} {params.defaults} | " "samtools sort -@ {threads} > {output.bam} 2> {log.path}; " @@ -265,30 +268,26 @@ rule filter_bam: # ----------------------------------------------------- rule extract_mapping_length: input: - get_bam, + bam=get_bam, output: - "results/{mapping_status}/length_dist/{sample}_length_dist.tsv", - message: """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam""" + tsv="results/{mapping_status}/length_dist/{sample}_length_dist.tsv", + pdf="results/{mapping_status}/length_dist/{sample}_length_dist.pdf", + message: + """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam""" conda: - 
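A note on `rule extract_mapping_length` above: switching from `shell:` to `script:` changes the script-side contract, since Snakemake injects a `snakemake` object instead of CLI arguments. Roughly, the script patched below starts from values like these (the sample name is illustrative):

```python
# what the script sees for one job of rule extract_mapping_length
bam = str(snakemake.input["bam"])    # e.g. "results/deduplicated/RPF-RTP1.bam"
sample = snakemake.wildcards.sample  # e.g. "RPF-RTP1"
tsv = snakemake.output["tsv"]        # ...RPF-RTP1_length_dist.tsv
pdf = snakemake.output["pdf"]        # ...RPF-RTP1_length_dist.pdf
```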
"../envs/plot_mapping_length.yml", + "../envs/plot_mapping_length.yml" log: - os.path.join("results", "{mapping_status}", "length_dist", "log", "{sample}.log"), - params: - outdir=os.path.join("results","{mapping_status}", "length_dist"), - shell: - "workflow/scripts/plot_mapping_length.py -b {input} -o {params.outdir} -s {wildcards.sample} 2> {log}" + "results/{mapping_status}/length_dist/{sample}_length_dist.log", + script: + "../scripts/plot_mapping_length.py" # module to run multiQC on input + processed files # ----------------------------------------------------- rule multiqc: input: - expand( - "results/fastqc_clipped/{sample}_fastqc.html", - sample=samples.index), - expand( - "results/clipped/{sample}.fastq.gz", - sample=samples.index), + expand("results/fastqc_clipped/{sample}_fastqc.html", sample=samples.index), + expand("results/clipped/{sample}.fastq.gz", sample=samples.index), expand( "results/umi_extraction/{sample}.fastq.gz", sample=samples.index, diff --git a/workflow/scripts/plot_mapping_length.py b/workflow/scripts/plot_mapping_length.py index 9d3d639..dffddb6 100755 --- a/workflow/scripts/plot_mapping_length.py +++ b/workflow/scripts/plot_mapping_length.py @@ -17,7 +17,7 @@ __version__ = '.'.join(__version_info__) -def get_aln_length(bamfile, outfolder, sample): +def get_aln_length(bamfile, sample): lengths = [] for aln in bamfile.fetch(): lengths.append(len(aln.query_sequence)) @@ -43,29 +43,19 @@ def get_aln_length(bamfile, outfolder, sample): """ % (__authors__, __mail__), formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) - parser.add_argument('-b', dest='bam', required=True, help='bam file') - parser.add_argument('-o', dest='output_folder', required=True, help='output folder') - parser.add_argument('-s', dest='sample_name', required=True, help='sample name') - - args = vars(parser.parse_args()) - file = args['bam'] - sample_name = args['sample_name'] - outfolder=args['output_folder'] - + input = str(snakemake.input["bam"]) + sample_name = snakemake.wildcards.sample + output_tsv = snakemake.output["tsv"] + output_pdf = snakemake.output["pdf"] try: - if not file.endswith('bam'): - sys.stderr.write("Input file needs to be in bam format. " + - "Exit forced\n.") - sys.exit(1) - elif not os.path.exists(file): - sys.stderr.write("Input file %s does not exist. " + - "Exit forced\n." % file) + if not os.path.exists(input): + sys.stderr.write( + "Input file %s does not exist. " + "Exit forced\n." 
% input + ) sys.exit(1) - bamfile = pysam.AlignmentFile(file, "rb") - df = get_aln_length(bamfile=bamfile, outfolder=outfolder, - sample=sample_name) + bamfile = pysam.AlignmentFile(input, "rb") + df = get_aln_length(bamfile=bamfile, sample=sample_name) # plot pdf matplotlib.rcParams['axes.grid'] = True @@ -84,15 +74,15 @@ def get_aln_length(bamfile, outfolder, sample): ax.set_xlabel('Read length') ax.get_yaxis().get_major_formatter().set_scientific(False) plt.tight_layout() - plt.savefig(os.path.join(outfolder, f'{sample_name}_mapped_length.pdf'), bbox_inches='tight') + plt.savefig(output_pdf, bbox_inches='tight') # export coount table df.index.set_names(['Length'], inplace=True) df.reset_index(inplace=True) - df.to_csv(os.path.join(outfolder, f'{sample_name}_length_dist.tsv'), sep='\t', index=False) + df.to_csv(output_tsv, sep='\t', index=False) except: sys.stderr.write("Error ocurred when processing bam file: %s.\n" - % file) + % input) raise RuntimeError sys.exit(1) From a4dd4085d2761689955917ada0a6f0c4baf19e28 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 14:52:21 +0200 Subject: [PATCH 5/5] fix: snakefmt error --- workflow/Snakefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 7252d3b..aa71569 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -47,12 +47,6 @@ include: "rules/preprocessing.smk" include: "rules/postprocessing.smk" -# set shell configs -# ----------------------------------------------------- -shell.executable("bash") -shell.prefix(f"set -eo pipefail; ") - - onstart: print("\n--- Analysis started...\n") print()
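To sanity-check the whole series locally, something close to the CI job from PATCH 1/5 can be run by hand (flags taken from the workflow file above, environment setup as in the README section added in PATCH 3/5):

```bash
# approximate the GitHub Actions test job without the action wrapper
mamba create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
conda activate snakemake-bacterial-riboseq
snakemake --snakefile workflow/Snakefile --directory .test \
    --use-conda --show-failed-logs --cores 3 \
    --conda-cleanup-pkgs cache --all-temp --dryrun
```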