Merge pull request #1 from MPUSP/dev

feat: formatting and enabling wf to run GH actions tests
MPUSP · Jul 23, 2024 · 3a68735 · 3a68735
2 parents b4d5519 + a4dd408
commit 3a68735
Show file tree

Hide file tree

Showing 20 changed files with 242 additions and 174 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -2,9 +2,9 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, dev ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, dev ]
 
 
 jobs:
@@ -44,7 +44,7 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
+        args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp --dryrun"
 
     - name: Test report
       uses: snakemake/[email protected]

diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
+.snakemake-workflow-catalog.yml
+.test/results/*
diff --git a/.test/config/config.yml b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep 	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz
diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
diff --git a/README.md b/README.md
@@ -13,6 +13,13 @@ A Snakemake workflow for the analysis of bacterial riboseq data.
   - [Usage](#usage)
   - [Workflow overview](#workflow-overview)
   - [Installation](#installation)
+    - [Additional tools](#additional-tools)
+  - [Running the workflow](#running-the-workflow)
+    - [Input data](#input-data)
+      - [Reference genome](#reference-genome)
+      - [Read data](#read-data)
+    - [Execution](#execution)
+    - [Parameters](#parameters)
   - [Authors](#authors)
   - [References](#references)
 
@@ -56,7 +63,7 @@ This step creates a new conda environment called `snakemake-bacterial-riboseq`.
 
 ```bash
 # create new environment with dependencies & activate it
-mamba env create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
+mamba create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
 conda activate snakemake-bacterial-riboseq
 ```
 
@@ -67,6 +74,59 @@ conda activate snakemake-bacterial-riboseq
 All other dependencies for the workflow are **automatically pulled as `conda` environments** by snakemake, when running the workflow with the `--use-conda` parameter (recommended).
 
 
+## Running the workflow
+
+### Input data
+
+#### Reference genome
+
+The reference genome is specified by an NCBI RefSeq ID, e.g. `GCF_000006945.2`. Find your genome assembly and the corresponding ID on [NCBI genomes](https://www.ncbi.nlm.nih.gov/data-hub/genome/). Alternatively, use a custom pair of `*.fasta` and `*.gff` files that describe the genome of choice.
+
+Important requirements when using custom `*.fasta` and `*.gff` files:
+
+- `*.gff` genome annotation must have the same chromosome/region name as the `*.fasta` file (example: `NC_003197.2`)
+- `*.gff` genome annotation must have `gene` and `CDS` type annotations, which are automatically parsed to extract transcripts
+- all chromosomes/regions in the `*.gff` genome annotation must be present in the `*.fasta` sequence
+- but not all sequences in the `*.fasta` file need to have annotated genes in the `*.gff` file
+
+#### Read data
+
+Ribosome footprint sequencing data in `*.fastq.gz` format. The currently supported input data are **single-end, strand-specific reads**. Input data files are supplied via a mandatory table, whose location is indicated in the `config.yml` file (default: `samples.tsv`). The sample sheet has the following layout:
+
+| sample   | condition | replicate | lib_prep | data_folder | fq1                      |
+| -------- | --------- | --------- | -------- | ----------- | ------------------------ |
+| RPF-RTP1 | RPF-RTP   | 1         | mpusp    | data        | RPF-RTP1_R1_001.fastq.gz |
+| RPF-RTP2 | RPF-RTP   | 2         | mpusp    | data        | RPF-RTP2_R1_001.fastq.gz |
+
+Some configuration parameters of the pipeline may be specific to your data and library preparation protocol. These options should be adjusted in the `config.yml` file. For example:
+
+- Minimum and maximum read length after adapter removal (see option `cutadapt: default`). Here, the test data has a minimum read length of 15 + 7 = 22 nt and a maximum of 45 + 7 = 52 nt (the extra 7 nt: 2 nt on the 5'-end + 5 nt on the 3'-end).
+- Unique molecular identifiers (UMIs). For example, the protocol by [McGlincy & Ingolia, 2017](https://doi.org/10.1016/J.YMETH.2017.05.028) creates a UMI that is located on both the 5'-end (2 nt) and the 3'-end (5 nt). These UMIs are extracted with `umi_tools` (see options `umi_extraction: method` and `pattern`).
+
+### Execution
+
+To run the workflow from the command line, change the working directory to the workflow's root folder.
+
+```bash
+cd path/to/snakemake-bacterial-riboseq
+```
+
+Adjust the global and module-specific options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **`conda`**, execute the following command. Specifying the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 10 --use-conda --directory .test
+```
+
+### Parameters
+
+
 ## Authors
 
 - Dr. Rina Ahmed-Begrich

diff --git a/config/config.yml b/config/config.yml
@@ -1,9 +1,3 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
 samplesheet: "config/samples.tsv"
 
 get_genome:
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -13,10 +13,10 @@ import pandas as pd
 from datetime import date
 from snakemake.utils import min_version
 
-#min_version("7.0")
+# min_version("7.0")
 
 __author__ = "Rina Ahmed-Begrich, Michael Jahn"
-__year__ = str(date.today()).split('-')[0]
+__year__ = str(date.today()).split("-")[0]
 
 bold = "\033[1m"
 green = "\033[92m"
@@ -27,13 +27,14 @@ msg = f"""{cyan}Bacterial-Riboseq: A Snakemake workflow
 for the analysis of riboseq data in bacteria.{end}
 """
 
-epilog=f"""
+epilog = f"""
 {cyan}Written by {__author__}.
 Max Planck Unit for the Science of Pathogens. Copyright (c) {__year__}.
 Copyright Holder All Rights Reserved.{end}
 
 """
 
+
 # load configuration
 # -----------------------------------------------------
 configfile: "config/config.yml"
@@ -46,40 +47,36 @@ include: "rules/preprocessing.smk"
 include: "rules/postprocessing.smk"
 
 
-# set shell configs
-# -----------------------------------------------------
-shell.executable("bash")
-shell.prefix(f"set -eo pipefail; ")
-
-if config.get('output') is None:
-    config['output'] =  os.path.join(os.getcwd(), "./results")
-
 onstart:
     print("\n--- Analysis started...\n")
     print()
     print("--- Analysis parameters --------------------------------------------\n")
-    print(f"Current working directory: {os.path.join(os.getcwd())}")
-    print(f"Output directory:", {config['output']})
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Output directory: {os.path.join(os.getcwd(), 'results')}")
     print()
     print(f"Riboseq samples: {list(samples.index)}")
     print()
 
+
 onsuccess:
     print()
     print(msg)
     print(epilog)
     print("--- Workflow finished, no error! -----------------------------------")
     print()
-    debug = os.path.join(config['output'], "workflow.log")
-    shell("cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}")
+    debug = os.path.join(os.getcwd(), "results/workflow.log")
+    shell(
+        "cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}"
+    )
+
 
 onerror:
     print()
     print(msg)
     print(epilog)
     print("--- An error occurred! ---------------------------------------------")
     print()
-    error = os.path.join(config['output'], "error.log")
+    error = os.path.join(os.getcwd(), "results/error.log")
     shell("cat {log} > {error} && echo -e '\nAn error occurred!' >> {error}")