From 051de2c9e9d9b4b0062b427b6e17bca3224353d8 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 11:38:57 +0200 Subject: [PATCH 1/5] fix: added dev branch to gh actions workflow --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5d28ca0..7c1532f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,9 +2,9 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, dev ] pull_request: - branches: [ main ] + branches: [ main, dev ] jobs: @@ -44,7 +44,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp" + args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp --dryrun" - name: Test report uses: snakemake/snakemake-github-action@v1.24.0 From 5184b3819a58be808308635357d0da63d29f9da9 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 11:40:02 +0200 Subject: [PATCH 2/5] fix: added missing channels for env definitions --- workflow/envs/cutadapt.yml | 1 + workflow/envs/fastqc.yml | 1 + workflow/envs/filter_bam.yml | 3 ++- workflow/envs/r_orfik.yml | 2 +- workflow/envs/samtools.yml | 2 +- workflow/envs/star.yml | 2 +- workflow/envs/umitools.yml | 3 ++- 7 files changed, 9 insertions(+), 5 deletions(-) diff --git a/workflow/envs/cutadapt.yml b/workflow/envs/cutadapt.yml index 8664c55..180148c 100644 --- a/workflow/envs/cutadapt.yml +++ b/workflow/envs/cutadapt.yml @@ -2,5 +2,6 @@ name: cutadapt channels: - conda-forge - defaults + - bioconda dependencies: - cutadapt=4.9 diff --git a/workflow/envs/fastqc.yml b/workflow/envs/fastqc.yml index 151235b..338a604 100644 --- a/workflow/envs/fastqc.yml +++ b/workflow/envs/fastqc.yml @@ -2,5 +2,6 @@ name: fastqc channels: - conda-forge - defaults + - bioconda dependencies: - fastqc=0.12.1 diff --git a/workflow/envs/filter_bam.yml b/workflow/envs/filter_bam.yml index 8019466..696d070 100644 --- a/workflow/envs/filter_bam.yml +++ b/workflow/envs/filter_bam.yml @@ -1,7 +1,8 @@ name: filter_bam channels: - - bioconda + - conda-forge - defaults + - bioconda dependencies: - bedtools=2.31.1 - samtools=1.20 diff --git a/workflow/envs/r_orfik.yml b/workflow/envs/r_orfik.yml index df85df2..6ed7913 100644 --- a/workflow/envs/r_orfik.yml +++ b/workflow/envs/r_orfik.yml @@ -1,8 +1,8 @@ name: r_orfik channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - r-base=4.3.1 - r-remotes=2.5.0 diff --git a/workflow/envs/samtools.yml b/workflow/envs/samtools.yml index fca5cac..30159c6 100644 --- a/workflow/envs/samtools.yml +++ b/workflow/envs/samtools.yml @@ -1,7 +1,7 @@ name: samtools channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - samtools=1.20 diff --git a/workflow/envs/star.yml b/workflow/envs/star.yml index 669e69a..df39fdc 100644 --- a/workflow/envs/star.yml +++ b/workflow/envs/star.yml @@ -1,8 +1,8 @@ name: star channels: - conda-forge - - bioconda - defaults + - bioconda dependencies: - star=2.7.11b - samtools=1.20 \ No newline at end of file diff --git a/workflow/envs/umitools.yml b/workflow/envs/umitools.yml index 2430ef1..6806afb 100644 --- a/workflow/envs/umitools.yml +++ b/workflow/envs/umitools.yml @@ -1,7 +1,8 @@ name: umitools channels: - - bioconda + - conda-forge - defaults + - bioconda dependencies: - bioconda::umi_tools #=1.1.4 - pysam #=0.21.0 From 4b31486ee45b98c2d79ea67c81929b54e7dd5118 Mon Sep 17 00:00:00 2001 From: jahn 
Date: Tue, 23 Jul 2024 11:45:27 +0200
Subject: [PATCH 3/5] feat: added description to run examples

---
 README.md | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 204678a..e22307b 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,13 @@ A Snakemake workflow for the analysis of bacterial riboseq data.
 - [Usage](#usage)
   - [Workflow overview](#workflow-overview)
   - [Installation](#installation)
+    - [Additional tools](#additional-tools)
+  - [Running the workflow](#running-the-workflow)
+    - [Input data](#input-data)
+      - [Reference genome](#reference-genome)
+      - [Read data](#read-data)
+    - [Execution](#execution)
+    - [Parameters](#parameters)
 - [Authors](#authors)
 - [References](#references)
 
@@ -56,7 +63,7 @@ This step creates a new conda environment called `snakemake-bacterial-riboseq`.
 
 ```bash
 # create new environment with dependencies & activate it
-mamba env create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
+mamba create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
 conda activate snakemake-bacterial-riboseq
 ```
 
@@ -67,6 +74,59 @@ conda activate snakemake-bacterial-riboseq
 
 All other dependencies for the workflow are **automatically pulled as `conda` environments** by snakemake when running the workflow with the `--use-conda` parameter (recommended).
 
+## Running the workflow
+
+### Input data
+
+#### Reference genome
+
+An NCBI RefSeq ID, e.g. `GCF_000006945.2`. Find your genome assembly and corresponding ID on [NCBI genomes](https://www.ncbi.nlm.nih.gov/data-hub/genome/). Alternatively, use a custom pair of `*.fasta` and `*.gff` files that describe the genome of choice.
+
+Important requirements when using custom `*.fasta` and `*.gff` files (see the example below):
+
+- the `*.gff` genome annotation must use the same chromosome/region names as the `*.fasta` file (example: `NC_003197.2`)
+- the `*.gff` genome annotation must contain `gene` and `CDS` type annotations, which are automatically parsed to extract transcripts
+- all chromosomes/regions in the `*.gff` genome annotation must be present in the `*.fasta` sequence
+- however, not all sequences in the `*.fasta` file need to have annotated genes in the `*.gff` file
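+
+For illustration, a matching (hypothetical) pair could look like this; the sequence ID in the `*.gff` lines must equal the `*.fasta` header up to the first whitespace, while coordinates and feature IDs here are invented:
+
+```
+# genome.fasta                       # genome.gff
+>NC_003197.2 Salmonella enterica    NC_003197.2  RefSeq  gene  190  255  .  +  .  ID=gene-thrL
+ATGAACCGCATTAGCACCACCATT...         NC_003197.2  RefSeq  CDS   190  255  .  +  .  Parent=gene-thrL
+```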
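+
+  For orientation, a stand-alone `umi_tools` call implementing this 2 + 5 nt layout could look like the following sketch. The workflow builds the actual command from `config.yml`, and the file names here are placeholders:
+
+  ```bash
+  # 2 nt UMI on the 5'-end, 5 nt UMI on the 3'-end; both are moved to the read header
+  umi_tools extract \
+      --extract-method=regex \
+      --bc-pattern='^(?P<umi_1>.{2}).+(?P<umi_2>.{5})$' \
+      --stdin sample.fastq.gz \
+      --stdout sample_umi.fastq.gz
+  ```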
+
+### Execution
+
+To run the workflow from the command line, change into the working directory:
+
+```bash
+cd path/to/snakemake-bacterial-riboseq
+```
+
+Adjust the global and module-specific options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **`conda`**, execute the following command. Specifying the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 10 --use-conda --directory .test
+```
+
+### Parameters
+
+
 ## Authors
 
 - Dr. Rina Ahmed-Begrich

From d35162222fc4e3008a366d7d1b8df16cdc35c168 Mon Sep 17 00:00:00 2001
From: jahn
Date: Tue, 23 Jul 2024 11:48:58 +0200
Subject: [PATCH 4/5] feat: prepared wf to run test data; removed unused conf
 options; snakefmt + linting

---
 .gitignore                               |  3 +-
 .test/config/config.yml                  | 69 +++++++++++-------------
 .test/config/multiqc_config.yml          |  2 +
 .test/config/samples.tsv                 |  5 +-
 .test/config/shift_table/shift_table.csv | 20 +++++++
 config/config.yml                        | 67 +++++++++++------------
 workflow/Snakefile                       | 23 ++++----
 workflow/rules/common.smk                | 45 +++++++++-------
 workflow/rules/postprocessing.smk        |  3 +-
 workflow/rules/preprocessing.smk         | 53 +++++++++---------
 workflow/scripts/plot_mapping_length.py  | 38 +++++--------
 11 files changed, 169 insertions(+), 159 deletions(-)
 create mode 100644 .test/config/multiqc_config.yml
 create mode 100644 .test/config/shift_table/shift_table.csv

diff --git a/.gitignore b/.gitignore
index 122be1b..e7fc954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
\ No newline at end of file
+.snakemake-workflow-catalog.yml
+.test/results/*
\ No newline at end of file

diff --git a/.test/config/config.yml b/.test/config/config.yml
index cbca87a..c66fa7d 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"

diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
new file mode 100644
index 0000000..2f9d44b
--- /dev/null
+++ b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
\ No newline at end of file

diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 7969058..ead9156 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz

diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
new file mode 100644
index 0000000..7d42cb0
--- /dev/null
+++ b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
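A note on the new test shift table: the offsets decrease strictly linearly with read length, i.e. `offsets_start = 16 - fraction`. Under that assumption (inferred from the values above, not from any workflow documentation), the file can be regenerated with a throwaway script:

```python
# regenerate .test/config/shift_table/shift_table.csv
# assumes the linear rule evident in the values above: offset = 16 - read length
import csv

with open("shift_table.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(["fraction", "offsets_start"])
    for length in range(27, 46):  # read lengths 27..45 nt
        writer.writerow([length, 16 - length])
```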
>> {error}") diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 532c25e..59255fe 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,7 +10,7 @@ samples = ( .sort_index() ) -#TODO: write validation schema +# TODO: write validation schema # validate(SAMPLES, schema="../config/schemas/samples.schema.yml") @@ -18,39 +18,44 @@ def get_final_output(): targets = [] targets.append("results/multiqc/multiqc_report.html") targets.append( - expand("results/{mapping_status}/length_dist/{sample}_length_dist.tsv", + expand( + "results/{mapping_status}/length_dist/{sample}_length_dist.tsv", mapping_status=["mapped", "deduplicated", "filtered_bam"], - sample=samples.index) + sample=samples.index, + ) ) targets.append("results/get_genome/mRNA_features.gff") targets.append( - expand("results/shift_reads/{sample}_shift.csv", - sample=samples.index) + expand("results/shift_reads/{sample}_shift.csv", sample=samples.index) ) return targets # get fastq files def get_fastq(wildcards): - if wildcards.status == 'raw': + if wildcards.status == "raw": return expand( "{input_dir}/{sample}", input_dir=samples.loc[wildcards.sample]["data_folder"], - sample=samples.loc[wildcards.sample]["fq1"]) - if wildcards.status == 'clipped': - return expand( - "results/clipped/{sample}.fastq.gz", - sample=samples.index) + sample=samples.loc[wildcards.sample]["fq1"], + ) + if wildcards.status == "clipped": + return expand("results/clipped/{sample}.fastq.gz", sample=samples.index) # get bam files def get_bam(wildcards): - if wildcards.mapping_status == 'mapped': - return expand(os.path.join("results", "mapped", "{sample}.bam"), - sample=wildcards.sample) - if wildcards.mapping_status == 'deduplicated': - return expand(os.path.join("results", "deduplicated", "{sample}.bam"), - sample=wildcards.sample) - if wildcards.mapping_status == 'filtered_bam': - return expand(os.path.join("results", "filtered_bam", "{sample}.bam"), - sample=wildcards.sample) + if wildcards.mapping_status == "mapped": + return expand( + os.path.join("results", "mapped", "{sample}.bam"), sample=wildcards.sample + ) + if wildcards.mapping_status == "deduplicated": + return expand( + os.path.join("results", "deduplicated", "{sample}.bam"), + sample=wildcards.sample, + ) + if wildcards.mapping_status == "filtered_bam": + return expand( + os.path.join("results", "filtered_bam", "{sample}.bam"), + sample=wildcards.sample, + ) diff --git a/workflow/rules/postprocessing.smk b/workflow/rules/postprocessing.smk index a33986a..35d1c9d 100644 --- a/workflow/rules/postprocessing.smk +++ b/workflow/rules/postprocessing.smk @@ -2,6 +2,7 @@ # Riboseq postprocessing: # # ----------------------------------------------------- # + # module to extract selected biotypes from gff file # ----------------------------------------------------- rule extract_mRNA_features: @@ -60,7 +61,7 @@ rule shift_reads: """--- Shifting Ribo-Seq reads.""" params: config["shift_reads"], - threads: workflow.cores, + threads: workflow.cores log: path="results/shift_reads/log/{sample}.log", script: diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk index 536ec52..36f9270 100644 --- a/workflow/rules/preprocessing.smk +++ b/workflow/rules/preprocessing.smk @@ -2,6 +2,7 @@ # Riboseq preprocessing: # # ----------------------------------------------------- # + # module to make QC report # ----------------------------------------------------- rule fastqc: @@ -10,12 +11,12 @@ rule fastqc: output: 
report=directory("results/fastqc_{status}/{sample}"), conda: - "../envs/fastqc.yml", + "../envs/fastqc.yml" message: """--- Checking fastq files with FastQC.""" log: "results/fastqc_{status}/log/{sample}.log", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "mkdir -p {output.report};" "fastqc --nogroup -extract --threads {threads} -o {output.report} {input} > {log}" @@ -25,9 +26,11 @@ rule fastqc: # ----------------------------------------------------- rule cutadapt: input: - fastq=lambda wc: expand("{input_dir}/{sample}", + fastq=lambda wc: expand( + "{input_dir}/{sample}", input_dir=samples.loc[wc.sample]["data_folder"], - sample=samples.loc[wc.sample]["fq1"]), + sample=samples.loc[wc.sample]["fq1"], + ), output: fastq="results/clipped/{sample}.fastq.gz", conda: @@ -41,7 +44,7 @@ rule cutadapt: log: stdout="results/clipped/log/{sample}.log", stderr="results/clipped/log/{sample}.stderr", - threads: int(workflow.cores * 0.4), # assign 40% of max cores + threads: int(workflow.cores * 0.4) # assign 40% of max cores shell: "if [ {params.fivep_adapter} != None ]; then " "fivep=`echo -g {params.fivep_adapter}`; " @@ -152,7 +155,7 @@ rule star_mapping: outprefix=lambda w, output: f"{os.path.splitext(output.bam)[0]}_", log: path="results/mapped/log/{sample}.log", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "STAR " "--runThreadN {threads} " @@ -177,12 +180,12 @@ rule mapping_sorted_bam: conda: "../envs/samtools.yml" log: - "results/mapped/log/samtools_{sample}.log" - # os.path.join(output_dir, "mapping/log/samtools_sort_{sample}.stderr") - message: """--- Samtools sort and index bam files.""" + "results/mapped/log/samtools_{sample}.log", + message: + """--- Samtools sort and index bam files.""" params: tmp="results/mapped/sort_{sample}_tmp", - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "samtools sort -@ {threads} -O bam -T {params.tmp} -o {output.bam} {input} 2> {log}; " "samtools index -@ {threads} {output.bam} 2>> {log}" @@ -204,7 +207,7 @@ rule umi_dedup: params: tmp="results/deduplicated/sort_{sample}_tmp", default=config["umi_dedup"], - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores log: path="results/deduplicated/log/{sample}.log", stderr="results/deduplicated/log/{sample}.stderr", @@ -253,7 +256,7 @@ rule filter_bam: "../envs/filter_bam.yml" params: defaults=config["bedtools_intersect"]["defaults"], - threads: int(workflow.cores * 0.2), # assign 20% of max cores + threads: int(workflow.cores * 0.2) # assign 20% of max cores shell: "intersectBed -abam {input.bam} -b {input.gff} {params.defaults} | " "samtools sort -@ {threads} > {output.bam} 2> {log.path}; " @@ -265,30 +268,26 @@ rule filter_bam: # ----------------------------------------------------- rule extract_mapping_length: input: - get_bam, + bam=get_bam, output: - "results/{mapping_status}/length_dist/{sample}_length_dist.tsv", - message: """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam""" + tsv="results/{mapping_status}/length_dist/{sample}_length_dist.tsv", + pdf="results/{mapping_status}/length_dist/{sample}_length_dist.pdf", + message: + """--- Extract mapping length of BAM input: {wildcards.mapping_status}/{wildcards.sample}.bam""" conda: - 
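A note on `rule extract_mapping_length` above: switching from `shell:` to `script:` changes the script-side contract, since Snakemake injects a `snakemake` object instead of CLI arguments. Roughly, the script patched below starts from values like these (the sample name is illustrative):

```python
# what the script sees for one job of rule extract_mapping_length
bam = str(snakemake.input["bam"])    # e.g. "results/deduplicated/RPF-RTP1.bam"
sample = snakemake.wildcards.sample  # e.g. "RPF-RTP1"
tsv = snakemake.output["tsv"]        # ...RPF-RTP1_length_dist.tsv
pdf = snakemake.output["pdf"]        # ...RPF-RTP1_length_dist.pdf
```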
"../envs/plot_mapping_length.yml", + "../envs/plot_mapping_length.yml" log: - os.path.join("results", "{mapping_status}", "length_dist", "log", "{sample}.log"), - params: - outdir=os.path.join("results","{mapping_status}", "length_dist"), - shell: - "workflow/scripts/plot_mapping_length.py -b {input} -o {params.outdir} -s {wildcards.sample} 2> {log}" + "results/{mapping_status}/length_dist/{sample}_length_dist.log", + script: + "../scripts/plot_mapping_length.py" # module to run multiQC on input + processed files # ----------------------------------------------------- rule multiqc: input: - expand( - "results/fastqc_clipped/{sample}_fastqc.html", - sample=samples.index), - expand( - "results/clipped/{sample}.fastq.gz", - sample=samples.index), + expand("results/fastqc_clipped/{sample}_fastqc.html", sample=samples.index), + expand("results/clipped/{sample}.fastq.gz", sample=samples.index), expand( "results/umi_extraction/{sample}.fastq.gz", sample=samples.index, diff --git a/workflow/scripts/plot_mapping_length.py b/workflow/scripts/plot_mapping_length.py index 9d3d639..dffddb6 100755 --- a/workflow/scripts/plot_mapping_length.py +++ b/workflow/scripts/plot_mapping_length.py @@ -17,7 +17,7 @@ __version__ = '.'.join(__version_info__) -def get_aln_length(bamfile, outfolder, sample): +def get_aln_length(bamfile, sample): lengths = [] for aln in bamfile.fetch(): lengths.append(len(aln.query_sequence)) @@ -43,29 +43,19 @@ def get_aln_length(bamfile, outfolder, sample): """ % (__authors__, __mail__), formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) - parser.add_argument('-b', dest='bam', required=True, help='bam file') - parser.add_argument('-o', dest='output_folder', required=True, help='output folder') - parser.add_argument('-s', dest='sample_name', required=True, help='sample name') - - args = vars(parser.parse_args()) - file = args['bam'] - sample_name = args['sample_name'] - outfolder=args['output_folder'] - + input = str(snakemake.input["bam"]) + sample_name = snakemake.wildcards.sample + output_tsv = snakemake.output["tsv"] + output_pdf = snakemake.output["pdf"] try: - if not file.endswith('bam'): - sys.stderr.write("Input file needs to be in bam format. " + - "Exit forced\n.") - sys.exit(1) - elif not os.path.exists(file): - sys.stderr.write("Input file %s does not exist. " + - "Exit forced\n." % file) + if not os.path.exists(input): + sys.stderr.write( + "Input file %s does not exist. " + "Exit forced\n." 
% input + ) sys.exit(1) - bamfile = pysam.AlignmentFile(file, "rb") - df = get_aln_length(bamfile=bamfile, outfolder=outfolder, - sample=sample_name) + bamfile = pysam.AlignmentFile(input, "rb") + df = get_aln_length(bamfile=bamfile, sample=sample_name) # plot pdf matplotlib.rcParams['axes.grid'] = True @@ -84,15 +74,15 @@ def get_aln_length(bamfile, outfolder, sample): ax.set_xlabel('Read length') ax.get_yaxis().get_major_formatter().set_scientific(False) plt.tight_layout() - plt.savefig(os.path.join(outfolder, f'{sample_name}_mapped_length.pdf'), bbox_inches='tight') + plt.savefig(output_pdf, bbox_inches='tight') # export coount table df.index.set_names(['Length'], inplace=True) df.reset_index(inplace=True) - df.to_csv(os.path.join(outfolder, f'{sample_name}_length_dist.tsv'), sep='\t', index=False) + df.to_csv(output_tsv, sep='\t', index=False) except: sys.stderr.write("Error ocurred when processing bam file: %s.\n" - % file) + % input) raise RuntimeError sys.exit(1) From a4dd4085d2761689955917ada0a6f0c4baf19e28 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 23 Jul 2024 14:52:21 +0200 Subject: [PATCH 5/5] fix: snakefmt error --- workflow/Snakefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 7252d3b..aa71569 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -47,12 +47,6 @@ include: "rules/preprocessing.smk" include: "rules/postprocessing.smk" -# set shell configs -# ----------------------------------------------------- -shell.executable("bash") -shell.prefix(f"set -eo pipefail; ") - - onstart: print("\n--- Analysis started...\n") print()
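To sanity-check the whole series locally, something close to the CI job from PATCH 1/5 can be run by hand (flags taken from the workflow file above, environment setup as in the README section added in PATCH 3/5):

```bash
# approximate the GitHub Actions test job without the action wrapper
mamba create -c conda-forge -c bioconda -n snakemake-bacterial-riboseq snakemake pandas
conda activate snakemake-bacterial-riboseq
snakemake --snakefile workflow/Snakefile --directory .test \
    --use-conda --show-failed-logs --cores 3 \
    --conda-cleanup-pkgs cache --all-temp --dryrun
```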