From 42492c697da2048d5df96b925c0e3793954bfadb Mon Sep 17 00:00:00 2001 From: William Roberts Date: Tue, 17 Dec 2024 16:54:27 +0000 Subject: [PATCH 01/10] Add assorted-sub-workflows --- .gitmodules | 3 +++ assorted-sub-workflows | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 assorted-sub-workflows diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..340f1e0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "assorted-sub-workflows"] + path = assorted-sub-workflows + url = git@github.com:sanger-pathogens/assorted-sub-workflows.git diff --git a/assorted-sub-workflows b/assorted-sub-workflows new file mode 160000 index 0000000..618a80b --- /dev/null +++ b/assorted-sub-workflows @@ -0,0 +1 @@ +Subproject commit 618a80b6afebd5334225cb5e9475094109fb9789 From f08ba3437faa213c3eea39d2847cf35a70b42b58 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Wed, 18 Dec 2024 17:40:46 +0000 Subject: [PATCH 02/10] Update pipeline to use MIXED_INPUT --- modules/validate.nf | 40 ++++++++++++++++++++++++++++++++++++++++ nextflow.config | 18 +++++++++++++++++- workflows/pipeline.nf | 14 +++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/modules/validate.nf b/modules/validate.nf index c8e2e8e..47ee4a9 100644 --- a/modules/validate.nf +++ b/modules/validate.nf @@ -1,3 +1,41 @@ +// Map of valid parameters for which to skip validation +skipValidationParams = [ + // From common config + input: 'skip', + tracedir: 'skip', + max_memory: 'skip', + max_cpus: 'skip', + max_time: 'skip', + max_retries: 'skip', + retry_strategy: 'skip', + queue_size: 'skip', + submit_rate_limit: 'skip', + // From mixed input config + outdir: 'skip', + manifest_of_reads: 'skip', + manifest_of_lanes: 'skip', + manifest: 'skip', + save_metadata: 'skip', + combine_same_id_crams: 'skip', + dehumanising_method: 'skip', + cleanup_intermediate_files_irods_extractor: 'skip', + save_fastqs: 'skip', + save_method: 'skip', + 
raw_reads_prefix: 'skip', + preexisting_fastq_tag: 'skip', + split_sep_for_ID_from_fastq: 'skip', + lane_plex_sep: 'skip', + start_queue: 'skip', + irods_subset_to_skip: 'skip', + short_metacsv_name: 'skip', + studyid: 'skip', + runid: 'skip', + laneid: 'skip', + plexid: 'skip', + target: 'skip', + type: 'skip' +] + // Map of valid parameters and their value types validParams = [ help: 'boolean', @@ -31,6 +69,8 @@ validParams = [ lite: 'boolean' ] +validParams += skipValidationParams + // Validate whether all provided parameters are valid void validate(Map params) { // Ensure only one or none of the alternative workflows is selected diff --git a/nextflow.config b/nextflow.config index c718543..0bbc1de 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,5 +1,10 @@ nextflow.enable.dsl=2 +// Import mixed input params +includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/cb60d523f202dace1b1efd02a5ae03f8a4b049a6/configs/nextflow.config" +includeConfig "$projectDir/assorted-sub-workflows/irods_extractor/subworkflows/irods.config" +includeConfig "$projectDir/assorted-sub-workflows/mixed_input/subworkflows/mixed_input.config" + // Default parameters that can be overridden params { // Show help message @@ -13,6 +18,8 @@ params { reads = "$projectDir/input" // Default output directory output = "$projectDir/output" + // To allow mixed input to work without warnings + outdir = output // Default databases directory for saving all the required databases db = "$projectDir/databases" @@ -63,8 +70,11 @@ params { lite = false } -// Set auto-retry and process container images process { + // Avoid use of `-o pipefail` as this fails version info collation + shell = ['/bin/bash', '-eu'] + + // Set auto-retry and process container images maxRetries = 2 errorStrategy = { task.attempt <= process.maxRetries ? 
'retry' : 'ignore' } @@ -192,4 +202,10 @@ profiles { } } + // Profile for Sanger + sanger { + singularity { + runOptions = '--bind /lustre,/nfs,/data,/software,/tmp' + } + } } diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 34f6041..0f299e8 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -10,6 +10,9 @@ include { MLST } from "$projectDir/modules/mlst" include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr" include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output" +// Import subworkflows +include { MIXED_INPUT } from "$projectDir/assorted-sub-workflows/mixed_input/mixed_input" + // Main pipeline workflow workflow PIPELINE { main: @@ -29,8 +32,17 @@ workflow PIPELINE { // Get path to ARIBA database, generate from reference sequences and metadata if ncessary GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.db) + // Obtain input from manifests and iRODS params + MIXED_INPUT + | map { meta, R1, R2 -> [meta.ID, [R1, R2]] } + | set { raw_read_pairs_ch } + // Get read pairs into Channel raw_read_pairs_ch - raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) + if (params.reads) { + Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true) + | mix(raw_read_pairs_ch) + | set { raw_read_pairs_ch } + } // Basic input files validation // Output into Channel FILE_VALIDATION.out.result From 6c44b89485acab4ef3b8b2bf9323b36f1f417845 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Thu, 19 Dec 2024 10:49:04 +0000 Subject: [PATCH 03/10] Update assorted-sub-workflows submodule --- assorted-sub-workflows | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assorted-sub-workflows b/assorted-sub-workflows index 618a80b..e966ed3 160000 --- a/assorted-sub-workflows +++ b/assorted-sub-workflows @@ -1 +1 @@ 
-Subproject commit 618a80b6afebd5334225cb5e9475094109fb9789 +Subproject commit e966ed3c9d960a02f27c7def89d8a4ef22a4a1de From 21c234312a7558a9e1f0b85fc85e096271505f05 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Thu, 19 Dec 2024 22:58:04 +0000 Subject: [PATCH 04/10] Fix bug - repeated value during join --- workflows/pipeline.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf index 0f299e8..16ee6ed 100644 --- a/workflows/pipeline.nf +++ b/workflows/pipeline.nf @@ -126,7 +126,7 @@ workflow PIPELINE { // Merge Channels FILE_VALIDATION.out.result & READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report OVERALL_QC( - raw_read_pairs_ch.map{ it[0] } + raw_read_pairs_ch.map{ [it[0]] } .join(FILE_VALIDATION.out.result, failOnDuplicate: true, remainder: true) .join(READ_QC.out.result, failOnDuplicate: true, remainder: true) .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true) @@ -173,7 +173,7 @@ workflow PIPELINE { // Generate sample reports by merging outputs from all result-generating modules GENERATE_SAMPLE_REPORT( - raw_read_pairs_ch.map{ it[0] } + raw_read_pairs_ch.map{ [it[0]] } .join(READ_QC.out.report, failOnDuplicate: true, remainder: true) .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true) .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true) From 328271c022e0895b3ee51684ff96d4eaabb02fc2 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Thu, 19 Dec 2024 23:53:04 +0000 Subject: [PATCH 05/10] Update help message and README.md --- README.md | 67 +++++++++++++++++++++++++++++++++++++++++---- modules/messages.nf | 9 +++--- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index db6d1de..6a68024 100644 --- a/README.md +++ b/README.md @@ -85,13 +85,70 @@ It is recommended to 
have at least 16GB of RAM and 100GB of free storage > - The pipeline generates ~1.8GB intermediate files for each sample on average > - These files can be removed when the pipeline run is completed, please refer to [Clean Up](#clean-up) > - To further reduce storage requirement by sacrificing the ability to resume the pipeline, please refer to [Experimental](#experimental) + ## Accepted Inputs - Only Illumina paired-end short reads are supported -- Each sample is expected to be a pair of raw reads following this file name pattern: - - `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}` - - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz` - - example 2: `SampleName_1.fastq.gz`, `SampleName_2.fastq.gz` - - example 3: `SampleName_R1.fq`, `SampleName_R2.fq` +- Any combination of the following input options is supported: + 1. `--reads`: + Specify a directory of per-sample paired (gzipped) fastq files containing reads (files named according to the following pattern `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}`): + - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz` + - example 2: `SampleName_1.fastq.gz`, `SampleName_2.fastq.gz` + - example 3: `SampleName_R1.fq`, `SampleName_R2.fq` + + 2. `--manifest_of_reads` or `--manifest`: + Specify the paths to (gzipped) fastq files + containing reads via a CSV manifest, listing the pair of read files pertaining to a sample, one per row. + + 3. **iRODS attribute parameters** (Sanger HPC only): + Specify a combination of iRODS attributes to search for reads to use as pipeline input. + + The selected set of data files is defined by a combination of parameters: `--studyid`, `--runid`, `--laneid`, `--plexid`, `--target` and `--type` (these refer to specifics of the sequencing experiment and data to be retrieved). + + Each parameter restricts the set of data files that match and will be downloaded. 
With the exception of `--type` and `--target`, omitting an option causes samples for all possible values of the parameter to be retrieved. + + Either `--studyid` or `--runid` is required; `--laneid`, `--plexid`, `--target` and `--type` are optional. Requiring a study or run ID avoids indiscriminately and unintentionally downloading thousands of files. + ``` + --studyid + default: -1 + Sequencing Study ID + --runid + default: -1 + Sequencing Run ID + --laneid + default: -1 + Sequencing Lane ID + --plexid + default: -1 + Sequencing Plex ID + --target + default: 1 + Marker of key data product likely to be of interest to customer + --type + default: cram + File type + ``` + + 4. `--manifest_of_lanes` (Sanger HPC only): + Specify a CSV manifest listing a batch of iRODS parameter combinations. + + Valid column headings include the individual parameter options described above: `studyid`, `runid`, `laneid`, `plexid`, or any other iRODS metadata attribute, e.g. `sample_common_name`, `sample_supplier_name`. + Corresponding fields in the CSV manifest file can be left blank. + + `laneid` and `plexid` are only considered when provided alongside a `studyid` or `runid`. 
+ - example 1: + ``` + studyid,runid,laneid,plexid + ,37822,2,354 + 5970,37822,,332 + 5970,37822,2, + ``` + - example 2: + ``` + sample_common_name,type,target + Romboutsia lituseburensis,cram,1 + Romboutsia lituseburensis,cram,0 + ``` + ## Setup > [!WARNING] > - Docker or Singularity must be running diff --git a/modules/messages.nf b/modules/messages.nf index 4e5cc4c..72bd6d8 100644 --- a/modules/messages.nf +++ b/modules/messages.nf @@ -28,10 +28,11 @@ void helpMessage() { |./run_pipeline [option] [value] | |All options are optional, some common options: - |--reads [PATH] Path to the input directory that contains the reads to be processed - |--output [PATH] Path to the output directory that save the results - |--init Alternative workflow for initialisation - |--version Alternative workflow for getting versions of pipeline, container images, tools and databases + |--reads [PATH] Path to the input directory that contains the reads to be processed + |--manifest [PATH] Path to input CSV (headings: ID,R1,R2), listing a pair (gzipped) fastq files pertaining to a sample, one per row + |--output [PATH] Path to the output directory that save the results + |--init Alternative workflow for initialisation + |--version Alternative workflow for getting versions of pipeline, container images, tools and databases | |For all available options, please refer to README.md '''.stripMargin() From 01c5fb1136686ea2393683fd6761648ba17784a8 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Thu, 19 Dec 2024 23:57:06 +0000 Subject: [PATCH 06/10] Add sanger profile to README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6a68024..1140adf 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,7 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage | `standard`
(Default) | Docker is used as the container engine.
Processes are executed locally. | | `singularity` | Singularity is used as the container engine.
Processes are executed locally. | | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.**
Singularity is used as the container engine.
Processes are submitted to your LSF cluster via `bsub` by the pipeline.
(Tested on Wellcome Sanger Institute farm5 LSF cluster only)
(Option `--kraken2_memory_mapping` default change to `false`.) | + | `sanger` | **Only required for Sanger HPC cluster.**
Intended to be used in combination with `lsf` profile. | ## Resume > [!TIP] From a0eafcf739b150ae33711f8f76633e0cc5ca241a Mon Sep 17 00:00:00 2001 From: William Roberts Date: Fri, 20 Dec 2024 10:23:45 +0000 Subject: [PATCH 07/10] Add cloning instructions --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1140adf..ab685dc 100644 --- a/README.md +++ b/README.md @@ -153,17 +153,22 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage > [!WARNING] > - Docker or Singularity must be running > - An Internet connection is required -1. Clone the repository (if Git is installed on your system) +1. Clone the repository (`git` is installed on your system) ``` - git clone https://github.com/GlobalPneumoSeq/gps-pipeline.git + git clone --recurse-submodules https://github.com/GlobalPneumoSeq/gps-pipeline.git ``` - or + > Note: The pipeline depends on git submodules. If you don't clone with `--recurse-submodules`, you can correct this with `git submodule update --init`. - Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases) + OR + + - Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases). + - Click [here](./assorted-sub-workflows) and download the source code (`Code -> Download Zip`). Unzip/extract the code in the same directory as the gps-pipeline root. + 2. Go into the local directory of the pipeline and it is ready to use without installation (the directory name might be different) ``` cd gps-pipeline ``` + 3. (Optional) You could perform an initialisation to download all required additional files and container images, so the pipeline can be used at any time with or without the Internet afterwards. 
- Using Docker as the container engine ``` From ec7c05eacc8fc68b83780414a4835ca2d8a2db4a Mon Sep 17 00:00:00 2001 From: William Roberts Date: Fri, 20 Dec 2024 10:45:33 +0000 Subject: [PATCH 08/10] Only suggest git --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ab685dc..c08e1d4 100644 --- a/README.md +++ b/README.md @@ -153,16 +153,18 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage > [!WARNING] > - Docker or Singularity must be running > - An Internet connection is required -1. Clone the repository (`git` is installed on your system) +1. Clone the repository (`git` must be installed on your system) ``` git clone --recurse-submodules https://github.com/GlobalPneumoSeq/gps-pipeline.git ``` > Note: The pipeline depends on git submodules. If you don't clone with `--recurse-submodules`, you can correct this with `git submodule update --init`. - - OR - - - Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases). - - Click [here](./assorted-sub-workflows) and download the source code (`Code -> Download Zip`). Unzip/extract the code in the same directory as the gps-pipeline root. + + To use a particular version of this pipeline, navigate into the root directory of the gps-pipeline repository and check out a particular branch or tag: + ``` + git checkout <branch-or-tag> + ``` + + See [Releases/Tags](./releases) and [Branches](./branches) for possibilities. 2. 
Go into the local directory of the pipeline and it is ready to use without installation (the directory name might be different) ``` From ef01f584deab2d8ee2df358e99f77b2781daa359 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Fri, 20 Dec 2024 10:50:40 +0000 Subject: [PATCH 09/10] Use HTTPS for submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 340f1e0..1283bf2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "assorted-sub-workflows"] path = assorted-sub-workflows - url = git@github.com:sanger-pathogens/assorted-sub-workflows.git + url = https://github.com/sanger-pathogens/assorted-sub-workflows.git From 76d750adf34d7ef9307e3bc3106e85330f58ea9b Mon Sep 17 00:00:00 2001 From: William Roberts Date: Tue, 28 Jan 2025 10:55:10 +0000 Subject: [PATCH 10/10] Change config to use nextflow commons master --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0bbc1de..bedf7c7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,7 @@ nextflow.enable.dsl=2 // Import mixed input params -includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/cb60d523f202dace1b1efd02a5ae03f8a4b049a6/configs/nextflow.config" +includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/refs/heads/master/configs/nextflow.config" includeConfig "$projectDir/assorted-sub-workflows/irods_extractor/subworkflows/irods.config" includeConfig "$projectDir/assorted-sub-workflows/mixed_input/subworkflows/mixed_input.config"