From 42492c697da2048d5df96b925c0e3793954bfadb Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Tue, 17 Dec 2024 16:54:27 +0000
Subject: [PATCH 01/10] Add assorted-sub-workflows

---
 .gitmodules            | 3 +++
 assorted-sub-workflows | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 assorted-sub-workflows

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..340f1e0
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "assorted-sub-workflows"]
+	path = assorted-sub-workflows
+	url = git@github.com:sanger-pathogens/assorted-sub-workflows.git
diff --git a/assorted-sub-workflows b/assorted-sub-workflows
new file mode 160000
index 0000000..618a80b
--- /dev/null
+++ b/assorted-sub-workflows
@@ -0,0 +1 @@
+Subproject commit 618a80b6afebd5334225cb5e9475094109fb9789

From f08ba3437faa213c3eea39d2847cf35a70b42b58 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Wed, 18 Dec 2024 17:40:46 +0000
Subject: [PATCH 02/10] Update pipeline to use MIXED_INPUT

---
 modules/validate.nf   | 40 ++++++++++++++++++++++++++++++++++++++++
 nextflow.config       | 18 +++++++++++++++++-
 workflows/pipeline.nf | 14 +++++++++++++-
 3 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/modules/validate.nf b/modules/validate.nf
index c8e2e8e..47ee4a9 100644
--- a/modules/validate.nf
+++ b/modules/validate.nf
@@ -1,3 +1,41 @@
+// Map of parameters accepted as valid without a declared value type; each maps to the placeholder 'skip' (assumed to bypass type checking in validate() - TODO confirm)
+skipValidationParams = [
+    // Parameters defined by the common config
+    input: 'skip',
+    tracedir: 'skip',
+    max_memory: 'skip',
+    max_cpus: 'skip',
+    max_time: 'skip',
+    max_retries: 'skip',
+    retry_strategy: 'skip',
+    queue_size: 'skip',
+    submit_rate_limit: 'skip',
+    // Parameters defined by the mixed input config
+    outdir: 'skip',
+    manifest_of_reads: 'skip',
+    manifest_of_lanes: 'skip',
+    manifest: 'skip',
+    save_metadata: 'skip',
+    combine_same_id_crams: 'skip',
+    dehumanising_method: 'skip',
+    cleanup_intermediate_files_irods_extractor: 'skip',
+    save_fastqs: 'skip',
+    save_method: 'skip',
+    raw_reads_prefix: 'skip',
+    preexisting_fastq_tag: 'skip',
+    split_sep_for_ID_from_fastq: 'skip',
+    lane_plex_sep: 'skip',
+    start_queue: 'skip',
+    irods_subset_to_skip: 'skip',
+    short_metacsv_name: 'skip',
+    studyid: 'skip',
+    runid: 'skip',
+    laneid: 'skip',
+    plexid: 'skip',
+    target: 'skip',
+    type: 'skip'
+] 
+
 // Map of valid parameters and their value types
 validParams = [
     help: 'boolean',
@@ -31,6 +69,8 @@ validParams = [
     lite: 'boolean'
 ]
 
+validParams += skipValidationParams
+
 // Validate whether all provided parameters are valid
 void validate(Map params) {
     // Ensure only one or none of the alternative workflows is selected
diff --git a/nextflow.config b/nextflow.config
index c718543..0bbc1de 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,5 +1,10 @@
 nextflow.enable.dsl=2
 
+// Import mixed input params
+includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/cb60d523f202dace1b1efd02a5ae03f8a4b049a6/configs/nextflow.config"
+includeConfig "$projectDir/assorted-sub-workflows/irods_extractor/subworkflows/irods.config"
+includeConfig "$projectDir/assorted-sub-workflows/mixed_input/subworkflows/mixed_input.config"
+
 // Default parameters that can be overridden
 params {
     // Show help message
@@ -13,6 +18,8 @@ params {
     reads = "$projectDir/input"
     // Default output directory
     output = "$projectDir/output"
+    // To allow mixed input to work without warnings
+    outdir = output
     
     // Default databases directory for saving all the required databases
     db = "$projectDir/databases"
@@ -63,8 +70,11 @@ params {
     lite = false
 }
 
-// Set auto-retry and process container images
 process {
+    // Avoid use of `-o pipefail` as this fails version info collation
+    shell = ['/bin/bash', '-eu']
+
+    // Set auto-retry and process container images
     maxRetries = 2
     errorStrategy = { task.attempt <= process.maxRetries ? 'retry' : 'ignore' }
 
@@ -192,4 +202,10 @@ profiles {
         }
     }
 
+    // Profile for Sanger
+    sanger {
+        singularity {
+            runOptions = '--bind /lustre,/nfs,/data,/software,/tmp'
+        }
+    }
 }
diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf
index 34f6041..0f299e8 100644
--- a/workflows/pipeline.nf
+++ b/workflows/pipeline.nf
@@ -10,6 +10,9 @@ include { MLST } from "$projectDir/modules/mlst"
 include { PBP_RESISTANCE; PARSE_PBP_RESISTANCE; GET_ARIBA_DB; OTHER_RESISTANCE; PARSE_OTHER_RESISTANCE } from "$projectDir/modules/amr"
 include { GENERATE_SAMPLE_REPORT; GENERATE_OVERALL_REPORT } from "$projectDir/modules/output"
 
+// Import subworkflows
+include { MIXED_INPUT } from "$projectDir/assorted-sub-workflows/mixed_input/mixed_input"
+
 // Main pipeline workflow
 workflow PIPELINE {
     main:
@@ -29,8 +32,17 @@ workflow PIPELINE {
     // Get path to ARIBA database, generate from reference sequences and metadata if ncessary
     GET_ARIBA_DB(params.ariba_ref, params.ariba_metadata, params.db)
 
+    // Obtain input from manifests and iRODS params
+    MIXED_INPUT
+    | map { meta, R1, R2 -> [meta.ID, [R1, R2]] }
+    | set { raw_read_pairs_ch }
+
     // Get read pairs into Channel raw_read_pairs_ch
-    raw_read_pairs_ch = Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)
+    if (params.reads) {
+        Channel.fromFilePairs("$params.reads/*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}", checkIfExists: true)
+        | mix(raw_read_pairs_ch)
+        | set { raw_read_pairs_ch }
+    }
 
     // Basic input files validation
     // Output into Channel FILE_VALIDATION.out.result

From 6c44b89485acab4ef3b8b2bf9323b36f1f417845 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Thu, 19 Dec 2024 10:49:04 +0000
Subject: [PATCH 03/10] Update assorted-sub-workflows submodule

---
 assorted-sub-workflows | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assorted-sub-workflows b/assorted-sub-workflows
index 618a80b..e966ed3 160000
--- a/assorted-sub-workflows
+++ b/assorted-sub-workflows
@@ -1 +1 @@
-Subproject commit 618a80b6afebd5334225cb5e9475094109fb9789
+Subproject commit e966ed3c9d960a02f27c7def89d8a4ef22a4a1de

From 21c234312a7558a9e1f0b85fc85e096271505f05 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Thu, 19 Dec 2024 22:58:04 +0000
Subject: [PATCH 04/10] Fix bug - repeated value during join

---
 workflows/pipeline.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/pipeline.nf b/workflows/pipeline.nf
index 0f299e8..16ee6ed 100644
--- a/workflows/pipeline.nf
+++ b/workflows/pipeline.nf
@@ -126,7 +126,7 @@ workflow PIPELINE {
     // Merge Channels FILE_VALIDATION.out.result & READ_QC.out.result & ASSEMBLY_QC.out.result & MAPPING_QC.out.result & TAXONOMY_QC.out.result to provide Overall QC Status
     // Output into Channel OVERALL_QC.out.result & OVERALL_QC.out.report
     OVERALL_QC(
-        raw_read_pairs_ch.map{ it[0] }
+        raw_read_pairs_ch.map{ [it[0]] }
         .join(FILE_VALIDATION.out.result, failOnDuplicate: true, remainder: true)
         .join(READ_QC.out.result, failOnDuplicate: true, remainder: true)
         .join(ASSEMBLY_QC.out.result, failOnDuplicate: true, remainder: true)
@@ -173,7 +173,7 @@ workflow PIPELINE {
 
     // Generate sample reports by merging outputs from all result-generating modules
     GENERATE_SAMPLE_REPORT(
-        raw_read_pairs_ch.map{ it[0] }
+        raw_read_pairs_ch.map{ [it[0]] }
         .join(READ_QC.out.report, failOnDuplicate: true, remainder: true)
         .join(ASSEMBLY_QC.out.report, failOnDuplicate: true, remainder: true)
         .join(MAPPING_QC.out.report, failOnDuplicate: true, remainder: true)

From 328271c022e0895b3ee51684ff96d4eaabb02fc2 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Thu, 19 Dec 2024 23:53:04 +0000
Subject: [PATCH 05/10] Update help message and README.md

---
 README.md           | 67 +++++++++++++++++++++++++++++++++++++++++----
 modules/messages.nf |  9 +++---
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index db6d1de..6a68024 100644
--- a/README.md
+++ b/README.md
@@ -85,13 +85,70 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage
 > - The pipeline generates ~1.8GB intermediate files for each sample on average
 >     - These files can be removed when the pipeline run is completed, please refer to [Clean Up](#clean-up)
 >     - To further reduce storage requirement by sacrificing the ability to resume the pipeline, please refer to [Experimental](#experimental)
+
 ## Accepted Inputs
 - Only Illumina paired-end short reads are supported
-- Each sample is expected to be a pair of raw reads following this file name pattern: 
-    - `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}` 
-        - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz`
-        - example 2: `SampleName_1.fastq.gz`, `SampleName_2.fastq.gz`
-        - example 3: `SampleName_R1.fq`, `SampleName_R2.fq`
+- Any combination of the following input options is supported:
+  1. `--reads`:  
+     Specify a directory of per-sample paired (gzipped) fastq files containing reads (files named according to the following pattern `*_{,R}{1,2}{,_001}.{fq,fastq}{,.gz}`):
+       - example 1: `SampleName_R1_001.fastq.gz`, `SampleName_R2_001.fastq.gz`
+       - example 2: `SampleName_1.fastq.gz`, `SampleName_2.fastq.gz`
+       - example 3: `SampleName_R1.fq`, `SampleName_R2.fq`
+
+  2. `--manifest_of_reads` or `--manifest`:  
+     Specify the paths to (gzipped) fastq files
+     containing reads via a CSV manifest, listing the pair of read files pertaining to a sample, one per row.
+      
+  3. **iRODS attribute parameters** (Sanger HPC only):  
+     Specify a combination of iRODS attributes to search for reads to use as pipeline input.
+
+     The selected set of data files is defined by a combination of parameters: `--studyid`, `--runid`, `--laneid`, `--plexid`, `--target` and `--type` (these refer to specifics of the sequencing experiment and data to be retrieved).
+
+     Each parameter restricts the set of data files that match and will be downloaded. With the exception of `--type` and `--target`, omitting an option causes samples for all possible values of the parameter to be retrieved.
+
+     Either `--studyid` or `--runid` is required, while `--laneid`, `--plexid`, `--target` and `--type` are optional. Requiring one of the two avoids indiscriminately and unintentionally downloading thousands of files.
+     ```
+      --studyid
+            default: -1
+            Sequencing Study ID
+      --runid
+            default: -1
+            Sequencing Run ID
+      --laneid
+            default: -1
+            Sequencing Lane ID
+      --plexid
+            default: -1
+            Sequencing Plex ID
+      --target
+            default: 1
+            Marker of key data product likely to be of interest to customer
+      --type
+            default: cram
+            File type
+     ```
+      
+  4. `--manifest_of_lanes` (Sanger HPC only):  
+     Specify a CSV manifest listing a batch of iRODS parameter combinations.
+     
+     Valid column headings include the individual parameter options described above: `studyid`, `runid`, `laneid`, `plexid`, or any other iRODS metadata attribute, e.g. `sample_common_name`, `sample_supplier_name`.
+     Corresponding fields in the CSV manifest file can be left blank.
+     
+     `laneid` and `plexid` are only considered when provided alongside a `studyid` or `runid`.
+       - example 1:
+         ```
+         studyid,runid,laneid,plexid
+         ,37822,2,354
+         5970,37822,,332
+         5970,37822,2,
+         ```
+       - example 2:
+         ```
+         sample_common_name,type,target
+         Romboutsia lituseburensis,cram,1
+         Romboutsia lituseburensis,cram,0
+         ```
+
 ## Setup 
 > [!WARNING]
 > - Docker or Singularity must be running
diff --git a/modules/messages.nf b/modules/messages.nf
index 4e5cc4c..72bd6d8 100644
--- a/modules/messages.nf
+++ b/modules/messages.nf
@@ -28,10 +28,11 @@ void helpMessage() {
         |./run_pipeline [option] [value]
         |
         |All options are optional, some common options:
-        |--reads [PATH]    Path to the input directory that contains the reads to be processed
-        |--output [PATH]   Path to the output directory that save the results
-        |--init          Alternative workflow for initialisation
-        |--version       Alternative workflow for getting versions of pipeline, container images, tools and databases
+        |--reads [PATH]     Path to the input directory that contains the reads to be processed
+        |--manifest [PATH]  Path to input CSV (headings: ID,R1,R2), listing a pair (gzipped) fastq files pertaining to a sample, one per row
+        |--output [PATH]    Path to the output directory that save the results
+        |--init             Alternative workflow for initialisation
+        |--version          Alternative workflow for getting versions of pipeline, container images, tools and databases
         |
         |For all available options, please refer to README.md
         '''.stripMargin()

From 01c5fb1136686ea2393683fd6761648ba17784a8 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Thu, 19 Dec 2024 23:57:06 +0000
Subject: [PATCH 06/10] Add sanger profile to README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6a68024..1140adf 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,7 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage
     | `standard`<br> (Default) | Docker is used as the container engine. <br> Processes are executed locally. |
     | `singularity` |  Singularity is used as the container engine. <br> Processes are executed locally. |
     | `lsf` | **The pipeline should be launched from a LSF cluster head node with this profile.** <br>Singularity is used as the container engine. <br> Processes are submitted to your LSF cluster via `bsub` by the pipeline. <br> (Tested on Wellcome Sanger Institute farm5 LSF cluster only) <br> (Option `--kraken2_memory_mapping` default change to `false`.) |
+    | `sanger` | **Only required for Sanger HPC cluster.** <br>Intended to be used in combination with `lsf` profile. |
 
 ## Resume
 > [!TIP]

From a0eafcf739b150ae33711f8f76633e0cc5ca241a Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Fri, 20 Dec 2024 10:23:45 +0000
Subject: [PATCH 07/10] Add cloning instructions

---
 README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1140adf..ab685dc 100644
--- a/README.md
+++ b/README.md
@@ -153,17 +153,22 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage
 > [!WARNING]
 > - Docker or Singularity must be running
 > - An Internet connection is required
-1. Clone the repository (if Git is installed on your system)
+1. Clone the repository (`git` is installed on your system)
     ```
-    git clone https://github.com/GlobalPneumoSeq/gps-pipeline.git
+    git clone --recurse-submodules https://github.com/GlobalPneumoSeq/gps-pipeline.git
     ```
-    or 
+    > Note: The pipeline depends on git submodules. If you don't clone with `--recurse-submodules`, you can correct this with `git submodule update --init`.
     
-    Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases)
+    OR
+    
+    - Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases).
+    - Click [here](./assorted-sub-workflows) and download the source code (`Code -> Download Zip`). Unzip/extract the code in the same directory as the gps-pipeline root.
+
 2. Go into the local directory of the pipeline and it is ready to use without installation (the directory name might be different)
     ```
     cd gps-pipeline
     ```
+
 3. (Optional) You could perform an initialisation to download all required additional files and container images, so the pipeline can be used at any time with or without the Internet afterwards.
     - Using Docker as the container engine
         ```

From ec7c05eacc8fc68b83780414a4835ca2d8a2db4a Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Fri, 20 Dec 2024 10:45:33 +0000
Subject: [PATCH 08/10] Only suggest git

---
 README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index ab685dc..c08e1d4 100644
--- a/README.md
+++ b/README.md
@@ -153,16 +153,18 @@ It is recommended to have at least 16GB of RAM and 100GB of free storage
 > [!WARNING]
 > - Docker or Singularity must be running
 > - An Internet connection is required
-1. Clone the repository (`git` is installed on your system)
+1. Clone the repository (`git` must be installed on your system)
     ```
     git clone --recurse-submodules https://github.com/GlobalPneumoSeq/gps-pipeline.git
     ```
     > Note: The pipeline depends on git submodules. If you don't clone with `--recurse-submodules`, you can correct this with `git submodule update --init`.
-    
-    OR
-    
-    - Download and unzip/extract the [latest release](https://github.com/GlobalPneumoSeq/gps-pipeline/releases).
-    - Click [here](./assorted-sub-workflows) and download the source code (`Code -> Download Zip`). Unzip/extract the code in the same directory as the gps-pipeline root.
+
+    To use a particular version of this pipeline, navigate into the root directory of the pipeline (`gps-pipeline`) and check out a particular branch or tag:
+    ```
+    git checkout <tag/branch>
+    ```
+
+    See [Releases/Tags](https://github.com/GlobalPneumoSeq/gps-pipeline/releases) and [Branches](https://github.com/GlobalPneumoSeq/gps-pipeline/branches) for possibilities.
 
 2. Go into the local directory of the pipeline and it is ready to use without installation (the directory name might be different)
     ```

From ef01f584deab2d8ee2df358e99f77b2781daa359 Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Fri, 20 Dec 2024 10:50:40 +0000
Subject: [PATCH 09/10] Use HTTPS for submodule

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 340f1e0..1283bf2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "assorted-sub-workflows"]
 	path = assorted-sub-workflows
-	url = git@github.com:sanger-pathogens/assorted-sub-workflows.git
+	url = https://github.com/sanger-pathogens/assorted-sub-workflows.git

From 76d750adf34d7ef9307e3bc3106e85330f58ea9b Mon Sep 17 00:00:00 2001
From: William Roberts <wr7@sanger.ac.uk>
Date: Tue, 28 Jan 2025 10:55:10 +0000
Subject: [PATCH 10/10] Change config to use nextflow commons master

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 0bbc1de..bedf7c7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,7 +1,7 @@
 nextflow.enable.dsl=2
 
 // Import mixed input params
-includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/cb60d523f202dace1b1efd02a5ae03f8a4b049a6/configs/nextflow.config"
+includeConfig "https://raw.githubusercontent.com/sanger-pathogens/nextflow-commons/refs/heads/master/configs/nextflow.config"
 includeConfig "$projectDir/assorted-sub-workflows/irods_extractor/subworkflows/irods.config"
 includeConfig "$projectDir/assorted-sub-workflows/mixed_input/subworkflows/mixed_input.config"