Add bambu_rec_ndr option (#33)

* Update Stringtie to v3.0.0 * Bump ANNEXA version * Add 'bambu_ndr_option' Add option to use Bambu's recommend NDR threshold. * Fixes to rec_ndr option * Added info option to ReadMe * Bump ANNEXA version
IGDRion · Feb 5, 2025 · ff70fa2 · ff70fa2
1 parent bc81478
commit ff70fa2
Show file tree

Hide file tree

Showing 13 changed files with 60 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -77,6 +77,7 @@ Bambu options
   --bambu_singleexon    [boolean] Include single exon transcripts in Bambu output or not. These are known to have a high frequency of false positives.
                                   [default: true]
   --bambu_threshold     [integer] bambu NDR threshold below which new transcripts are retained. [default: 0.2]
+  --bambu_rec_ndr       [boolean] Use NDR threshold recommended by Bambu instead of preset threshold. [default: false]
 
 Filtering options
   --tfkmers_tokenizer   [string]  Path to TransforKmers tokenizer. Required if filter option is activated.

diff --git a/bin/extract_stringtie_counts.R b/bin/extract_stringtie_counts.R
@@ -43,7 +43,7 @@ geneCountMatrix <- extractGeneExpression(
 ref <- rtracklayer::import(ref)
 
 ## Rename missing MSTRG to ref_gene_id in geneCountMatrix when unambiguous
-## Ambiguous gene_ids are left as they are (with MSTRG id) and reference in a new list
+## Ambiguous gene_ids are left as they are (with MSTRG id) and referenced in a new list
 # Rename missing MSTRG to ref_gene_id in geneCountMatrix
 mstrg_mapping <- as.data.frame(ref[startsWith(ref$gene_id, "MSTRG") & !is.na(ref$ref_gene_id), c("gene_id", "ref_gene_id")])
 ambiguous_mappings <- mstrg_mapping %>%

diff --git a/bin/run_bambu.R b/bin/run_bambu.R
@@ -16,6 +16,7 @@ Rsamtools::indexFa(genomeseq)
 annot_gtf      <- strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
 bambu_strand   <- strsplit(grep('--bambu_strand*', args, value = TRUE), split = '=')[[1]][[2]]
 bambu_singleexon   <- strsplit(grep('--bambu_singleexon*', args, value = TRUE), split = '=')[[1]][[2]]
+bambu_rec_ndr   <- strsplit(grep('--bambu_rec_ndr*', args, value = TRUE), split = '=')[[1]][[2]]
 if (bambu_strand=="true"){
   bambu_strand=TRUE
 } else {
@@ -26,7 +27,7 @@ if (bambu_singleexon=="true"){
 } else {
   bambu_singleexon=FALSE
 }
-readlist       <- args[7:length(args)]
+readlist       <- args[8:length(args)]
 
 print("BAMs:")
 readlist
@@ -42,17 +43,22 @@ if (bambu_singleexon==TRUE) {
   opt_discovery <- NULL
 }
 
-se     <- bambu(
-  reads = readlist,
-  annotations = grlist,
-  genome = genomeSequence,
-  ncore = ncore,
-  verbose = TRUE,
-  NDR = 1,
-  stranded = bambu_strand,
-  opt.discovery = opt_discovery
+output <- capture.output(
+  se     <- bambu(
+    reads = readlist,
+    annotations = grlist,
+    genome = genomeSequence,
+    ncore = ncore,
+    verbose = TRUE,
+    NDR = 1,
+    stranded = bambu_strand,
+    opt.discovery = opt_discovery
+  ),
+  type = "message"
 )
 
+cat(output, sep = "\n")
+
 # Extract NDR
 tx_infos   <- se@rowRanges@elementMetadata
 new_tx_idx <- tx_infos[tx_infos$novelTranscript == "TRUE" & tx_infos$txClassDescription != "annotation", c(1,2,3) ]
@@ -65,3 +71,16 @@ write.csv(
 
 # Write GTF and counts
 writeBambuOutput(se, output_tag)
+
+if (bambu_rec_ndr=="true"){
+  # Extract recommended NDR
+  ndr_line <- output[grepl("we recommend an NDR of", output)]
+  match <- regexec("of ([0-9\\.]+)\\.", ndr_line)
+  ndr_value <- as.numeric(regmatches(ndr_line, match)[[1]][2])
+  write(ndr_value, file = "rec_ndr.txt")
+
+  # Write filtered GTF and counts
+  message("Creating a filtered GTF using a recommended threshold of: ",ndr_value)
+  se.filter <- se[mcols(se)$NDR < ndr_value | is.na(mcols(se)$NDR),]
+  writeToGTF(rowRanges(se.filter),"extended_annotations.NDR_filter.gtf")
+}
diff --git a/main.nf b/main.nf
@@ -107,12 +107,14 @@ workflow {
     ch_gene_counts = BAMBU.out.gene_counts
     ch_tx_counts = BAMBU.out.tx_counts
     ch_ndr = BAMBU.out.ndr
+    ch_rec_ndr = BAMBU.out.rec_ndr
     class_code = GFFCOMPARE.out.class_code_gtf
   }
   else if (params.tx_discovery == "stringtie2") {
     ch_gene_counts = STRINGTIE.out.gene_counts
     ch_tx_counts = STRINGTIE.out.tx_counts
     ch_ndr = STRINGTIE.out.ndr
+    ch_rec_ndr = STRINGTIE.out.rec_ndr
     class_code = STRINGTIE.out.class_code_gtf
   }
 
@@ -138,6 +140,7 @@ workflow {
     TFKMERS(RESTRAND_NOVEL.out, 
             ref_fa, 
             ch_ndr, 
+            ch_rec_ndr, 
             tokenizer, 
             model, 
             ch_tx_counts)

diff --git a/modules/bambu/bambu.nf b/modules/bambu/bambu.nf
@@ -16,6 +16,8 @@ process BAMBU {
   path 'counts_transcript.txt', emit: tx_counts
   path 'counts_gene.txt', emit: gene_counts
   path 'bambu_ndr.csv', emit: ndr
+  path 'rec_ndr.txt', emit: rec_ndr, optional: true
+  path 'extended_annotations.NDR_filter.gtf', emit: ndr_filter_bambu_gtf, optional: true
 
   script:
   """
@@ -26,6 +28,7 @@ process BAMBU {
     --fasta=${fa} \
     --bambu_strand=${params.bambu_strand} \
     --bambu_singleexon=${params.bambu_singleexon} \
+    --bambu_rec_ndr=${params.bambu_rec_ndr} \
     *.bam
 
   sed -i 's/*/./g' extended_annotations.gtf

diff --git a/modules/check_ndr.nf b/modules/check_ndr.nf
diff --git a/modules/header.nf b/modules/header.nf
@@ -27,6 +27,7 @@ Tfkmers Tokenizer     : ${params.tfkmers_tokenizer}
 Tfkmers Threshold     : ${params.tfkmers_threshold}
 Bambu Threshold       : ${params.bambu_threshold}
 Bambu Single Exons    : ${params.bambu_singleexon}
+Bambu Recommended NDR : ${params.bambu_rec_ndr}
 Filtering operation   : ${params.operation}
 Stranded              : ${params.bambu_strand}
 -${c_dim}-------------------------------------${c_reset}-

diff --git a/modules/stringtie/stringtie_extract_counts.nf b/modules/stringtie/stringtie_extract_counts.nf
@@ -15,6 +15,7 @@ process EXTRACT_QUANTS {
     path "counts_gene.txt", emit: gene_counts
     path "counts_transcript.txt", emit: tx_counts
     path "empty.ndr", emit: ndr
+    path "rec_ndr.txt", emit: rec_ndr
 
     script:
     def mean_length = bam_length.collect { it as double }.sum() / bam_length.size()
@@ -26,5 +27,6 @@ process EXTRACT_QUANTS {
 
     # Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step (placeholder)
     touch empty.ndr
+    echo "1" > rec_ndr.txt
     """
 }
diff --git a/modules/stringtie/stringtie_workflow.nf b/modules/stringtie/stringtie_workflow.nf
@@ -48,4 +48,5 @@ workflow STRINGTIE {
     gene_counts = EXTRACT_QUANTS.out.gene_counts
     tx_counts = EXTRACT_QUANTS.out.tx_counts
     ndr = EXTRACT_QUANTS.out.ndr
+    rec_ndr = EXTRACT_QUANTS.out.rec_ndr
 }
diff --git a/modules/transforkmers/filter.nf b/modules/transforkmers/filter.nf
@@ -10,6 +10,7 @@ process FILTER {
   file counts_tx
   file tfkmers
   file bambu_ndr
+  file rec_ndr
 
   output:
   path "novel.filter.gtf", emit: gtf
@@ -20,13 +21,19 @@ process FILTER {
   """
   cp ${counts_tx} counts_transcript.full.txt
 
+  if [ "${params.bambu_rec_ndr}" == "true" ]; then
+    bambu_threshold=\$(cat ${rec_ndr})
+  else
+    bambu_threshold=${params.bambu_threshold}
+  fi
+
   filter_gtf_ndr.py \
     --gtf ${novel} \
     --counts_tx ${counts_tx} \
     --tfkmers ${tfkmers} \
     --bambu ${bambu_ndr} \
     --tfkmers-threshold ${params.tfkmers_threshold} \
-    --bambu-threshold ${params.bambu_threshold} \
+    --bambu-threshold \$bambu_threshold \
     --operation ${params.operation} \
     --tx_discovery ${params.tx_discovery}
 

diff --git a/modules/transforkmers/workflow.nf b/modules/transforkmers/workflow.nf
@@ -8,6 +8,7 @@ workflow TFKMERS {
     novel_gtf
     ref_fa
     bambu_ndr
+    rec_ndr
     tokenizer
     model
     counts_tx
@@ -32,7 +33,8 @@ workflow TFKMERS {
       novel_gtf,
       counts_tx,
       PREDICT.out,
-      bambu_ndr
+      bambu_ndr,
+      rec_ndr
     )
 
   emit:

diff --git a/nextflow.config b/nextflow.config
@@ -17,6 +17,7 @@ params {
   tfkmers_tokenizer = null
   bambu_strand      = true
   bambu_singleexon  = true
+  bambu_rec_ndr     = false
   tx_discovery      = "bambu"
   help              = false
 }
@@ -42,7 +43,6 @@ profiles {
       withGeneCoverage = true
       maxCpu = 2
       maxMemory = '8GB'
-      tx_discovery = 'bambu'
     }
   }
 
@@ -68,5 +68,5 @@ manifest {
 	description = 'Analysis of Nanopore with Nextflow for EXtended Annotation'
 	mainScript = 'main.nf'
 	nextflowVersion = '>=19.10.0'
-	version = '3.2.2'
+	version = '3.2.3'
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -106,8 +106,14 @@
                     "type": "integer",
                     "description": "bambu NDR threshold below which new transcripts are retained.",
                     "default": "0.2"
+                },
+                "bambu_rec_ndr": {
+                    "type": "boolean",
+                    "description": "Use NDR threshold recommended by Bambu instead of preset threshold.",
+                    "default": "false"
                 }
             }
+            }
         },
         "filtering_options": {
             "title": "Filtering options",