Skip to content

Commit

Permalink
Add bambu_rec_ndr option (#33)
Browse files Browse the repository at this point in the history
* Update Stringtie to v3.0.0

* Bump ANNEXA version

* Add 'bambu_ndr_option'

Add option to use Bambu's recommend NDR threshold.

* Fixes to rec_ndr option

* Added info option to ReadMe

* Bump ANNEXA version
  • Loading branch information
N-Hoffmann authored Feb 5, 2025
1 parent bc81478 commit ff70fa2
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 28 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Bambu options
--bambu_singleexon [boolean] Include single exon transcripts in Bambu output or not. These are known to have a high frequency of false positives.
[default: true]
--bambu_threshold [integer] bambu NDR threshold below which new transcripts are retained. [default: 0.2]
--bambu_rec_ndr [boolean] Use NDR threshold recommended by Bambu instead of preset threshold. [default: false]

Filtering options
--tfkmers_tokenizer [string] Path to TransforKmers tokenizer. Required if filter option is activated.
Expand Down
2 changes: 1 addition & 1 deletion bin/extract_stringtie_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ geneCountMatrix <- extractGeneExpression(
ref <- rtracklayer::import(ref)

## Rename missing MSTRG to ref_gene_id in geneCountMatrix when unambiguous
## Ambiguous gene_ids are left as they are (with MSTRG id) and reference in a new list
## Ambiguous gene_ids are left as they are (with MSTRG id) and referenced in a new list
# Rename missing MSTRG to ref_gene_id in geneCountMatrix
mstrg_mapping <- as.data.frame(ref[startsWith(ref$gene_id, "MSTRG") & !is.na(ref$ref_gene_id), c("gene_id", "ref_gene_id")])
ambiguous_mappings <- mstrg_mapping %>%
Expand Down
39 changes: 29 additions & 10 deletions bin/run_bambu.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Rsamtools::indexFa(genomeseq)
annot_gtf <- strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
bambu_strand <- strsplit(grep('--bambu_strand*', args, value = TRUE), split = '=')[[1]][[2]]
bambu_singleexon <- strsplit(grep('--bambu_singleexon*', args, value = TRUE), split = '=')[[1]][[2]]
bambu_rec_ndr <- strsplit(grep('--bambu_rec_ndr*', args, value = TRUE), split = '=')[[1]][[2]]
if (bambu_strand=="true"){
bambu_strand=TRUE
} else {
Expand All @@ -26,7 +27,7 @@ if (bambu_singleexon=="true"){
} else {
bambu_singleexon=FALSE
}
readlist <- args[7:length(args)]
readlist <- args[8:length(args)]

print("BAMs:")
readlist
Expand All @@ -42,17 +43,22 @@ if (bambu_singleexon==TRUE) {
opt_discovery <- NULL
}

se <- bambu(
reads = readlist,
annotations = grlist,
genome = genomeSequence,
ncore = ncore,
verbose = TRUE,
NDR = 1,
stranded = bambu_strand,
opt.discovery = opt_discovery
output <- capture.output(
se <- bambu(
reads = readlist,
annotations = grlist,
genome = genomeSequence,
ncore = ncore,
verbose = TRUE,
NDR = 1,
stranded = bambu_strand,
opt.discovery = opt_discovery
),
type = "message"
)

cat(output, sep = "\n")

# Extract NDR
tx_infos <- se@rowRanges@elementMetadata
new_tx_idx <- tx_infos[tx_infos$novelTranscript == "TRUE" & tx_infos$txClassDescription != "annotation", c(1,2,3) ]
Expand All @@ -65,3 +71,16 @@ write.csv(

# Write GTF and counts
writeBambuOutput(se, output_tag)

if (bambu_rec_ndr=="true"){
# Extract recommended NDR
ndr_line <- output[grepl("we recommend an NDR of", output)]
match <- regexec("of ([0-9\\.]+)\\.", ndr_line)
ndr_value <- as.numeric(regmatches(ndr_line, match)[[1]][2])
write(ndr_value, file = "rec_ndr.txt")

# Write filtered GTF and counts
message("Creating a filtered GTF using a recommended threshold of: ",ndr_value)
se.filter <- se[mcols(se)$NDR < ndr_value | is.na(mcols(se)$NDR),]
writeToGTF(rowRanges(se.filter),"extended_annotations.NDR_filter.gtf")
}
3 changes: 3 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,14 @@ workflow {
ch_gene_counts = BAMBU.out.gene_counts
ch_tx_counts = BAMBU.out.tx_counts
ch_ndr = BAMBU.out.ndr
ch_rec_ndr = BAMBU.out.rec_ndr
class_code = GFFCOMPARE.out.class_code_gtf
}
else if (params.tx_discovery == "stringtie2") {
ch_gene_counts = STRINGTIE.out.gene_counts
ch_tx_counts = STRINGTIE.out.tx_counts
ch_ndr = STRINGTIE.out.ndr
ch_rec_ndr = STRINGTIE.out.rec_ndr
class_code = STRINGTIE.out.class_code_gtf
}

Expand All @@ -138,6 +140,7 @@ workflow {
TFKMERS(RESTRAND_NOVEL.out,
ref_fa,
ch_ndr,
ch_rec_ndr,
tokenizer,
model,
ch_tx_counts)
Expand Down
3 changes: 3 additions & 0 deletions modules/bambu/bambu.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ process BAMBU {
path 'counts_transcript.txt', emit: tx_counts
path 'counts_gene.txt', emit: gene_counts
path 'bambu_ndr.csv', emit: ndr
path 'rec_ndr.txt', emit: rec_ndr, optional: true
path 'extended_annotations.NDR_filter.gtf', emit: ndr_filter_bambu_gtf, optional: true

script:
"""
Expand All @@ -26,6 +28,7 @@ process BAMBU {
--fasta=${fa} \
--bambu_strand=${params.bambu_strand} \
--bambu_singleexon=${params.bambu_singleexon} \
--bambu_rec_ndr=${params.bambu_rec_ndr} \
*.bam
sed -i 's/*/./g' extended_annotations.gtf
Expand Down
13 changes: 0 additions & 13 deletions modules/check_ndr.nf

This file was deleted.

1 change: 1 addition & 0 deletions modules/header.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Tfkmers Tokenizer : ${params.tfkmers_tokenizer}
Tfkmers Threshold : ${params.tfkmers_threshold}
Bambu Threshold : ${params.bambu_threshold}
Bambu Single Exons : ${params.bambu_singleexon}
Bambu Recommended NDR : ${params.bambu_rec_ndr}
Filtering operation : ${params.operation}
Stranded : ${params.bambu_strand}
-${c_dim}-------------------------------------${c_reset}-
Expand Down
2 changes: 2 additions & 0 deletions modules/stringtie/stringtie_extract_counts.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ process EXTRACT_QUANTS {
path "counts_gene.txt", emit: gene_counts
path "counts_transcript.txt", emit: tx_counts
path "empty.ndr", emit: ndr
path "rec_ndr.txt", emit: rec_ndr

script:
def mean_length = bam_length.collect { it as double }.sum() / bam_length.size()
Expand All @@ -26,5 +27,6 @@ process EXTRACT_QUANTS {
# Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step (placeholder)
touch empty.ndr
echo "1" > rec_ndr.txt
"""
}
1 change: 1 addition & 0 deletions modules/stringtie/stringtie_workflow.nf
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,5 @@ workflow STRINGTIE {
gene_counts = EXTRACT_QUANTS.out.gene_counts
tx_counts = EXTRACT_QUANTS.out.tx_counts
ndr = EXTRACT_QUANTS.out.ndr
rec_ndr = EXTRACT_QUANTS.out.rec_ndr
}
9 changes: 8 additions & 1 deletion modules/transforkmers/filter.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process FILTER {
file counts_tx
file tfkmers
file bambu_ndr
file rec_ndr

output:
path "novel.filter.gtf", emit: gtf
Expand All @@ -20,13 +21,19 @@ process FILTER {
"""
cp ${counts_tx} counts_transcript.full.txt
if [ "${params.bambu_rec_ndr}" == "true" ]; then
bambu_threshold=\$(cat ${rec_ndr})
else
bambu_threshold=${params.bambu_threshold}
fi
filter_gtf_ndr.py \
--gtf ${novel} \
--counts_tx ${counts_tx} \
--tfkmers ${tfkmers} \
--bambu ${bambu_ndr} \
--tfkmers-threshold ${params.tfkmers_threshold} \
--bambu-threshold ${params.bambu_threshold} \
--bambu-threshold \$bambu_threshold \
--operation ${params.operation} \
--tx_discovery ${params.tx_discovery}
Expand Down
4 changes: 3 additions & 1 deletion modules/transforkmers/workflow.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ workflow TFKMERS {
novel_gtf
ref_fa
bambu_ndr
rec_ndr
tokenizer
model
counts_tx
Expand All @@ -32,7 +33,8 @@ workflow TFKMERS {
novel_gtf,
counts_tx,
PREDICT.out,
bambu_ndr
bambu_ndr,
rec_ndr
)

emit:
Expand Down
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ params {
tfkmers_tokenizer = null
bambu_strand = true
bambu_singleexon = true
bambu_rec_ndr = false
tx_discovery = "bambu"
help = false
}
Expand All @@ -42,7 +43,6 @@ profiles {
withGeneCoverage = true
maxCpu = 2
maxMemory = '8GB'
tx_discovery = 'bambu'
}
}

Expand All @@ -68,5 +68,5 @@ manifest {
description = 'Analysis of Nanopore with Nextflow for EXtended Annotation'
mainScript = 'main.nf'
nextflowVersion = '>=19.10.0'
version = '3.2.2'
version = '3.2.3'
}
6 changes: 6 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,14 @@
"type": "integer",
"description": "bambu NDR threshold below which new transcripts are retained.",
"default": "0.2"
},
"bambu_rec_ndr": {
"type": "boolean",
"description": "Use NDR threshold recommended by Bambu instead of preset threshold.",
"default": "false"
}
}
}
},
"filtering_options": {
"title": "Filtering options",
Expand Down

0 comments on commit ff70fa2

Please sign in to comment.