fmalmeida · fmalmeida · Sep 9, 2022 · Aug 29, 2022 · Sep 2, 2022 · Sep 3, 2022
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,5 @@
 .RData
 .Ruserdata
 TESTE
+testing
 docs/_html
diff --git a/.zenodo.json b/.zenodo.json
@@ -2,7 +2,7 @@
  "description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice an beautiful interactive documents for results exploration.</p>", 
  "license": "other-open", 
  "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", 
- "version": "v3.1.2", 
+ "version": "v3.1.3", 
  "upload_type": "software",
  "creators": [
  {

diff --git a/bin/write_table_from_gff.R b/bin/write_table_from_gff.R
@@ -59,15 +59,15 @@ output_file <- opt$out
 
 if (length(opt$type) && opt$type != "card") {
  ### Create fields - Prokka
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Domain <- getAttributeField(gff$attributes, "protein_motif", ";")
  gff$Additional_DB <- getAttributeField(gff$attributes, "additional_database", ";")
  gff$Additional_product <- getAdditionalProducts(gff$attributes)
 
  ### Give columns a name
- col = c("seqname", "Prokka_ID", "start", "end", "feature", "source", "Additional_DB",
+ col = c("seqname", "Generic_ID", "start", "end", "feature", "source", "Additional_DB",
  "Prokka_product", "Additional_product", "Prokka_inference", "Domain")
 
  ### Write document
@@ -86,15 +86,15 @@ if (length(opt$type) && opt$type != "card") {
  gff$CVTerm <- getAttributeField(gff$attributes, "cvterm", ";")
  gff$Domain <- getAttributeField(gff$attributes, "protein_motif", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Additional_DB <- getAttributeField(gff$attributes, "additional_database", ";")
  gff$Additional_product <- getAdditionalProducts(gff$attributes)
 
  #### Give columns a name
  col = c("seqname", "start", "end", "feature", "source", "ARO_Accession", 
  "Gene_Family", "Name", "Drug_Class", "Resistance_Mechanism", "CVTerm",
- "Domain", "Prokka_product", "Prokka_ID", "Prokka_inference", "Additional_DB", "Additional_product")
+ "Domain", "Prokka_product", "Generic_ID", "Prokka_inference", "Additional_DB", "Additional_product")
 
  ### Write document
  table <- gff[, col]
@@ -103,13 +103,13 @@ if (length(opt$type) && opt$type != "card") {
 } else {
  ### Create non-specific file
  ### Create fields - Prokka
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Annotated_domain <- getAttributeField(gff$attributes, "protein_motif", ";")
 
  ### Give columns a name
- col = c("seqname", "start", "end", "feature", "source", "Prokka_ID", 
+ col = c("seqname", "start", "end", "feature", "source", "Generic_ID", 
  "Prokka_product", "Prokka_inference", "Annotated_domain")
 
 

diff --git a/conf/defaults.config b/conf/defaults.config
@@ -53,6 +53,12 @@ params {
 // By default (if Blank), this process is not executed. For execution the user needs to provide a value
  bedtools_merge_distance = null
 
+ /*
+ * Bakta optional
+ */
+// If user set path to an existing bakta database, the pipeline will use bakta instead of prokka
+ bakta_db = null
+
  /*
  * Prokka optional parameters
  */

diff --git a/conf/docker.config b/conf/docker.config
@@ -3,7 +3,6 @@ singularity.enabled = false
 docker {
  enabled = true
  runOptions = '--platform linux/amd64 -u root:$(id -g)'
- fixOwnership = true
 }
 
 
@@ -52,6 +51,10 @@ process {
  container = 'quay.io/biocontainers/flye:2.9--py39h39abbe0_0'
  }
 
+ withName: BAKTA {
+ container = 'quay.io/biocontainers/bakta:1.5.0--pyhdfd78af_0'
+ }
+
  /*
  * Other (non-image) customization
  */

diff --git a/docker/renv/reports/aro_index.tsv b/docker/renv/reports/aro_index.tsv
diff --git a/docker/renv/reports/report_general.Rmd b/docker/renv/reports/report_general.Rmd
@@ -3,7 +3,8 @@ title: "Generic annotation"
 author: "Produced with bacannot pipeline"
 date: "`r format(Sys.time(), '%d %B %Y')`"
 params:
- prokka:
+ generic_annotation:
+ generic_annotator:
  kegg:
  barrnap:
  mlst:
@@ -26,14 +27,20 @@ suppressMessages(library(magrittr))
 suppressMessages(library(knitr))
 suppressMessages(library(DT))
 suppressMessages(library(ballgown))
+suppressMessages(library(stringr))
+suppressMessages(library(dplyr))
 
 # Line checker
 check_lines <- function(x) {
  return(ifelse(identical(as.integer(nrow(x)), integer(0)), 0, nrow(x)))
 }
 
 # Load data
-prokka_txt <- try(read.delim(params$prokka, header = FALSE, sep = ":"), silent = TRUE)
+if (params$generic_annotator == "bakta") {
+ generic_annotation_txt <- try(read.delim(params$generic_annotation, header = FALSE, sep = ":"), silent = TRUE) %>% filter(!V1=="Sequence(s)") %>% filter(!V1=="Annotation") %>% filter(!V1=="Bakta")
+} else {
+ generic_annotation_txt <- try(read.delim(params$generic_annotation, header = FALSE, sep = ":"), silent = TRUE)
+}
 mlst_txt <- try(read.delim(params$mlst, header = FALSE), silent = TRUE)
 barrnap_gff <- try(gffRead(params$barrnap), silent = TRUE)
 if (class(barrnap_gff) != 'try-error' &
@@ -51,6 +58,15 @@ if (file.exists(params$kegg)) {
  kegg_not_null <- FALSE
 }
 
+if (params$generic_annotator == "prokka") {
+ annotator_url <- "https://github.com/tseemann/prokka"
+ prokka_not_null <- TRUE
+}
+if (params$generic_annotator == "bakta") {
+ annotator_url <- "https://github.com/oschwengers/bakta"
+ prokka_not_null <- FALSE
+}
+
 # DT options
 # Lists
 dt_opt_lst <- list(pageLength = 1,
@@ -70,7 +86,7 @@ dt_opt_lst <- list(pageLength = 1,
 
 ## About
 
-This report was built to summarise in a report the results of the most generic annotation contents, which are: Prokka, Barrnap, mlst, KofamScan and refseq_masher. If you'd like to see any other result included in this report please flag an [enhancement issue on Github](https://github.com/fmalmeida/bacannot/issues).
+This report was built to summarise in a report the results of the most generic annotation contents, which are: `r str_to_title(params$generic_annotator)`, Barrnap, mlst, KofamScan and refseq_masher. If you'd like to see any other result included in this report please flag an [enhancement issue on Github](https://github.com/fmalmeida/bacannot/issues).
 
 ## RefSeq Masher
 
@@ -113,15 +129,17 @@ datatable(mlst_txt,
  )
 ```
 
-## Prokka
+## `r str_to_title(params$generic_annotator)`
 
-[Prokka](https://github.com/tseemann/prokka) is rapid prokaryotic genome annotation tool that quickly produces standards-compliant output files.
+[`r str_to_title(params$generic_annotator)`](`r annotator_url`) is generic prokaryotic genome annotation tool that produces standards-compliant output files.
 
+```{r prokka_msg, echo=FALSE, results='asis', eval=prokka_not_null}
 > In bacannot, the prokka database is incremented with TIGRFAM's hmm [hosted at NCBI](https://ftp.ncbi.nlm.nih.gov/hmm/TIGRFAMs/release_15.0/).
+```
 
-```{r prokka_stats, echo=FALSE}
-df <- data.frame(prokka_txt[,-1])
-rownames(df) <- prokka_txt[,1]
+```{r generic_stats, echo=FALSE}
+df <- data.frame(generic_annotation_txt[,-1])
+rownames(df) <- generic_annotation_txt[,1]
 datatable(t(df),
  escape = FALSE,
  filter = 'top',

diff --git a/docker/renv/reports/report_resistance.Rmd b/docker/renv/reports/report_resistance.Rmd
@@ -8,6 +8,7 @@ params:
  query:
  amrfinder:
  gff:
+ generic_annotator:
  rgitool:
  rgiparsed:
  rgi_heatmap:
@@ -93,16 +94,25 @@ check_lines <- function(x) {
  return(ifelse(identical(as.integer(nrow(x)), integer(0)), 0, nrow(x)))
 }
 
+if (params$generic_annotator == "prokka") {
+ annotator_url <- "https://github.com/tseemann/prokka"
+ prokka_not_null <- TRUE
+}
+if (params$generic_annotator == "bakta") {
+ annotator_url <- "https://github.com/oschwengers/bakta"
+ prokka_not_null <- FALSE
+}
+
 #####################
 ### Loading files ###
 #####################
 
-# Prokka GFF
+# Generic Annotation GFF
 gff <- try(gffRead(params$gff), silent = TRUE)
 ## Adding IDs and coordinates to the GFF
 gff$`Query Protein Coordinates` <- paste(gff$seqname, ":", gff$start, "-", gff$end, sep = "")
-gff$Prokka_ID <- getAttributeField(gff$attributes, "ID", ";")
-gff$`Prokka Product` <- getAttributeField(gff$attributes, "product", ";")
+gff$Generic_ID <- getAttributeField(gff$attributes, "ID", ";")
+gff$`Generic Annotation Product` <- getAttributeField(gff$attributes, "product", ";")
 
 # Amrfinder
 amrtsv <- try(read.delim(params$amrfinder) %>% select(2,3,4,5,6,7,8,9,10,16), silent = TRUE)
@@ -232,18 +242,23 @@ All the predictions were passed through a user defined threshold for minimum cov
 ```{r, argminer_conditional_block_2, echo=FALSE, results='asis', eval=argminer_null, child='no_argminer.Rmd'}
 ```
 
-## Prokka
+## `r str_to_title(params$generic_annotator)`
 
-Additionally, Prokka generically annotates a few proteins that are related to any type of resistance. One must take caution when evaluating this result because this annotation can be very generic and therefore not so meaningful. Because it uses hmms, sometimes the annotation of genes can be based on a single detected motif thus its results must be checked whether they are correctly annotated and/or functional. These are showed in Table \@ref(tab:prokka-general-resistance).
+Additionally, `r str_to_title(params$generic_annotator)` generically annotates a few proteins that are related to any type of resistance. These are showed in Table \@ref(tab:general-resistance).
+
+
+```{r prokka_msg, echo=FALSE, results='asis', eval=prokka_not_null}
+> One must take caution when evaluating this result because this annotation can be very generic and therefore not so meaningful. Because it uses hmms, sometimes the annotation of genes can be based on a single detected motif thus its results must be checked whether they are correctly annotated and/or functional.
+```
 
-<caption>(#tab:prokka-general-resistance) Generic annotation of resistance determinants by Prokka</caption>
-```{r prokka-general-resistance, echo=FALSE}
+<caption>(#tab:general-resistance) Generic annotation of resistance determinants by `r str_to_title(params$generic_annotator)`</caption>
+```{r general-resistance, echo=FALSE}
 gff %>%
  filter(str_detect(attributes, "resistance|Resistance")) %>%
- select("seqname", "start", "end", `Prokka Product`, "attributes") %>%
+ select("seqname", "start", "end", `Generic Annotation Product`, "attributes") %>%
  datatable(escape = FALSE,
  filter = 'top',
- colnames = c("Contig", "Start", "End", "Prokka Product", "Description"),
+ colnames = c("Contig", "Start", "End", "Generic Annotation Product", "Description"),
  options = list(pageLength = 5,
  lengthMenu = c(5, 10, 15, 20, 50),
  dom='flrtBip',

diff --git a/docker/renv/reports/yes_AMRfinder.Rmd b/docker/renv/reports/yes_AMRfinder.Rmd
@@ -23,13 +23,13 @@ cat(paste("\t-", string.list), sep = '\n')
 <caption>(#tab:ncbi-amr-resistance-genes) Resistance genes annotated from NCBI AMR curated database using AMRfinderplus</caption>
 ```{r ncbi-amr-resistance-genes}
 # Merge tables
-tab2 <- merge.data.frame(gff, tab, by.x = "Prokka_ID", by.y = "Protein.identifier", all.y = TRUE, all.x = FALSE)
+tab2 <- merge.data.frame(gff, tab, by.x = "Generic_ID", by.y = "Protein.identifier", all.y = TRUE, all.x = FALSE)
 tab2$Accession.of.closest.sequence <- amrfinder_url(tab2$Accession.of.closest.sequence)
 
 # Produce Table
 tab2 %>%
- select(Element.type, Element.subtype, Prokka_ID, Gene.symbol, Sequence.name, Class, Subclass, Method, Accession.of.closest.sequence, `Query Protein Coordinates`) %>%
- dplyr::arrange(Element.type, Prokka_ID) %>%
+ select(Element.type, Element.subtype, Generic_ID, Gene.symbol, Sequence.name, Class, Subclass, Method, Accession.of.closest.sequence, `Query Protein Coordinates`) %>%
+ dplyr::arrange(Element.type, Generic_ID) %>%
  datatable(escape = FALSE,
  filter = 'top',
  colnames = c("Resistance type", "Resistance subtype", "Query protein", "Gene", "Product", "Resistance Class", "Resistance subclass", "Detection method", "Ref. Accession", "Genomic coordinates"),
@@ -48,7 +48,7 @@ tab2 %>%
 ```{r ncbi-resistome-png, fig.align='center', fig.show='hold', fig.cap="Resistome Predicted using NCBI's AMRFinderplus", out.width="35%"}
 ## Grab AMR
 amr <- tab2 %>%
- select(Prokka_ID, Subclass)
+ select(Generic_ID, Subclass)
 amr <- plyr::count(amr, "Subclass")
 
 ## Plot

diff --git a/docker/renv/reports/yes_RGI.Rmd b/docker/renv/reports/yes_RGI.Rmd
@@ -14,7 +14,7 @@ The results obtained with RGI tool are summarized in the heatmap produced by the
 # Merge data.frame with GFF
 rgi_tsv <- rgi_parsed
 colnames(rgi_tsv) <- c("Protein ID", "Cut_Off", "Product", "ARO", "Drug Class", "Resistance Mechanism", "AMR Gene Family")
-rgi_tsv <- merge.data.frame(gff, rgi_tsv, by.x = "Prokka_ID", by.y = "Protein ID", all.y = TRUE, all.x = FALSE)
+rgi_tsv <- merge.data.frame(gff, rgi_tsv, by.x = "Generic_ID", by.y = "Protein ID", all.y = TRUE, all.x = FALSE)
 
 # Get CARD metadata
 aro_index <- read.csv("aro_index.tsv", sep = "\t")
@@ -33,8 +33,8 @@ rgi_tsv$CVTERM.ID <- card_url(rgi_tsv$CVTERM.ID)
 
 # Produce Table
 rgi_tsv %>%
- select(Prokka_ID, Product, `AMR Gene Family`, `Drug Class`, `Resistance Mechanism`, CVTERM.ID, `Query Protein Coordinates`, `Cut_Off`) %>%
- arrange(8,1) %>%
+ select(Generic_ID, Product, `AMR Gene Family`, `Drug Class`, `Resistance Mechanism`, CVTERM.ID, `Query Protein Coordinates`, `Cut_Off`) %>%
+ arrange(`Query Protein Coordinates`, `Generic_ID`) %>%
  datatable(escape = FALSE,
  filter = 'top',
  options = list(pageLength = 5,

diff --git a/docker/renv/reports/yes_ncbi.Rmd b/docker/renv/reports/yes_ncbi.Rmd
@@ -7,15 +7,15 @@ All resistance genes from this dataset and its protein ID (local protein ID) are
 ncbi_amr$`Query Protein Coordinates` <- paste(ncbi_amr$seqname, ":", ncbi_amr$start, "-", ncbi_amr$end)
 
 # To upper
-ncbi_amr$Prokka_ID <- toupper(ncbi_amr$Prokka_ID)
+ncbi_amr$Generic_ID <- toupper(ncbi_amr$Generic_ID)
 
 # Parse
 info <- read.csv(url("https://ftp.ncbi.nlm.nih.gov/hmm/NCBIfam-AMR/latest/NCBIfam-AMR.tsv"), sep = "\t", header = FALSE)
 info$V1 <- tolower(info$V1)
-genes <- ncbi_amr %>% select(Prokka_ID, `Query Protein Coordinates`, Prokka_product, Prokka_inference)
+genes <- ncbi_amr %>% select(Generic_ID, `Query Protein Coordinates`, Prokka_product, Prokka_inference)
 genes <- separate(genes, Prokka_inference, c("method", "prog", "ver", "motif", "db_id"), sep = ":", extra = "merge")
 genes <- merge.data.frame(genes, info, by.x = "db_id", by.y = "V1")
-genes <- genes %>% select("Prokka_ID", "Prokka_product", "db_id", "V2", "V5", "V9", "Query Protein Coordinates")
+genes <- genes %>% select("Generic_ID", "Prokka_product", "db_id", "V2", "V5", "V9", "Query Protein Coordinates")
 
 # Create List
 string.list <- genes %>% pull("V9") %>% unique()
@@ -39,7 +39,7 @@ kable(caption = "Resistance genes annotated from NCBI-amr curated database", esc
 
 ```{r ncbi-resistome-png, fig.align='center', fig.show='hold', fig.cap="Resistance Gene Families annotated using NCBI-amr", out.width="60%"}
 ncbi_count <- genes %>%
- select("V2", "Prokka_ID") %>% group_by(V2)
+ select("V2", "Generic_ID") %>% group_by(V2)
 drugs <- plyr::count(ncbi_count, "V2")
 ggplot(drugs, aes(x=V2, y=freq, fill=V2)) +
  geom_bar(stat = "identity") +

diff --git a/docker/renv/scripts/rscripts/write_table_from_gff.R b/docker/renv/scripts/rscripts/write_table_from_gff.R
@@ -59,15 +59,15 @@ output_file <- opt$out
 
 if (length(opt$type) && opt$type != "card") {
  ### Create fields - Prokka
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Domain <- getAttributeField(gff$attributes, "protein_motif", ";")
  gff$Additional_DB <- getAttributeField(gff$attributes, "additional_database", ";")
  gff$Additional_product <- getAdditionalProducts(gff$attributes)
 
  ### Give columns a name
- col = c("seqname", "Prokka_ID", "start", "end", "feature", "source", "Additional_DB",
+ col = c("seqname", "Generic_ID", "start", "end", "feature", "source", "Additional_DB",
  "Prokka_product", "Additional_product", "Prokka_inference", "Domain")
 
  ### Write document
@@ -86,15 +86,15 @@ if (length(opt$type) && opt$type != "card") {
  gff$CVTerm <- getAttributeField(gff$attributes, "cvterm", ";")
  gff$Domain <- getAttributeField(gff$attributes, "protein_motif", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Additional_DB <- getAttributeField(gff$attributes, "additional_database", ";")
  gff$Additional_product <- getAdditionalProducts(gff$attributes)
 
  #### Give columns a name
  col = c("seqname", "start", "end", "feature", "source", "ARO_Accession", 
  "Gene_Family", "Name", "Drug_Class", "Resistance_Mechanism", "CVTerm",
- "Domain", "Prokka_product", "Prokka_ID", "Prokka_inference", "Additional_DB", "Additional_product")
+ "Domain", "Prokka_product", "Generic_ID", "Prokka_inference", "Additional_DB", "Additional_product")
 
  ### Write document
  table <- gff[, col]
@@ -103,13 +103,13 @@ if (length(opt$type) && opt$type != "card") {
 } else {
  ### Create non-specific file
  ### Create fields - Prokka
- gff$Prokka_ID <- getAttributeField(gff$attributes, "id", ";")
+ gff$Generic_ID <- getAttributeField(gff$attributes, "id", ";")
  gff$Prokka_product <- getAttributeField(gff$attributes, "product", ";")
  gff$Prokka_inference <- getAttributeField(gff$attributes, "inference", ";")
  gff$Annotated_domain <- getAttributeField(gff$attributes, "protein_motif", ";")
 
  ### Give columns a name
- col = c("seqname", "start", "end", "feature", "source", "Prokka_ID", 
+ col = c("seqname", "start", "end", "feature", "source", "Generic_ID", 
  "Prokka_product", "Prokka_inference", "Annotated_domain")
 
 

diff --git a/docs/manual.md b/docs/manual.md
@@ -63,6 +63,18 @@ The pipeline accepts as input two other input files types that are used to perfo
 
  In order to increase the accuracy of prokka annotation, this pipeline includes an additional HMM database to prokka's defaults. It can be either TIGRFAM (smaller but curated) or PGAP (bigger comprehensive NCBI database that contains TIGRFAM).
 
+## Bakta annotation
+
+!!! info "Using Bakta"
+
+ If desired, users can use [`bakta`](https://github.com/oschwengers/bakta) instead of `prokka` to perform the core generic annotation of their prokaryotic genomes. For that, users must simply [download and store bakta database](https://github.com/oschwengers/bakta#database-download) in their machine, and pass its path to `bacannot` with the `--bakta_db` parameter.
+
+ We opted for having it like this because bakta database is quite big.
+
+| <div style="width:160px">Parameter</div> | Required | Default | Description |
+| :--------------------------------------- | :------- | :------ | :---------- |
+| `--bakta_db` | :material-close: | NA | Path to bakta database. If given, bacannot will use bakta instead of prokka. |
+
 ## Resfinder annotation
 
 The use of this parameter sets a default value for input samples. If a sample has a different value given inside the samplesheet, the pipeline will use, for that sample, the value found inside the [samplesheet](samplesheet.md#).

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -82,3 +82,7 @@ nextflow run fmalmeida/bacannot -profile docker,quicktest --bacannot_db ./bacann
 !!! info ""
 
  Unfortunately, due to file sizes, we could not provide fast5 files for users to check on the methylation step.
+
+### Annotation with bakta
+
+User can also perform the core generic annotation with bakta instead of prokka. Please read [the manual](manual#bakta-annotation).