Merge pull request #20 from bcbio/fix_QC_rnaseq_318

lpantano · web-flow · commit 1e4ffd12a577 · 2025-05-21T16:13:33.000-04:00
update QC for rnaseq v3.18
diff --git a/00_libs/load_data.R b/00_libs/load_data.R
@@ -6,109 +6,133 @@ load_metrics <- function(se = se_object, multiqc = multiqc_data_dir,
                          gtf = gtf_fn,
                          counts = counts,
                          single_end = FALSE) {
-  # bcbio input
-  if (!is.na(se_object)) {
-    se <- readRDS(se_object)
-    metrics <- metadata(se)$metrics %>% as.data.frame()
-    # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample')
-  } else { # nf-core input
+  # Get metrics from nf-core into bcbio like table
+  # many metrics are already in the General Table of MultiQC, this reads the file
+  metrics <- read_tsv(file.path(multiqc_data_dir, "multiqc_general_stats.txt"))
 
-    # Get metrics from nf-core into bcbio like table
-    # many metrics are already in the Genereal Table of MultiQC, this reads the file
-    metrics <- read_tsv(file.path(multiqc_data_dir, "multiqc_general_stats.txt"))
+  # we use the names in the multiqc general stats file to determine which version of the pipeline was used.
+  # this affects other metrics processing throughout this function.
+  if (any(grepl("mqc-generalstats", names(metrics)))) {
+    version <- "3.14"
+  } else {
+    version <- "3.18"
+  }
 
-    # we get some more metrics from Qualimap and rename columns
+  # we get some more metrics from Qualimap and rename columns
+  if (version == 3.14) {
     metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, "mqc_qualimap_genomic_origin_1.txt"))
-    metrics <- metrics %>% full_join(metrics_qualimap)
+  } else {
+    metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, "qualimap_genomic_origin.txt"))
+  }
+
+  metrics <- metrics %>% full_join(metrics_qualimap)
+  metrics <- metrics %>%
+    clean_names()
+
+  if (version == "3.14") {
+    metrics <- metrics %>% dplyr::rename_with(~ gsub(".*mqc_generalstats_", "", .))
+  }
+
+  # This uses the fastqc metrics to get total reads
+  total_reads <- metrics %>%
+    dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>%
+    remove_empty(which = "cols") %>%
+    dplyr::rename(single_sample = sample) %>%
+    mutate(sample = gsub("_[12]+$", "", single_sample)) %>%
+    group_by(sample) %>%
+    summarize(total_reads = sum(fastqc_raw_total_sequences))
+
+  # This renames to user-friendly names the metrics columns
+  if (single_end) {
     metrics <- metrics %>%
-      clean_names() %>%
-      dplyr::rename_with(~ gsub(".*mqc_generalstats_", "", .))
-
-    # This uses the fastqc metrics to get total reads
-    total_reads <- metrics %>%
-      dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>%
-      remove_empty(which = "cols") %>%
-      dplyr::rename(single_sample = sample) %>%
-      mutate(sample = gsub("_[12]+$", "", single_sample)) %>%
-      group_by(sample) %>%
-      summarize(total_reads = sum(fastqc_raw_total_sequences))
-
-    # This renames to user-friendly names the metrics columns
-    if (single_end) {
-      metrics <- metrics %>%
-        dplyr::filter(!is.na(fastqc_raw_total_sequences))
-    } else {
-      metrics <- metrics %>%
-        dplyr::filter(is.na(fastqc_raw_total_sequences))
-    }
+      dplyr::filter(!is.na(fastqc_raw_total_sequences))
+  } else {
     metrics <- metrics %>%
-      remove_empty(which = "cols") %>%
-      full_join(total_reads) %>%
-      mutate(mapped_reads = samtools_reads_mapped) %>%
-      rowwise() %>%
-      mutate(exonic_rate = exonic / (exonic + intronic + intergenic)) %>%
-      mutate(intronic_rate = intronic / (exonic + intronic + intergenic)) %>%
-      mutate(intergenic_rate = intergenic / (exonic + intronic + intergenic)) %>%
-      mutate(x5_3_bias = qualimap_5_3_bias)
-
-    # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case
-    gtf <- NULL
-    biotype <- NULL
-
-    if (genome == "other") {
-      gtf <- gtf_fn
-    } else {
-      if (genome == "hg38") {
-        gtf <- "hg38.rna.gtf.gz"
-      } else if (genome == "mm10") {
-        gtf <- "mm10.rna.gtf.gz"
-      } else if (genome == "mm39") {
-        gtf <- "mm39.rna.gtf.gz"
-      }
-      gtf <- file.path("https://github.com/bcbio/bcbioR/raw/refs/heads/main/inst/extdata/annotation", gtf)
-    }
-    if (is.null(gtf)) {
-      warning("No genome provided! Please add it at the top of this Rmd")
-    } else {
-      gtf <- rtracklayer::import(gtf)
-
-      one <- grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
-      another <- grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
-      if (length(one) == 1) {
-        biotype <- one
-      } else if (length(another) == 1) {
-        biotype <- another
-      } else {
-        warning("No gene biotype founded")
-      }
+      dplyr::filter(is.na(fastqc_raw_total_sequences))
+  }
+
+  metrics <- metrics %>%
+    remove_empty(which = "cols") %>%
+    full_join(total_reads)
+
+  if (version == "3.14") {
+    metrics <- metrics %>% mutate(mapped_reads = samtools_reads_mapped)
+  } else {
+    metrics <- metrics %>% mutate(mapped_reads = samtools_stats_reads_mapped)
+  }
+
+  metrics <- metrics %>%
+    rowwise() %>%
+    mutate(exonic_rate = exonic / (exonic + intronic + intergenic)) %>%
+    mutate(intronic_rate = intronic / (exonic + intronic + intergenic)) %>%
+    mutate(intergenic_rate = intergenic / (exonic + intronic + intergenic))
+
+  if (version == "3.14") {
+    metrics <- metrics %>% mutate(x5_3_bias = qualimap_5_3_bias)
+  } else {
+    metrics <- metrics %>% mutate(x5_3_bias = qualimap_rnaseq_5_3_bias)
+  }
+
+  # Sometimes rRNA is missing because of an annotation mismatch; we skip this step if that is the case
+  gtf <- NULL
+  biotype <- NULL
+
+  if (genome == "other") {
+    gtf <- gtf_fn
+  } else {
+    if (genome == "hg38") {
+      gtf <- "hg38.rna.gtf.gz"
+    } else if (genome == "mm10") {
+      gtf <- "mm10.rna.gtf.gz"
+    } else if (genome == "mm39") {
+      gtf <- "mm39.rna.gtf.gz"
     }
+    gtf <- file.path("https://github.com/bcbio/bcbioR/raw/refs/heads/main/inst/extdata/annotation", gtf)
+  }
+  if (is.null(gtf)) {
+    warning("No genome provided! Please add it at the top of this Rmd")
+  } else {
+    gtf <- rtracklayer::import(gtf)
 
-    metrics$sample <- make.names(metrics$sample)
-    if (!is.null(biotype)) {
-      annotation <- as.data.frame(gtf) %>% .[, c("gene_id", biotype)]
-      annotation$gene_id <- stringr::str_remove(annotation$gene_id, "\\..*$") # remove .1 from end of gene
-      rRNA <- grepl("rRNA|tRNA", annotation[[biotype]])
-      genes <- intersect(annotation[rRNA, "gene_id"], row.names(counts))
-      ratio <- data.frame(
-        sample = colnames(counts),
-        r_and_t_rna_rate = colSums(counts[genes, ]) / colSums(counts)
-      )
+    one <- grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
+    another <- grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
+    if (length(one) == 1) {
+      biotype <- one
+    } else if (length(another) == 1) {
+      biotype <- another
+      metrics$sample <- make.names(metrics$sample)
       metrics <- left_join(metrics, ratio, by = "sample")
     } else {
-      metrics[["r_and_t_rna_rate"]] <- NA
+      warning("No gene biotype founded")
     }
+  }
 
-    # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){
-    #   metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna)
-    # }else{
-    #  metrics[["r_rna_rate"]] <- NA
-    # }
-    metrics <- metrics[, c(
-      "sample", "mapped_reads", "exonic_rate", "intronic_rate",
-      "total_reads",
-      "x5_3_bias", "r_and_t_rna_rate", "intergenic_rate"
-    )]
+  metrics$sample <- make.names(metrics$sample)
+  if (!is.null(biotype)) {
+    annotation <- as.data.frame(gtf) %>% .[, c("gene_id", biotype)]
+    annotation$gene_id <- stringr::str_remove(annotation$gene_id, "\\..*$") # remove .1 from end of gene
+    rRNA <- grepl("rRNA|tRNA", annotation[[biotype]])
+    genes <- intersect(annotation[rRNA, "gene_id"], row.names(counts))
+    ratio <- data.frame(
+      sample = colnames(counts),
+      r_and_t_rna_rate = colSums(counts[genes, ]) / colSums(counts)
+    )
+    metrics <- left_join(metrics, ratio, by = "sample")
+  } else {
+    metrics[["r_and_t_rna_rate"]] <- NA
   }
+
+  # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){
+  #   metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna)
+  # }else{
+  #  metrics[["r_rna_rate"]] <- NA
+  # }
+  metrics <- metrics[, c(
+    "sample", "mapped_reads", "exonic_rate", "intronic_rate",
+    "total_reads",
+    "x5_3_bias", "r_and_t_rna_rate", "intergenic_rate"
+  )]
+
   rownames(metrics) <- metrics$sample
   return(metrics)
 }
diff --git a/00_params/params-example.R b/00_params/params-example.R
@@ -3,10 +3,10 @@ date <- "YYYYMMDD"
 basedir <- "./" # where to write down output files
 
 # Example data
-coldata_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv"
-counts_fn <- url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv")
+coldata_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/coldata.csv"
+counts_fn <- url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/star_salmon/salmon.merged.gene_counts.tsv")
 # This folder is in the output directory inside multiqc folder
-multiqc_data_dir <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/"
+multiqc_data_dir <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/multiqc/star_salmon/multiqc_report_data/"
 # This file is inside the genome folder in the output directory
-gtf_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz"
+gtf_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/refs/heads/main/nf-core/v3_18/genome/genome.filtered.gtf.gz"
 se_object <- NA
diff --git a/01_quality_assessment/QC.qmd b/01_quality_assessment/QC.qmd
@@ -182,7 +182,7 @@ Here, we want to see consistency and a minimum of 20 million reads (the grey lin
 metrics %>%
   ggplot(aes(
     x = factor(sample, level = order),
-    y = total_reads / 1000000,
+    y = total_reads, # if total reads are not already in millions, divide by 1000000 here
     fill = .data[[factor_of_interest]]
   )) +
   geom_bar(stat = "identity") +