Skip to content

Commit 1e4ffd1

Browse files
authored
Merge pull request #20 from bcbio/fix_QC_rnaseq_318
update QC for rnaseq v3.18
2 parents 6ef6278 + e440e04 commit 1e4ffd1

3 files changed

Lines changed: 120 additions & 96 deletions

File tree

00_libs/load_data.R

Lines changed: 115 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -6,109 +6,133 @@ load_metrics <- function(se = se_object, multiqc = multiqc_data_dir,
66
gtf = gtf_fn,
77
counts = counts,
88
single_end = FALSE) {
9-
# bcbio input
10-
if (!is.na(se_object)) {
11-
se <- readRDS(se_object)
12-
metrics <- metadata(se)$metrics %>% as.data.frame()
13-
# left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample')
14-
} else { # nf-core input
9+
# Get metrics from nf-core into bcbio like table
10+
# many metrics are already in the General Table of MultiQC, this reads the file
11+
metrics <- read_tsv(file.path(multiqc_data_dir, "multiqc_general_stats.txt"))
1512

16-
# Get metrics from nf-core into bcbio like table
17-
# many metrics are already in the Genereal Table of MultiQC, this reads the file
18-
metrics <- read_tsv(file.path(multiqc_data_dir, "multiqc_general_stats.txt"))
13+
# we use the names in the multiqc general stats file to determine which version of the pipeline was used.
14+
# this affects other metrics processing throughout this function.
15+
if (any(grepl("mqc-generalstats", names(metrics)))) {
16+
version <- "3.14"
17+
} else {
18+
version <- "3.18"
19+
}
1920

20-
# we get some more metrics from Qualimap and rename columns
21+
# we get some more metrics from Qualimap and rename columns
22+
if (version == 3.14) {
2123
metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, "mqc_qualimap_genomic_origin_1.txt"))
22-
metrics <- metrics %>% full_join(metrics_qualimap)
24+
} else {
25+
metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, "qualimap_genomic_origin.txt"))
26+
}
27+
28+
metrics <- metrics %>% full_join(metrics_qualimap)
29+
metrics <- metrics %>%
30+
clean_names()
31+
32+
if (version == "3.14") {
33+
metrics <- metrics %>% dplyr::rename_with(~ gsub(".*mqc_generalstats_", "", .))
34+
}
35+
36+
# This uses the fastqc metrics to get total reads
37+
total_reads <- metrics %>%
38+
dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>%
39+
remove_empty(which = "cols") %>%
40+
dplyr::rename(single_sample = sample) %>%
41+
mutate(sample = gsub("_[12]+$", "", single_sample)) %>%
42+
group_by(sample) %>%
43+
summarize(total_reads = sum(fastqc_raw_total_sequences))
44+
45+
# This renames to user-friendly names the metrics columns
46+
if (single_end) {
2347
metrics <- metrics %>%
24-
clean_names() %>%
25-
dplyr::rename_with(~ gsub(".*mqc_generalstats_", "", .))
26-
27-
# This uses the fastqc metrics to get total reads
28-
total_reads <- metrics %>%
29-
dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>%
30-
remove_empty(which = "cols") %>%
31-
dplyr::rename(single_sample = sample) %>%
32-
mutate(sample = gsub("_[12]+$", "", single_sample)) %>%
33-
group_by(sample) %>%
34-
summarize(total_reads = sum(fastqc_raw_total_sequences))
35-
36-
# This renames to user-friendly names the metrics columns
37-
if (single_end) {
38-
metrics <- metrics %>%
39-
dplyr::filter(!is.na(fastqc_raw_total_sequences))
40-
} else {
41-
metrics <- metrics %>%
42-
dplyr::filter(is.na(fastqc_raw_total_sequences))
43-
}
48+
dplyr::filter(!is.na(fastqc_raw_total_sequences))
49+
} else {
4450
metrics <- metrics %>%
45-
remove_empty(which = "cols") %>%
46-
full_join(total_reads) %>%
47-
mutate(mapped_reads = samtools_reads_mapped) %>%
48-
rowwise() %>%
49-
mutate(exonic_rate = exonic / (exonic + intronic + intergenic)) %>%
50-
mutate(intronic_rate = intronic / (exonic + intronic + intergenic)) %>%
51-
mutate(intergenic_rate = intergenic / (exonic + intronic + intergenic)) %>%
52-
mutate(x5_3_bias = qualimap_5_3_bias)
53-
54-
# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case
55-
gtf <- NULL
56-
biotype <- NULL
57-
58-
if (genome == "other") {
59-
gtf <- gtf_fn
60-
} else {
61-
if (genome == "hg38") {
62-
gtf <- "hg38.rna.gtf.gz"
63-
} else if (genome == "mm10") {
64-
gtf <- "mm10.rna.gtf.gz"
65-
} else if (genome == "mm39") {
66-
gtf <- "mm39.rna.gtf.gz"
67-
}
68-
gtf <- file.path("https://github.com/bcbio/bcbioR/raw/refs/heads/main/inst/extdata/annotation", gtf)
69-
}
70-
if (is.null(gtf)) {
71-
warning("No genome provided! Please add it at the top of this Rmd")
72-
} else {
73-
gtf <- rtracklayer::import(gtf)
74-
75-
one <- grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
76-
another <- grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
77-
if (length(one) == 1) {
78-
biotype <- one
79-
} else if (length(another) == 1) {
80-
biotype <- another
81-
} else {
82-
warning("No gene biotype founded")
83-
}
51+
dplyr::filter(is.na(fastqc_raw_total_sequences))
52+
}
53+
54+
metrics <- metrics %>%
55+
remove_empty(which = "cols") %>%
56+
full_join(total_reads)
57+
58+
if (version == "3.14") {
59+
metrics <- metrics %>% mutate(mapped_reads = samtools_reads_mapped)
60+
} else {
61+
metrics <- metrics %>% mutate(mapped_reads = samtools_stats_reads_mapped)
62+
}
63+
64+
metrics <- metrics %>%
65+
rowwise() %>%
66+
mutate(exonic_rate = exonic / (exonic + intronic + intergenic)) %>%
67+
mutate(intronic_rate = intronic / (exonic + intronic + intergenic)) %>%
68+
mutate(intergenic_rate = intergenic / (exonic + intronic + intergenic))
69+
70+
if (version == "3.14") {
71+
metrics <- metrics %>% mutate(x5_3_bias = qualimap_5_3_bias)
72+
} else {
73+
metrics <- metrics %>% mutate(x5_3_bias = qualimap_rnaseq_5_3_bias)
74+
}
75+
76+
# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case
77+
gtf <- NULL
78+
biotype <- NULL
79+
80+
if (genome == "other") {
81+
gtf <- gtf_fn
82+
} else {
83+
if (genome == "hg38") {
84+
gtf <- "hg38.rna.gtf.gz"
85+
} else if (genome == "mm10") {
86+
gtf <- "mm10.rna.gtf.gz"
87+
} else if (genome == "mm39") {
88+
gtf <- "mm39.rna.gtf.gz"
8489
}
90+
gtf <- file.path("https://github.com/bcbio/bcbioR/raw/refs/heads/main/inst/extdata/annotation", gtf)
91+
}
92+
if (is.null(gtf)) {
93+
warning("No genome provided! Please add it at the top of this Rmd")
94+
} else {
95+
gtf <- rtracklayer::import(gtf)
8596

86-
metrics$sample <- make.names(metrics$sample)
87-
if (!is.null(biotype)) {
88-
annotation <- as.data.frame(gtf) %>% .[, c("gene_id", biotype)]
89-
annotation$gene_id <- stringr::str_remove(annotation$gene_id, "\\..*$") # remove .1 from end of gene
90-
rRNA <- grepl("rRNA|tRNA", annotation[[biotype]])
91-
genes <- intersect(annotation[rRNA, "gene_id"], row.names(counts))
92-
ratio <- data.frame(
93-
sample = colnames(counts),
94-
r_and_t_rna_rate = colSums(counts[genes, ]) / colSums(counts)
95-
)
97+
one <- grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE)
98+
another <- grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE)
99+
if (length(one) == 1) {
100+
biotype <- one
101+
} else if (length(another) == 1) {
102+
biotype <- another
103+
metrics$sample <- make.names(metrics$sample)
96104
metrics <- left_join(metrics, ratio, by = "sample")
97105
} else {
98-
metrics[["r_and_t_rna_rate"]] <- NA
106+
warning("No gene biotype founded")
99107
}
108+
}
100109

101-
# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){
102-
# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna)
103-
# }else{
104-
# metrics[["r_rna_rate"]] <- NA
105-
# }
106-
metrics <- metrics[, c(
107-
"sample", "mapped_reads", "exonic_rate", "intronic_rate",
108-
"total_reads",
109-
"x5_3_bias", "r_and_t_rna_rate", "intergenic_rate"
110-
)]
110+
metrics$sample <- make.names(metrics$sample)
111+
if (!is.null(biotype)) {
112+
annotation <- as.data.frame(gtf) %>% .[, c("gene_id", biotype)]
113+
annotation$gene_id <- stringr::str_remove(annotation$gene_id, "\\..*$") # remove .1 from end of gene
114+
rRNA <- grepl("rRNA|tRNA", annotation[[biotype]])
115+
genes <- intersect(annotation[rRNA, "gene_id"], row.names(counts))
116+
ratio <- data.frame(
117+
sample = colnames(counts),
118+
r_and_t_rna_rate = colSums(counts[genes, ]) / colSums(counts)
119+
)
120+
metrics <- left_join(metrics, ratio, by = "sample")
121+
} else {
122+
metrics[["r_and_t_rna_rate"]] <- NA
111123
}
124+
125+
# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){
126+
# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna)
127+
# }else{
128+
# metrics[["r_rna_rate"]] <- NA
129+
# }
130+
metrics <- metrics[, c(
131+
"sample", "mapped_reads", "exonic_rate", "intronic_rate",
132+
"total_reads",
133+
"x5_3_bias", "r_and_t_rna_rate", "intergenic_rate"
134+
)]
135+
112136
rownames(metrics) <- metrics$sample
113137
return(metrics)
114138
}

00_params/params-example.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ date <- "YYYYMMDD"
33
basedir <- "./" # where to write down output files
44

55
# Example data
6-
coldata_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv"
7-
counts_fn <- url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv")
6+
coldata_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/coldata.csv"
7+
counts_fn <- url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/star_salmon/salmon.merged.gene_counts.tsv")
88
# This folder is in the output directory inside multiqc folder
9-
multiqc_data_dir <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/"
9+
multiqc_data_dir <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/v3_18/multiqc/star_salmon/multiqc_report_data/"
1010
# This file is inside the genome folder in the output directory
11-
gtf_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz"
11+
gtf_fn <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/refs/heads/main/nf-core/v3_18/genome/genome.filtered.gtf.gz"
1212
se_object <- NA

01_quality_assessment/QC.qmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ Here, we want to see consistency and a minimum of 20 million reads (the grey lin
182182
metrics %>%
183183
ggplot(aes(
184184
x = factor(sample, level = order),
185-
y = total_reads / 1000000,
185+
y = total_reads, # if total reads are not already in millions, divide by 10000000 here
186186
fill = .data[[factor_of_interest]]
187187
)) +
188188
geom_bar(stat = "identity") +

0 commit comments

Comments
 (0)