Merge pull request #28 from yerkes-gencore/dev

Dev
yerkes-gencore · Apr 15, 2024 · 5c6cb49 · 5c6cb49
2 parents 5f9fa98 + f3b6146
commit 5c6cb49
Show file tree

Hide file tree

Showing 18 changed files with 524 additions and 138 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: gencoreBulk
 Title: NPRC Genomics Core Bulk RNA Analysis Utilities
-Version: 0.2.1
+Version: 0.2.2
 Date: 2024-03-22
 Author: person("Derrik", "Gratz", email = "[email protected]", role =
     c("aut", "cre"), comment = c(ORCID = "0009-0002-5934-576X"))
@@ -9,6 +9,7 @@ Description: A shared codebase for custom R functions we find useful for
     bulk RNA analysis.
 License: MIT + file LICENSE
 URL: https://github.com/yerkes-gencore/gencoreBulk
+biocViews:
 Imports:
     Biobase,
     circlize,
@@ -35,7 +36,9 @@ Imports:
     SummarizedExperiment,
     tibble,
     tidyr
-biocViews: ComplexHeatmap, EnhancedVolcano, DESeq2, SummarizedExperiment, fgsea
 Encoding: UTF-8
 Roxygen: list(markdown=TRUE)
 RoxygenNote: 7.2.3
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,7 +14,10 @@ export(gseaDotplot_joint)
 export(gseaDotplot_single)
 export(heatmapFromGenelist)
 export(mappingBinsPlot)
+export(normalizeCountsForHeatmap)
+export(normalizeCountsForHeatmapByIndividual)
 export(parseReadPerGeneFiles)
+export(pcaPlotSimple)
 export(plotFilterByExpr)
 export(plotGeneExpression)
 export(plotPCAFromConfig)
@@ -69,6 +72,7 @@ importFrom(readr,write_lines)
 importFrom(readr,write_tsv)
 importFrom(rlang,.data)
 importFrom(stats,model.matrix)
+importFrom(stats,na.omit)
 importFrom(stats,prcomp)
 importFrom(stringr,str_count)
 importFrom(stringr,str_remove)

diff --git a/R/PCA.R b/R/PCA.R
@@ -20,8 +20,6 @@
 #'
 #' @import ggplot2
 #' @import ggrepel
-#' @importFrom stats prcomp
-#' @importFrom matrixStats rowVars
 #' @importFrom SummarizedExperiment assays assay
 #' @importFrom rlang .data
 #'
@@ -32,9 +30,8 @@ plotPCAFromConfig <- function(analysis,
                               size = 5,
                               pc.1 = 1,
                               pc.2 = 2) {
-  pca_data <- 
-  .pcaPlotGKT(assays(analysis$dds)$vst,
-    intgroup = names(colData(assays(analysis$dds)$vst)),
+  pcaPlotSimple(counts = assay(assays(analysis$dds)$vst),
+    metadata = colData(analysis$dds),
     xpc = pc.1, ypc = pc.2
   ) +
     geom_point(
@@ -85,36 +82,54 @@ plotPCAFromConfig <- function(analysis,
     theme(text = element_text(size = 10)) # , arrow=arrow(ends="last", type="closed", length=unit(0.1, "inches")))
 }
 
-.pcaDataGKT <- function(object, intgroup = "condition", ntop = 500) {
-  rv <- matrixStats::rowVars(assay(object))
+
+#' Simple base for PCA plot
+#' 
+#' Calculates PCA on counts data. Returns a ggplot object with the PCA data
+#' and associated metadata. No additional geoms are included, this is meant
+#' to be a blank template. You do need to provide metadata here so it can be 
+#' accessed with geoms you add to this output. 
+#'
+#' @param counts Expression data matrix. Colnames of counts should match rownames
+#'  of `metadata`
+#' @param metadata Dataframe of metadata to associate with counts data. Rownames
+#'  of metadata should match colnames of `counts`. 
+#' @param xpc Numeric, which PC to plot on the x axis. Default 1
+#' @param ypc Numeric, which PC to plot on the y axis. Default 2
+#' @param ntop Number of genes to include in PCA. Default 500
+#'
+#' @returns A ggplot object
+#' @export
+#' 
+#' @import ggplot2
+#' @importFrom stats prcomp
+#' @importFrom matrixStats rowVars
+#' @importFrom rlang .data
+#' 
+#'
+#' @examples 
+#' \dontrun{
+#' pcaPlot <- pcaPlotSimple(assay(assays(pbmc_subset)$vst), metadata = colData(pbmc_subset))
+#' 
+#' ## Manually adding geoms
+#' pcaPlot + geom_point(aes(color = Individual, shape=Timepoint)) 
+#' 
+#' }
+#' 
+pcaPlotSimple <- function(counts, metadata, xpc = 1, ypc = 2, ntop = 500) {
+  if (colnames(counts) != rownames(metadata)) {
+    stop('Colnames of counts does not match rownames of metadata')
+  }
+  rv <- matrixStats::rowVars(counts)
   select <- order(rv, decreasing = TRUE)[seq_len(min(
     ntop,
     length(rv)
   ))]
-  pca <- stats::prcomp(t(assay(object)[select, ]))
+  pca <- stats::prcomp(t(counts[select,]))
+  d <- merge(pca$x, metadata, by.x = 'row.names', by.y = 'row.names')
   percentVar <- pca$sdev^2 / sum(pca$sdev^2)
-  if (!all(intgroup %in% names(colData(object)))) {
-    stop("the argument 'intgroup' should specify columns of colData(dds)")
-  }
-  intgroup.df <- as.data.frame(colData(object)[, intgroup,
-    drop = FALSE
-  ])
-  group <- if (length(intgroup) > 1) {
-    factor(apply(intgroup.df, 1, paste, collapse = " : "))
-  } else {
-    colData(object)[[intgroup]]
-  }
-  d <- data.frame(pca$x,
-    group = group,
-    intgroup.df, name = colnames(object)
-  )
-  attr(d, "percentVar") <- percentVar
-  return(d)
-}
-
-.pcaPlotGKT <- function(object, intgroup = "condition", xpc = 1, ypc = 2, ntop = 500) {
-  d <- .pcaDataGKT(object, intgroup, ntop)
-  percentVar <- round(100 * attr(d, "percentVar"))
-  ggplot(d, aes_string(x = names(d)[xpc], y = names(d)[ypc])) +
-    labs(x = paste0(names(d)[xpc], ": ", percentVar[xpc], "% variance"), y = paste0(names(d)[ypc], ": ", percentVar[ypc], "% variance"))
-}
+  percentVar <- round(100 * percentVar)
+  ggplot(d, aes(x = .data[[paste0('PC',xpc)]], y = .data[[paste0('PC',ypc)]])) +
+    labs(x = paste0('PC',xpc, ": ", percentVar[xpc], "% variance"), 
+         y = paste0('PC',ypc, ": ", percentVar[ypc], "% variance"))
+}
diff --git a/R/RLE.R b/R/RLE.R
@@ -54,16 +54,3 @@ checkRLE <- function(dds) {
     ggtitle(title) +
     aes(forcats::fct_inorder(.data$Sample)) #+ scale_y_continuous(limits = c(-9,3.25), expand = c(0,0))
 }
-
-.plotRLE <- function(data, title) {
-  tibble::as_tibble(data, rownames = NA) %>%
-    tidyr::pivot_longer(everything(), names_to = "Sample", values_to = "RLE") %>%
-    ggplot(aes(x = .data$Sample, y = .data$RLE)) +
-    geom_hline(yintercept = 0, color = "red") +
-    geom_violin(draw_quantiles = c(0.25, 0.75), trim = TRUE, color = "lightgreen", alpha = 0.1) +
-    geom_boxplot(alpha = 0) +
-    theme_bw() +
-    theme(axis.text.x = element_text(vjust = 0.5, angle = 90), axis.title.x = element_blank(), aspect.ratio = 0.55) +
-    ggtitle(title) +
-    aes(forcats::fct_inorder(.data$Sample)) #+ scale_y_continuous(limits = c(-9,3.25), expand = c(0,0))
-}
diff --git a/R/gseaDotplot_joint.R b/R/gseaDotplot_joint.R
@@ -104,5 +104,5 @@ gseaDotplot_joint <- function(gsea_results,
                  range=c(1,8),
                  breaks=-log10(c(0.1,0.01,0.001,0.0001)),
                  labels=c(0.1,0.01,0.001,0.0001)) +
-    geom_text(na.rm = TRUE, color = 'black', size = 3)
+    geom_text(na.rm = TRUE, color = 'white', size = 3)
 }
diff --git a/R/heatmapFromGenelist.R b/R/heatmapFromGenelist.R
@@ -3,15 +3,14 @@
 #' Makes a heatmap of the given list of genes, separating samples slightly by
 #'  group variable. The heatmap will render samples in the order of colnames
 #'  in the provided data, and will be split by the metadata column specified.
-#'  Expression values will be normalized to the median of a specified group.
+#'  Normalization by groups or samples is no longer contained in this function.
+#'  See `gencoreBulk::normalizeCounts` for help on normalizing data for heatmaps.
 #'
 #'  If you want to reorder columns plotted, arrange the data passed into `data`.
+#'  See the examples
 #'
 #' @param geneList Character vector of genes to plot
-#' @param baseline_grouping Character, level of `colnames(colData(data))`
-#' @param baseline level of `baseline_grouping` in `colData(data)` used to calculate
-#'  median, or leave blank to use all samples to generate median
-#' @param data Object of class DESeqTransform
+#' @param data Matrix of processed counts
 #' @param slice_labels Optional labels for sliced columns
 #' @param colors vector of color values passed to colorRamp2, default is c("blue", "white", "red")
 #' @param slice_labels_rot Rotation angle of `slice_labels`
@@ -35,16 +34,26 @@
 #'
 #' @examples
 #' \dontrun{
+#' 
+#' ## Normalize data for plotting
+#' data_to_plot <- normalizeCountsForHeatmapByIndividual(assay(assays(obj.pbmc)$rld),
+#'   obj.pbmc@colData,
+#'   group_var = 'Timepoint', baseline = 'pre', 
+#'   individual_var = 'Individual',
+#'   remove_baseline = FALSE)
+#'   
 #' ## Simple call
 #' heatmapFromGenelist(
 #'   geneList = c("Ccl2", "Cxcl1", "Cxcl2", "Postn", "Fn1", "Thbs1"),
+#'   data_to_plot,
 #'   baseline_grouping = "Group",
 #'   baseline = "Cont"
 #' )
 #'
 #' ## run with custom reordering of data and labeled slices
 #' heatmapFromGenelist(
 #'   geneList = c("Ccl2", "Cxcl1", "Cxcl2", "Postn", "Fn1", "Thbs1"),
+#'   data_to_plot,
 #'   baseline_grouping = "Group",
 #'   baseline = "Cont",
 #'   column_split = c(rep(1, 3), rep(2, 3), rep(3, 3), rep(4, 3)),
@@ -55,30 +64,32 @@
 #'
 #' @export
 heatmapFromGenelist <- function(geneList,
-                                baseline_grouping = NULL,
-                                baseline = NULL,
-                                column_split = NULL,
-                                slice_labels = NULL,
-                                colors = c("blue", "white", "red"),
-                                data = SummarizedExperiment::assays(analysis$dds)$rld,
-                                column_labels = colnames(data),
-                                slice_labels_rot = 90,
-                                box_width = unit(3.5, "mm"),
-                                box_height = unit(3.5, "mm"),
-                                width_buffer = unit(5, "mm"),
-                                height_buffer = unit(10, "mm"),
-                                column_title = " ",
-                                cluster_rows = FALSE,
-                                cluster_columns = FALSE,
-                                column_gap = unit(2, "mm"),
-                                scale_min = -2,
-                                scale_max = 2,
-                                heatmap_legend_param = list(
-                                  at = c(scale_min, 0, scale_max),
-                                  labels = c(scale_min,0,scale_max),
-                                  title = 'log2 fold\ndifference\nfrom\nmedian\nexpression'
-                                ),
-                                ...) {
+                                 data,
+                                 column_split = NULL,
+                                 slice_labels = NULL,
+                                 colors = c("blue", "white", "red"),
+                                 column_labels = colnames(data),
+                                 slice_labels_rot = 90,
+                                 box_width = unit(3.5, "mm"),
+                                 box_height = unit(3.5, "mm"),
+                                 width_buffer = unit(5, "mm"),
+                                 height_buffer = unit(10, "mm"),
+                                 column_title = " ",
+                                 cluster_rows = FALSE,
+                                 cluster_columns = FALSE,
+                                 column_gap = unit(2, "mm"),
+                                 scale_min = -2,
+                                 scale_max = 2,
+                                 heatmap_legend_param = list(
+                                   at = c(scale_min, 0, scale_max),
+                                   labels = c(scale_min,0,scale_max),
+                                   title = 'log2 fold\nchange'
+                                 ),
+                                 ...) {
+  if (inherits(data, 'DESeqTransform')) {
+    message('Converting DESeqTransform data into matrix')
+    data <- SummarizedExperiment::assay(data)
+  }
   duds <- geneList[!geneList %in% rownames(data)]
   if (length(duds) > 0){
     geneList <- geneList[geneList %in% rownames(data)]
@@ -88,45 +99,33 @@ heatmapFromGenelist <- function(geneList,
       message(paste0('Genes ', paste0(duds, collapse = ', '), ' not found in data'))
     }
   }
-  hmap <- data[geneList, ]
-  if (is.null(baseline_grouping) | is.null(baseline)) {
-    message('Basline grouping or baseline level not specified, using all samples
-            to generate median expression per gene')
-    baseline <- matrixStats::rowMedians(SummarizedExperiment::assay(hmap))
-  } else if (!baseline_grouping %in% colnames(colData(data))) {
-    stop("Argument 'baseline_grouping' should be in colData(data)")
-  } else if (!baseline %in% unique(colData(data)[[baseline_grouping]])) {
-    stop("Argument 'baseline' should be a level of 'baseline_grouping' in colData(data)")
-  } else{
-    baseline <- matrixStats::rowMedians(SummarizedExperiment::assay(hmap[, as.character(hmap@colData[[baseline_grouping]]) %in% baseline]))
-  }
-  hmap <- SummarizedExperiment::assay(hmap) - baseline
-  ComplexHeatmap::Heatmap(hmap,
-    heatmap_legend_param = heatmap_legend_param,
-    #border = "black",
-    width = ncol(hmap) * box_width + width_buffer,
-    height = nrow(hmap) * box_height + height_buffer,
-    #rect_gp = grid::gpar(color = "black"),
-    column_title = column_title,
-    cluster_rows = cluster_rows,
-    cluster_columns = cluster_columns,
-    column_split = column_split,
-    top_annotation = (if (!is.null(slice_labels)) {
-      if (is.null(column_split)) {
-        warning("Setting labels requires slices to also be set")
-      }
-      ComplexHeatmap::HeatmapAnnotation(foo = ComplexHeatmap::anno_block(
-        gp = gpar(col = NA),
-        labels = slice_labels,
-        labels_gp = gpar(col = "black", fontsize = 10),
-        labels_rot = slice_labels_rot, height = unit(2, "cm")
-      ))
-    } else {
-      NULL
-    }),
-    column_gap = column_gap,
-    column_labels = column_labels,
-    col = circlize::colorRamp2(c(scale_min, 0, scale_max), colors),
-    ...
+  data <- data[geneList,,drop=FALSE]
+  ComplexHeatmap::Heatmap(data,
+                          heatmap_legend_param = heatmap_legend_param,
+                          #border = "black",
+                          width = ncol(data) * box_width + width_buffer,
+                          height = nrow(data) * box_height + height_buffer,
+                          #rect_gp = grid::gpar(color = "black"),
+                          column_title = column_title,
+                          cluster_rows = cluster_rows,
+                          cluster_columns = cluster_columns,
+                          column_split = column_split,
+                          top_annotation = (if (!is.null(slice_labels)) {
+                            if (is.null(column_split)) {
+                              warning("Setting labels requires slices to also be set")
+                            }
+                            ComplexHeatmap::HeatmapAnnotation(foo = ComplexHeatmap::anno_block(
+                              gp = gpar(col = NA),
+                              labels = slice_labels,
+                              labels_gp = gpar(col = "black", fontsize = 10),
+                              labels_rot = slice_labels_rot, height = unit(2, "cm")
+                            ))
+                          } else {
+                            NULL
+                          }),
+                          column_gap = column_gap,
+                          column_labels = column_labels,
+                          col = circlize::colorRamp2(c(scale_min, 0, scale_max), colors),
+                          ...
   )
 }