pca_report_template.Rmd


---
title: "PCA & Correlations"
author: "Kimon Froussios"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document: 
    code_folding: hide
    toc: yes
    toc_depth: 4
    toc_float:
      collapsed: no
params:
  name: ''                            # Input file name; its basename (minus .txt/.tsv) prefixes the saved RDS files
  tpm: ''                             # values table path
  covars: ''                          # Experimental variables table
  outdir: ''                          # save path for intermediate data
  nidcols: 1                          # how many columns contain alternative IDs, including the col with the row IDs
  idcol: 1                            # column that has the row IDs
  ntop: 10                            # maximum number of hits to list
  minMean: 0                          # Filter out poorly covered genes
  minSingle: 0                        # Bypass of the above, for genes expressed only in one or few samples
  topVars: 500                        # Number of top dispersed genes to use
  loopVal: "default"                  # default = no looping
  excluded: NULL                      # Features not to be included in PCA
  sortVar: NULL                       # Variable in covars by which to sort the correlation axes
  groupVar: NULL                      # Variable in covars by which to summarise the correlations
  onlyUsable: FALSE                   # Only include the samples defined in the 'use' variable
---

\VignetteEngine{knitr::knitr}

```{r setup, include=FALSE}
# Default chunk options for the whole report: show the code of every chunk.
# NOTE(review): `bitmapType` looks like an R graphics option (see ?options, ?png)
# rather than a knitr chunk option - confirm that passing it here has the
# intended effect on the PNG device.
knitr::opts_chunk$set(echo = TRUE, bitmapType='cairo-png')
```

```{r message=FALSE, warning=FALSE}
library(data.table)
library(DT)
library(ggplot2)
library(htmltools)
library(patchwork)
library(matrixStats)
library(ggforce)
library(ggdendro)
library(ggrepel)
library(ggbeeswarm)

#setDTthreads(1)

# Raise the penalty for scientific notation when numbers are printed.
options(scipen=2)

# Make theme_bw() the default ggplot2 theme for the plots created below.
theme_set(theme_bw())
```

```{r functions}
# Correlations
my_pairwise_internal_corels <- function(mat, samples, method = "pearson", rds=NULL, txs=3, minMean=0, minSingle=0, groups=NULL, loopVal="default") {
  # Pairwise correlations between the columns of `mat`, returned as a set of
  # heatmaps (supplied order and clustered order) plus a per-group summary.
  #
  # mat        Numeric matrix, features in rows and samples in (named) columns.
  #            The calls in this file pass log10-transformed values.
  # samples    Names of the columns of `mat`, in the display order for the
  #            unclustered plots. The clustered plots are indexed from the same
  #            subset, so this should cover every column of `mat`.
  # method     Correlation method, passed to cor().
  # rds        Optional file path; when given, the returned list is also saved
  #            there with saveRDS().
  # txs        Size of the correlation value labels, used as rel(txs).
  # minMean, minSingle
  #            Row filter thresholds; when both are 0 only the row-sum filter
  #            is applied (see "Filter" below).
  # groups     Optional table with columns observation1, observation2 and
  #            group, labelling each sample pair for the summary plot. When
  #            NULL every pair is labelled 'NA'.
  # loopVal    Accepted for interface compatibility; not used in the body.
  #
  # Returns a named list: `corr` (wide table of the upper-triangle
  # correlations) and the plots `sonf`, `sovf`, `scnf`, `scvf`, `tovf`, `tcvf`
  # and `bee`.

  # mat <- log10(counts[topgenes,]);

  # Filter
  if (minMean != 0 || minSingle != 0) {
    keep <- rowSums(mat >= minSingle) >= 1 | rowMeans(mat) >= minMean
  } else {
    # NOTE(review): the callers pass log10 values, for which a row sum > 0 is
    # not the same as "has any counts" - confirm this filter is intended.
    keep <- rowSums(mat) > 0
  }
  mat <- mat[keep, , drop = FALSE]   # drop = FALSE: stay a matrix even with one row left
  if (nrow(mat) < 2 || ncol(mat) < 2) {
    stop("Need at least 2 features and 2 samples to compute and cluster correlations.", call. = FALSE)
  }

  # Correlations
  cormat <- cor(mat, method=method)

  # Cluster
  hcfit <- hclust(dist(scale(cormat, center=TRUE)))
  rn <- rownames(cormat)

  # Make dendrogram. https://stackoverflow.com/questions/42047896/joining-a-dendrogram-and-a-heatmap
  dend <- as.dendrogram(hcfit)
  dend_data <- dendro_data(dend)
  # Setup the data, so that the axes are exchanged, instead of using coord_flip()
  segment_data <- with(
      segment(dend_data),
      data.frame(x = y, y = x, xend = yend, yend = xend))
  # Use the dendrogram label data to position the sample labels
  sample_pos_table <- with(
      dend_data$labels,
      data.frame(y_center = x, sample = as.character(label), height = 1))
  # Limits for the vertical axes
  sample_axis_limits <- with(
      sample_pos_table,
      c(min(y_center - 0.5 * height), max(y_center + 0.5 * height))
  ) + 0.1 * c(-1, 1) # extra spacing: 0.1
  # Dendrogram plot
  pd <- ggplot(segment_data) +
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
    scale_x_reverse(expand = c(0, 0.5),
                    position = "top") +
    scale_y_continuous(position = "right",
                       breaks = sample_pos_table$y_center,
                       labels = sample_pos_table$sample,
                       limits = sample_axis_limits,
                       expand = c(0, 0)) +
    labs(x = NULL, y = NULL) +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank())

  # Create duplicates for different plot styles.
  cormat <- cormat[samples, samples]                     # In the supplied order.
  cormat_t <- cormat
  cormat_t[lower.tri(cormat_t, diag = TRUE)] <- NA_real_ # Non-clustered triangle: also blank the diagonal.
  clustered_order <- rn[hcfit$order]
  cormat_c <- cormat[clustered_order, clustered_order]   # In clustered order.
  cormat_ct <- cormat_c
  cormat_ct[lower.tri(cormat_ct)] <- NA_real_            # Clustered triangle keeps the diagonal, so the dendrogram lines up.
  cormat_ctv <- cormat_ct
  diag(cormat_ctv) <- NA_real_                           # Value labels of the clustered plots skip the diagonal.

  # Restructure for plotting: long format, with both axes as ordered factors
  # that preserve the row order of the matrix.
  restruct <- function(cm){
    rn <- rownames(cm)
    cm <- as.data.table(cm)
    cm[, observation1 := factor(rn, ordered=TRUE, levels=rn)]
    cm <- melt(cm, id.vars = "observation1", value.name = "Correlation", variable.name = "observation2")
    cm[, observation2 := factor(observation2, ordered=TRUE, levels=rn)]
    cm
  }

  cormat <- restruct(cormat)
  cormat_c <- restruct(cormat_c)
  cormat_t <- restruct(cormat_t)[!is.na(Correlation)]
  cormat_ct <- restruct(cormat_ct)[!is.na(Correlation)]
  cormat_ctv <- restruct(cormat_ctv)[!is.na(Correlation)]

  # Text colour switch for the dynamic range
  m <- min(cormat_ctv$Correlation, na.rm=TRUE)
  M <- max(cormat_ctv$Correlation, na.rm=TRUE)
  colourswitch <- c( m + 0.49 * (M-m),  m + 0.51 * (M-m) )

  # "Pearson's", "Spearman's", ...
  method_label <- paste0(toupper(substr(method, 1, 1)), tolower(substr(method, 2, nchar(method))), "'s")
  full_range_colours <- c("lightskyblue", "dodgerblue3", "darkblue", "black", "darkred", "red", "gold")
  dynamic_colours <- c("black", "red", "gold", "white")

  # One heatmap.
  #   tiles      long-format correlations to draw as tiles
  #   values     long-format correlations to print as text, or NULL for none
  #   clustered  only affects the title
  #   dynamic    FALSE: colour scale fixed to [-1, 1]; TRUE: scale fitted to the data
  corr_heatmap <- function(tiles, values = NULL, clustered = FALSE, dynamic = FALSE) {
    p <- ggplot(tiles, aes(x=observation1, y=observation2)) +
      geom_tile(aes(fill=Correlation))
    if (!is.null(values)) {
      # Labels drop the leading zero ("0.95" -> ".95"); fixed = TRUE so that the
      # dot is a literal dot and not a regex wildcard.
      # Text is white on the dark part of the fill scale and black elsewhere.
      if (dynamic) {
        text_map <- aes(label=sub('0.', '.', as.character(round(Correlation, 2)), fixed=TRUE),
                        colour=(Correlation <= colourswitch[2]))
      } else {
        text_map <- aes(label=sub('0.', '.', as.character(round(Correlation, 2)), fixed=TRUE),
                        colour=Correlation >= -0.60 & Correlation <= 0.60)
      }
      p <- p +
        geom_text(data=values, mapping=text_map, size=rel(txs)) +
        scale_colour_manual(values=c('FALSE'="black", 'TRUE'="white"), na.value="grey50", guide="none")
    }
    if (dynamic) {
      p <- p + scale_fill_gradientn(colors=dynamic_colours, na.value = "grey50" )
    } else {
      p <- p + scale_fill_gradientn(limits=c(-1, 1), colors=full_range_colours, na.value = "grey50" )
    }
    p +
      scale_x_discrete(position = "top") +
      labs(x='', y='', title=paste(method_label, if (clustered) "correlation - Clustered" else "correlation")) +
      theme(axis.text.x=element_text(angle=90, hjust=0, vjust=0.5),
            panel.grid = element_blank() )
  }

  # Plot name key: s/t = square/triangle, o/c = supplied/clustered order,
  # n/v = no values/with values, f/d = full/dynamic colour range.
  # Only the full-range variants are returned; a dynamic-range variant is the
  # same call with dynamic = TRUE.
  p_sonf <- corr_heatmap(cormat)
  p_sovf <- corr_heatmap(cormat, values = cormat_t)
  p_scnf <- corr_heatmap(cormat_c, clustered = TRUE)
  p_scvf <- corr_heatmap(cormat_c, values = cormat_ctv, clustered = TRUE)
  p_tovf <- corr_heatmap(cormat_t, values = cormat_t)
  p_tcvf <- corr_heatmap(cormat_ct, values = cormat_ctv, clustered = TRUE)

  # If exact pairs don't matter: summarise the pairwise correlations per group.
  if (is.null(groups)) {
    # No grouping supplied: one catch-all group, same label the report uses
    # when no grouping variable is configured.
    cormat_x <- copy(cormat_t)
    cormat_x[, group := 'NA']
  } else {
    cormat_x <- merge(cormat_t, groups, by = c('observation1', 'observation2'), all.x = TRUE, all.y = FALSE)
  }

  p_bee <- ggplot(cormat_x, aes(x = Correlation, y = group, colour = group)) +
    geom_boxplot(outlier.alpha = 0, width = 0.5, fill = 'grey95', colour = 'black') +
    geom_beeswarm() +
    labs(title = paste(method_label, "correlation - Summary"), x = "Pairwise Correlation", y = "Pairs") +
    theme(legend.position = 'none')

  out <- list(corr=dcast(cormat_t, observation1 ~ observation2, value.var = "Correlation"),
              sonf=p_sonf,
              sovf=p_sovf,
              scnf=pd + p_scnf + plot_layout(ncol=2, widths=c(1,4)),
              scvf=pd + p_scvf + plot_layout(ncol=2, widths=c(1,4)),
              tovf=p_tovf,
              tcvf=pd + p_tcvf + plot_layout(ncol=2, widths=c(1,4)),
              bee = p_bee
            )

  if (!is.null(rds) && length(rds) > 0) {
    saveRDS(out, file = rds)
  }

  return(out)
}

## helper function
pca_plotter <- function(pc, loads, highloads, ig, pcaimp, pcx, pcy){
  # Scatter plots of the samples on two principal components.
  #
  # pc         Data frame with one row per sample: columns PC<n> with the
  #            scores, a `sample` column, and one column per name in `ig`.
  # loads, highloads
  #            Loadings table and per-PC top features. Only needed by the
  #            disabled loadings overlay at the bottom; currently unused.
  # ig         Names of the columns of `pc` to colour by, one panel each.
  # pcaimp     data.table with columns PC and Explained (% variance).
  # pcx, pcy   Numbers of the components for the x and y axis.
  #
  # Returns an unnamed list of two: the plot labelled by sample, and a
  # patchwork with one panel per variable in `ig`.

  # pc <- pc; loads <- infl; highloads <- highinfl; ig <- ig; pcaimp <- pcaimp; pcx <- 1; pcy <- 2
  mycolours <- c(
    "#880000", "#ff0000", "#eebb00", "#ddaadd", "#22ff00", "#224400", "#7700aa", "#0000ff", "#00ddff"
  )
  # This works best when the colours are in a sensible sequence of gradual change.
  getPalette <- colorRampPalette(mycolours)

  pcxs <- paste0("PC", pcx)
  pcys <- paste0("PC", pcy)
  # Symmetric axis limit. na.rm = TRUE because `pc` may contain rows without
  # scores (do_pca() merges the covariates with all = TRUE), which would
  # otherwise turn the limit into NA.
  lmt <- max(abs(c(pc[[pcxs]], pc[[pcys]])), na.rm = TRUE)
  x_lab <- paste0(pcxs, " (", round(pcaimp[PC==pcx, Explained], 1), "%)")
  y_lab <- paste0(pcys, " (", round(pcaimp[PC==pcy, Explained], 1), "%)")

  # Layers common to every panel: origin cross-hairs, text labels, square
  # symmetric axes, axis titles. Built afresh for each plot.
  shared_layers <- function() {
    list(
      geom_vline(xintercept=0, linetype='dashed', colour='grey70'),
      geom_hline(yintercept=0, linetype='dashed', colour='grey70'),
      geom_text(size=rel(2.1)),
      coord_fixed(ratio=1, xlim=c(-lmt, lmt), ylim=c(-lmt, lmt) ),
      labs(x=x_lab, y=y_lab)
    )
  }

  pc_samples <- ggplot(pc, aes(x = .data[[pcxs]], y = .data[[pcys]], label = sample, colour = sample)) +
    shared_layers() +
    scale_colour_manual(values=getPalette(length(unique(pc[['sample']])))) +
    labs(title='sample') +
    theme(panel.grid=element_blank(), legend.position='none')

  variable_panels <- lapply(ig, function(varname) {
    # varname <- ig[2]
    ggplot(pc, aes(x = .data[[pcxs]], y = .data[[pcys]], label = as.character(.data[[varname]]), colour = as.character(.data[[varname]]))) +
      shared_layers() +
      scale_colour_manual(values=getPalette(length(unique(pc[[varname]]))), name=varname) +
      labs(title=varname) +
      guides(colour = guide_legend(ncol = 1)) +
      theme(panel.grid=element_blank(), legend.position='right')
  })
  panel_columns <- if (length(ig) < 9) 2 else 3
  pc_variables <- wrap_plots(variable_panels, ncol = panel_columns)

      # Disabled: overlay of the top loadings on the sample plot.
      # subload <- loads[rowID %in% unique(c(head(highloads[[pcx]], 5), head(highloads[[pcy]], 5))), c("rowID", pcxs, pcys), with=FALSE]
      # ggplot(pc, aes(x = .data[[pcxs]], y = .data[[pcys]], label = .data[['sample']], fill = as.character(.data[[varname]]), colour = as.character(.data[[varname]]))) +
      #   geom_vline(xintercept=0, linetype='dashed', colour='grey70') +
      #   geom_hline(yintercept=0, linetype='dashed', colour='grey70') +
      #   # geom_mark_hull(alpha=0.1, colour='transparent') +
      #   geom_segment(data=subload, inherit.aes=FALSE, colour='black', arrow=arrow(length=unit(0.1, 'cm')), alpha=0.2,
      #                x=0, y=0, aes(xend = .data[[pcxs]], yend = .data[[pcys]])) +
      #   geom_point() +
      #   geom_text_repel(data=subload, inherit.aes=FALSE, aes(label = .data[["rowID"]], x = .data[[pcxs]], y = .data[[pcys]]), size=2) +
      #   coord_fixed(ratio=1, xlim=c(-lmt, lmt), ylim=c(-lmt, lmt) ) +
      #   scale_colour_manual(values=getPalette(length(unique(pc[[varname]])))) +
      #   scale_fill_manual(values=getPalette(length(unique(pc[[varname]])))) +
      #   labs(x=paste0(pcxs, " (", round(pcaimp[PC==pcx, Explained], 1), "%)"),
      #        y=paste0(pcys, " (", round(pcaimp[PC==pcy, Explained], 1), "%)")) +
      #   theme(panel.grid=element_blank())

  return(list(pc_samples, pc_variables))
}

# PCA
# (scaling and centering of genes to one another, NOT samples to one another)
do_pca <- function(countsmat, covars, scale = TRUE, center = TRUE, rds=NULL, topgenes=NULL, exclude=NULL, minMean=0, minSingle=0, ntop=10) {
  # Principal component analysis of the samples (columns) of `countsmat`,
  # with a scree plot, a mean/variation plot of the features and 2D score plots.
  #
  # countsmat  Numeric matrix, features in (named) rows, samples in (named)
  #            columns. The commented example call passes log10 values.
  # covars     Table of sample covariates with a `sample` column; every other
  #            column except Sample/name/Name/sizeFactor gets a score panel.
  # scale, center
  #            Passed to prcomp() (applied per feature).
  # rds        Optional file path; when given, the returned list is also saved
  #            there with saveRDS().
  # topgenes   Optional feature names to restrict the PCA to. When NULL, all
  #            variable features passing minMean / minSingle are used.
  # exclude    Optional feature names to leave out of the PCA.
  # minMean, minSingle
  #            Thresholds on the row mean / on single values.
  # ntop       Number of top-loading features to report per component.
  #
  # Returns a named list: pca (prcomp object), nvars, nPC, pimp (scree plot),
  # pvar1 (mean vs coefficient of variation), highloads, pc2d (list of
  # pca_plotter() results) and npanels.

  # countsmat <- log10(counts[, samples])
  # scale = TRUE; center = TRUE; rds = file.path(params$outdir, sub(".txt|.tsv", "_corrP.RDS", basename(params$name))); topgenes=NULL; exclude=NULL; minMean=0; minSingle=0; ntop=10

  nvars <- nrow(countsmat)
  # NOTE(review): for log-scale input a row sum > 0 is not the same as "has any
  # counts" - confirm this filter is intended.
  countsmat <- countsmat[rowSums(countsmat) > 0, , drop = FALSE]

  # Mean and Standard Deviation per feature, and in how many samples it reaches minSingle.
  genevar <- data.table(name = rownames(countsmat),
                        Mean = rowMeans(countsmat),
                        StDev = rowSds(countsmat),
                        singles = rowSums(countsmat >= minSingle))
  # normcounts <- (countsmat - genevar$Mean) / genevar$StDev
  # genevar [, zMean := rowMeans(normcounts)]
  # genevar [, zStDev := rowSds(normcounts)]    # always 1 by definition

  # Select features
  if (is.null(topgenes)) {
    message(paste("Using variable features that exceed either", minMean, "mean count or", minSingle, "count in any single sample."))
    topgenes <- genevar[StDev > 0 & (Mean >= minMean | singles > 0), name]  # all the variable genes, above min count level
  } else {
    message("Using the variable features among the manually provided list.")
    topgenes <- genevar[name %in% topgenes & StDev > 0, name] # manually provided features, as long as they are variable
  }
  usedgenes <- topgenes[!topgenes %in% exclude]   # what actually enters the PCA
  message(paste("Number of variable features available:", nrow(genevar[StDev>0,])))
  message(paste("Number of variable features used:", length(usedgenes)))
  if (length(usedgenes) == 0) {
    stop("No variable features are left for the PCA after filtering and exclusions.", call. = FALSE)
  }

  # PCA. Samples become the rows; drop = FALSE keeps the orientation correct
  # even when a single feature is left.
  subcmat <- t(countsmat[usedgenes, , drop = FALSE])
  pca <- prcomp(subcmat, center = center, scale. = scale)

  # Rescale the scores of each component by 1 / (sdev * sqrt(n - 1)).
  srn <- sqrt(nrow(pca$x) - 1)
  pc <- sweep(pca$x, 2, 1 / (pca$sdev * srn), FUN = '*')

  pc <- cbind(pc, data.frame(sample = rownames(pc)))
  pc <- as.data.frame(merge(pc, covars, by="sample", all = TRUE))
  # Components with standard deviation above 1; always keep at least the
  # first one so the plots below have something to show.
  npc <- max(1L, sum(pca$sdev > 1))

  # Screeplot.

  importance <- summary(pca)$importance
  pcaimp <- data.table(PC = seq_len(ncol(pca$x)),
                       Explained = importance['Proportion of Variance', ] * 100,
                       Cumulative = importance['Cumulative Proportion', ] * 100)
  pcaimp <- pcaimp[seq_len(npc), ]
  topnpc <- nrow(pcaimp[Explained >= 5])

  pimp <- ggplot(pcaimp[Explained > 1, ]) +
    geom_line(aes(x=PC, y=Cumulative), colour='dodgerblue') +
    geom_bar(aes(x=PC, y=Explained, fill=Explained >= 5),
             stat='identity', colour='transparent', alpha=0.3) +
    geom_text(aes(x=PC, y=0, label=paste0(round(Explained, 1),"%")),
              angle=90, hjust=0, vjust=0, size=rel(3)) +
    # Cumulative labels from PC2 onwards (for PC1 it equals the bar label).
    geom_text(data=pcaimp[PC > 1 & Explained > 1, ],
              aes(x=PC, y=0.95*Cumulative, label=paste0(round(Cumulative, 1),"%")),
              angle=90, vjust=0, hjust=1, size=rel(3)) +
    # Named values, so the colours stay right when only one of the two levels occurs.
    scale_fill_manual(values=c('FALSE'="grey75", 'TRUE'="grey25"), guide="none") +
    scale_x_continuous(breaks=seq_len(npc)) +
    labs(title = "Scree plot", subtitle=paste0(length(usedgenes), " features, ", npc, " PCs"),
         x = "Principal Component", y='% Variance') +
    theme(panel.grid=element_blank())


  # Top influencers: per component, the `ntop` features with the largest absolute loading.

  infl <- as.data.table(pca$rotation)
  feature_ids <- rownames(pca$rotation)
  highinfl <- lapply(infl, function(loading){
    head(unique(feature_ids[order(abs(loading), decreasing=TRUE)]), ntop)
  })
  infl[, rowID := feature_ids]
  # highinfl <- highinfl[1:topnpc]
  setkey(infl, rowID)

  # Influence plots
  # lapply(1:npc, function(i) {
  #   ggplot(infl[highinfl[[i]],
  #             c('rowID', names(loads)[i]), with=FALSE],
  #        aes_string(x="rowID", xend="rowID", y=0, yend=names(loads)[i])) +
  #   geom_segment() +
  #   geom_point(aes_string(y=names(loads)[i])) +
  #   coord_flip() +
  #   labs(y=NULL, x=NULL)
  # })

  # Coefficient names.

  ig <- names(covars)
  ig <- ig[! ig %in% c('sample', 'Sample', 'name', 'Name', 'sizeFactor')]
  # ig <- ig[! ig %in% c('name', 'Name', 'sizeFactor')]

  # Means and Variances of the selected genes
  genevar[, selected := name %in% usedgenes]
  genevar[, excluded := name %in% exclude]
  pvar1 <- ggplot() +
      geom_point(data=genevar[!selected & excluded, ], aes(x=Mean, y=StDev/Mean), shape=16, size=0.8, alpha=0.5, colour='grey50') +
      geom_point(data=genevar[!selected & !excluded, ],aes(x=Mean, y=StDev/Mean), shape=16, size=0.8, alpha=1, colour='black') +
      geom_point(data=genevar[(selected), ], aes(x=Mean, y=StDev/Mean), shape=16, size=1, alpha=1, colour='red') +
      # Invisible points whose only purpose is to create the legend entries.
      geom_point(aes(x=NA_real_, y=NA_real_, colour=c('Filtered out','Not used','Used'))) +
      scale_x_log10() +
      scale_y_log10() +
      scale_colour_manual(name='Filtering:', breaks=c('Used', 'Not used', 'Filtered out'), values=c('red', 'black', 'grey70')) +
      annotation_logticks(base=10, sides='lb') +
      labs(x="Mean", y="Coeff. of Variation", title=paste(nrow(genevar[StDev>0, ]), "variable features"),
           subtitle=paste(nrow(genevar[(!excluded),]), "after filtering,", length(usedgenes), "used")) +
      theme(legend.position='bottom', panel.grid=element_blank())

  # Plot higher PCs in 2D pairs. Highlight one variable at a time.
  # max(topnpc, 1): always draw at least the first pair, also when no
  # component reaches 5% (seq(1, 0, 2) would be an error).
  L <- lapply(seq(1, max(topnpc, 1), 2), function(pcx){
    # pcx <- 1
    pcy <- pcx + 1
    if (pcy > npc) pcy <- 1   # no partner left: pair the last component with PC1
    pca_plotter(pc, infl, highinfl, ig, pcaimp, pcx, pcy)
  } )

  out <- list(pca=pca,
              nvars=nvars,
              nPC=npc,
              pimp=pimp,
              pvar1=pvar1,
              highloads=highinfl,
              pc2d=L,
              npanels=length(ig)
              )

  if (!is.null(rds) && length(rds) > 0) {
    saveRDS(out, file = rds)
  }

  return(out)
}
```


# Input

```{r parameters}
# Echo the run parameters, so that the rendered report records how it was made.
message("Name: ", params$name)
message("Minimum mean count across samples: ", params$minMean)
# message(paste("Minimum count in a single sample, if the minimum mean is not met:", params$minSingle))
message("Number of features to report per Principle Component: ", params$ntop)
# The exclusion list is the `excluded` parameter (there is no `specnorm` in the
# YAML header). Collapsed, so that several names print as one readable list.
message("Genes to exclude from library size calculations and PCA: ", paste(params$excluded, collapse = ", "))
message("Number of top dispersed genes to use for correlation and PCA: ",
        if (is.null(params$topVars)) ' all that meet the count thresholds ' else params$topVars)
message("Include only the samples marked as usable: ", params$onlyUsable)
message("sort axes by: ", params$sortVar)
message("summarise correlations by: ", params$groupVar)
```

```{r sample_info}
# Counts
# params$tpm is either a numeric matrix (used as is) or the path of a table
# whose first `nidcols` columns are IDs and whose other columns are samples.
if (!is.matrix(params$tpm)) {
  counts <- fread(params$tpm, colClasses="character")  # prevent any numeric IDs converting a string column to integer
  value_cols <- (params$nidcols + 1):ncol(counts)
  xref <- counts[, seq_len(params$nidcols), with=FALSE]
  samples <- names(counts)[value_cols]
  counts <- matrix(as.numeric(as.matrix(counts[, value_cols, with=FALSE])), ncol=length(value_cols))
  colnames(counts) <- samples  # because the type conversion lost them
  counts[is.na(counts)] <- 0
  rownames(counts) <- xref[[params$idcol]]
} else {
  counts <- params$tpm
}

# Covariates
# params$covars is either a data frame or the path of a table. Converted with
# as.data.table(), so that a plain data.frame also works with the data.table
# calls below.
if (!is.data.frame(params$covars)) {
  covars <- fread(params$covars, header = TRUE)
} else {
  covars <- as.data.table(params$covars)
}
if (!'sample' %in% names(covars)) {
  stop("The covariates table needs a 'sample' column.", call. = FALSE)
}
covars[, sample := as.character(sample)]  # sample names are matched against column names as text

# Grouping look-up: one row per ordered pair of samples, labelled with the
# pair of groups the two samples belong to.
setkey(covars, sample)
G <- as.data.table(expand.grid(observation1 = covars$sample, observation2 = covars$sample,
                               stringsAsFactors = FALSE))
if (!is.null(params$groupVar)) {
  sample_group <- covars[[params$groupVar]]
  G[, group1 := sample_group[match(observation1, covars$sample)]]
  G[, group2 := sample_group[match(observation2, covars$sample)]]
  # Fix order of conditions, regardless of order of sample names, so that
  # A <-> B and B <-> A are the same group. paste() turns the values into text
  # (missing values become "NA") before they are compared.
  first_label <- paste(G$group1)
  second_label <- paste(G$group2)
  G[, group := paste(pmin(first_label, second_label), pmax(first_label, second_label), sep = ' <-> ')]
} else {
  G[, group := 'NA']
}

# Samples
if (!is.null(params$sortVar)) {
  setorderv(covars, c(params$sortVar, 'sample'))
}
if (params$onlyUsable && 'use' %in% names(covars)) {
  samples <- covars[as.logical(use), sample]
} else {
  samples <- covars$sample
}

datatable(covars[sample %in% samples, ], options = list(pageLength = min(25, nrow(covars))))
```


# Filter

Genes with mean count across all samples below `r params$minMean` are excluded. For PCA, all genes are given equal weight, but low expression genes are proportionally more variable, so filtering them in advance reduces noise.

```{r filter}
stopifnot(all(samples %in% colnames(counts)))

# Row totals and means over the samples in use; needed for both `topgenes` and `lowcnt`.
feature_total <- rowSums(counts[, samples, drop = FALSE])
feature_mean <- rowMeans(counts[, samples, drop = FALSE])

# All genes above the count thresholds
# message(paste("Using features that exceed either", params$minMean, "mean count across samples or", params$minSingle, "count in any single sample."))
message(paste("Using features that exceed", params$minMean, "mean count across samples."))
n <- nrow(counts)
# topgenes <- rownames(counts)[(rowMeans(counts) >= params$minMean) | (rowSums(counts >= params$minSingle) > 0)]
topgenes <- rownames(counts)[(feature_total > 0) & (feature_mean >= params$minMean)]
message(length(topgenes), " features are expressed in at least one sample and meet the count threshold, out of ", n)

# Genes to exclude from PCA.
if (!is.null(params$excluded)) {
  n <- length(topgenes)
  topgenes <- topgenes[! topgenes %in% params$excluded ]
  message(paste("Additionally excluding", n - length(topgenes), "listed features."))
}

# Further reduce the above genes to the top most dispersed, as in DESeq2
# NOTE(review): zeros are only padded in the next chunk, so log10(0) = -Inf
# here gives a NaN coefficient of variation to every gene with a zero in any
# sample, and sort() places those last - confirm this is intended.
logged <- log10(counts[topgenes, samples, drop = FALSE])
cv <- sort(rowSds(logged) / rowMeans(logged), decreasing = TRUE, na.last = TRUE)   # coefficient of variation
# topVars = NULL means "all that meet the count thresholds"; head(n = NULL) would be an error.
if (is.null(params$topVars)) {
  topgenes2 <- names(cv)
} else {
  topgenes2 <- names(head(cv, n=params$topVars))
}


# Make it obvious which genes in the TPM-StDev plot were excluded due to low count.
# lowcnt <- rownames(counts)[(rowMeans(counts) < params$minMean) | (rowSums(counts >= params$minSingle) == 0)]
lowcnt <- rownames(counts)[(feature_mean < params$minMean) | (feature_total == 0)]
```

Padding of zeroes, for the purpose of log-scaling, is done with a value smaller than any non-zero value.

```{r pad}
# Log-abundances correlate more linearly than plain abundances, but log10(0)
# is not finite. Every zero is therefore raised by a pad of one tenth of the
# smallest positive value in the matrix.
pad <- min(counts[counts > 0], na.rm = TRUE) / 10
message('Padding zeros by ', pad, ' to enable log-scaling.')
counts <- replace(counts, counts == 0, counts[counts == 0] + pad)
```


# Correlations

The *Pearson* correlation coefficient between samples X and Y measures how well their observations can be represented as `Y = A * X + B`. A perfect PCC corresponds to a straight line in a 2D plot of corresponding X,Y value pairs.

The *Spearman* correlation coefficient does the same for the rank of the observations instead of their values. Perfect SCC is any monotonic line between X and Y values (straight, curved, or zigzag).

Using log-scaled counts.


## All eligible features

Correlations based on all the genes that meet the count and name criteria. The analysis is performed on the log-transformed padded counts.

```{r corr}
# Stem of the output file names: the input name without a trailing .txt/.tsv.
# The pattern is escaped and anchored, so only a real extension is removed and
# the suffix below is always appended (the output can never take the input's name).
rds_stem <- sub("\\.(txt|tsv)$", "", basename(params$name))
# NOTE: the parameter is `loopVal`; the previous `params$loopval` was always
# NULL, which left the loop value out of the file names ("__corrP.RDS").
logcounts_eligible <- log10(counts[topgenes, samples])
corelsP <- my_pairwise_internal_corels(logcounts_eligible, samples, method="pearson", groups = G,
                                      minMean = 0, minSingle = 0, loopVal=params$loopVal,   # prefiltered
                                      rds = file.path(params$outdir,
                                                      paste0(rds_stem, "_", params$loopVal, "_corrP.RDS")) )
corelsS <- my_pairwise_internal_corels(logcounts_eligible, samples, method="spearman", groups = G,
                                      minMean = 0, minSingle = 0, loopVal=params$loopVal,     # prefiltered
                                      rds = file.path(params$outdir,
                                                      paste0(rds_stem, "_", params$loopVal, "_corrS.RDS")) )
```

### Pearson

```{r, fig.height=15, fig.width=15}
# 'sovf' figure of the Pearson results (all eligible features).
print(corelsP[["sovf"]])
```

### Spearman

```{r, fig.height=15, fig.width=15}
# 'sovf' figure of the Spearman results (all eligible features).
print(corelsS[["sovf"]])
```

### Pearson - clustered

```{r, fig.height=15, fig.width=17}
# Clustered ('scvf') figure of the Pearson results.
print(corelsP[["scvf"]])
```

### Spearman - clustered

```{r, fig.height=15, fig.width=17}
# Clustered ('scvf') figure of the Spearman results.
print(corelsS[["scvf"]])
```

### Pearson - beeswarm

```{r, fig.height=8, fig.width=10}
# Beeswarm ('bee') figure of the Pearson results.
print(corelsP[["bee"]])
```

### Spearman - beeswarm

```{r, fig.height=8, fig.width=10}
# Beeswarm ('bee') figure of the Spearman results.
print(corelsS[["bee"]])
```


## Top dispersed features

Correlations using the top `r params$topVars` dispersed genes (out of those already filtered).

```{r corrb}
# Same pairwise correlations as for all eligible features, but restricted to the
# `topgenes2` rows (the top dispersed features). The count filters of the helper
# are switched off because the input is already prefiltered.
# The parameter is spelled `loopVal`: `$` matching is case-sensitive, so
# `params$loopval` was NULL and the loop value never made it into the RDS name.
# The extension pattern is escaped and anchored so that only a trailing
# .txt/.tsv is replaced.
corelsPb <- my_pairwise_internal_corels(log10(counts[topgenes2, samples]), samples, method="pearson", groups = G,
                                      minMean = 0, minSingle = 0, loopVal=params$loopVal,   # prefiltered
                                      rds = file.path(params$outdir,
                                                      sub("\\.(txt|tsv)$", paste0("_", params$loopVal, "_corrPtop.RDS"), basename(params$name))) )
corelsSb <- my_pairwise_internal_corels(log10(counts[topgenes2, samples]), samples, method="spearman", groups = G,
                                      minMean = 0, minSingle = 0, loopVal=params$loopVal,    # prefiltered
                                      rds = file.path(params$outdir,
                                                      sub("\\.(txt|tsv)$", paste0("_", params$loopVal, "_corrStop.RDS"), basename(params$name))) )
```

### Pearson

```{r, fig.height=15, fig.width=15}
# 'sovf' figure of the Pearson results (top dispersed features).
print(corelsPb[["sovf"]])
```

### Spearman

```{r, fig.height=15, fig.width=15}
# 'sovf' figure of the Spearman results (top dispersed features).
print(corelsSb[["sovf"]])
```

### Pearson - clustered

```{r corr2, fig.height=15, fig.width=17}
# Clustered ('scvf') figure of the Pearson results.
print(corelsPb[["scvf"]])
```

### Spearman - clustered

```{r, fig.height=15, fig.width=17}
# Clustered ('scvf') figure of the Spearman results.
print(corelsSb[["scvf"]])
```

### Pearson - beeswarm

```{r, fig.height=8, fig.width=10}
# Beeswarm ('bee') figure of the Pearson results.
print(corelsPb[["bee"]])
```

### Spearman - beeswarm

```{r, fig.height=8, fig.width=10}
# Beeswarm ('bee') figure of the Spearman results.
print(corelsSb[["bee"]])
```


# PCA

For this, the log-scaled expression of the features is further standardized. This compensates for large differences in magnitude range and gives every feature equal weight. Features with exactly `0` variability have to be excluded from the PCA; they are typically features that are not detected in any of the samples.

## All eligible features

PCA based on all genes that meet the count thresholds. The analysis is performed on the log-transformed padded counts.

```{r pca}
# PCA on the log10 of the padded counts of the selected samples, using the
# `topgenes` feature set and excluding the low-count features in `lowcnt`.
# The count filters of do_pca() are switched off because the input is prefiltered.
# The parameter is spelled `loopVal`: `$` matching is case-sensitive, so
# `params$loopval` was NULL and the loop value never made it into the RDS name.
# The dot of the extension is escaped so that it only matches a literal ".".
all_pca <- do_pca(log10(counts[, samples]), covars, ntop = params$ntop,
                  topgenes= topgenes, exclude=lowcnt,
                  minMean = 0, minSingle = 0,      # prefiltered
                  rds = file.path(params$outdir,
                                  sub("\\.(txt|tsv)$", paste0("_", params$loopVal, "_pca.RDS"), basename(params$name))))
```

### Selection overview

This shows the scaled expression and scaled dispersion for the variable features that were selected for the correlation and PCA, against the total of variable features. Non-variable features are not shown, due to log-scaled axes.

```{r varmean}
# Overview figure of the features selected for the all-features PCA run.
print(all_pca$pvar1)
```

### Variance explained by PCs

There are different ways to decide which components to consider important. One is based on the idea of diminishing returns and selects components before the "elbow" in the cumulative variance. The second considers all components that explain at least 1%. The third simply aims for a target cumulative variance explained, like 80%.

```{r scree}
# Scree figure: variance explained by the components of the all-features PCA run.
print( all_pca$pimp )
```


### Known variables

Highlighting the defined experimental variables against the principal components helps make sense of what the components are.

```{r pcs, results = 'asis'}
# Emit one sub-section per entry of all_pca$pc2d. Each entry holds two printable
# objects, which are rendered in their own child chunks so that each gets its
# own figure size.
# The heading number advances by 2 per entry — presumably each entry covers a
# pair of consecutive PCs; confirm against do_pca().
i <- 1
for (x in all_pca$pc2d) {
  kid <- knitr::knit_child(
    text = c('#### `r paste("PC", a)`',
           '',
           '```{r, fig.height=10, fig.width=10}',
           'print(b)',
           '```',
           '',
           '```{r, fig.height=18, fig.width=13}',
           'print(c)',
           '```'),
    # The child only sees these three bindings (plus the enclosing environment).
    envir = rlang::env(a = i, b = x[[1]], c = x[[2]]),
    # `quiet` is an argument of the knit call, not a variable for the child:
    # it used to sit inside rlang::env() and therefore had no effect.
    quiet = TRUE
  )

  cat(kid, sep = "\n")

  i <- i + 2
}
```


<!-- ## Associated features -->

<!-- Only the top `r params$ntop` genes are listed here, in order of decreasing association. -->

<!-- If a PC aligns well with an experimental variable of interest, the associated genes for that PC could merit follow-up. -->

<!-- ```{r highload} -->
<!-- TL <- as.data.frame(lapply (names(all_pca$highloads), function(pc) { -->
<!--   head(all_pca$highloads[[pc]], params$ntop) }) ) -->
<!-- names(TL) <- names(all_pca$highloads) -->

<!-- datatable(TL) -->
<!-- ``` -->


## Top dispersed features

PCA using the top `r params$topVars` dispersed genes (out of those that already meet the count thresholds).

```{r pcab}
# PCA on the log10 of the padded counts of the selected samples, restricted to
# the `topgenes2` feature set (top dispersed features) and excluding `lowcnt`.
# The count filters of do_pca() are switched off because the input is prefiltered.
# The parameter is spelled `loopVal`: `$` matching is case-sensitive, so
# `params$loopval` was NULL and the loop value never made it into the RDS name.
# The extension pattern is escaped and anchored so that only a trailing
# .txt/.tsv is replaced.
all_pcab <- do_pca(log10(counts[, samples]), covars, ntop = params$ntop,
                  topgenes= topgenes2, exclude=lowcnt,
                  minMean = 0, minSingle = 0,      # prefiltered
                  rds = file.path(params$outdir,
                                  sub("\\.(txt|tsv)$", paste0("_", params$loopVal, "_pcatop.RDS"), basename(params$name))))
```

### Selection overview

This shows the mean TPM (or RPM) value and standard deviation for the variable features that were selected for the correlation and PCA, against the total of variable features. Non-variable features are not shown, due to log-scaled axes.

```{r varmeanb}
# Overview figure of the features selected for the top-dispersed PCA run.
print(all_pcab$pvar1)
```

### Variance explained by PCs

There are different ways to decide which components to consider important. One is based on the idea of diminishing returns and selects components before the "elbow" in the cumulative variance. The second considers all components that explain at least 1%; this is the one used in this report. The third simply aims for a target cumulative variance explained, such as 80%.

```{r screeb}
# Scree figure: variance explained by the components of the top-dispersed PCA run.
print( all_pcab$pimp )
```


### Known variables

Highlighting the defined experimental variables against the principal components helps make sense of what the components are.

```{r pcsb, results = 'asis'}
# Emit one sub-section per entry of all_pcab$pc2d. Each entry holds two
# printable objects, which are rendered in their own child chunks so that each
# gets its own figure size.
# The heading number advances by 2 per entry — presumably each entry covers a
# pair of consecutive PCs; confirm against do_pca().
i <- 1
for (x in all_pcab$pc2d) {
  kid <- knitr::knit_child(
    text = c('#### `r paste("PC", a)`',
           '',
           '```{r, fig.height=10, fig.width=10}',
           'print(b)',
           '```',
           '',
           '```{r, fig.height=18, fig.width=13}',
           'print(c)',
           '```'),
    # The child only sees these three bindings (plus the enclosing environment).
    envir = rlang::env(a = i, b = x[[1]], c = x[[2]]),
    # `quiet` is an argument of the knit call, not a variable for the child:
    # it used to sit inside rlang::env() and therefore had no effect.
    quiet = TRUE
  )

  cat(kid, sep = "\n")

  i <- i + 2
}

```


<!-- ## Associated features -->

<!-- Only the top `r params$ntop` genes are listed here, in order of decreasing association. -->

<!-- If a PC aligns well with an experimental variable of interest, the associated genes for that PC could merit follow-up. -->

<!-- ```{r highloadb} -->
<!-- TL <- as.data.frame(lapply (names(all_pcab$highloads), function(pc) { -->
<!--   head(all_pcab$highloads[[pc]], params$ntop) }) ) -->
<!-- names(TL) <- names(all_pcab$highloads) -->

<!-- datatable(TL) -->
<!-- ``` -->


# Write plots to PDF

```{r pdf}
## Plots to PDF ##
# Write all correlation figures to one PDF in the output directory. The file
# name is the input name with a trailing txt/tsv replaced (the dot is kept) by
# "<loopVal>.correlations.pdf".
# Page size: scale up when many samples, but don't scale down when few.
# `params$covars` is only the path of the covariates file, so nrow() of it is
# NULL and the page never grew; the loaded `covars` table is used instead
# (assumes one row per sample — confirm against where `covars` is read).
pdf(file.path(params$outdir,
              sub("(txt|tsv)$", paste0(params$loopVal, ".correlations.pdf"), basename(params$name))),
    width=max(12, ceiling(nrow(covars) / 3)), height=max(12, ceiling(nrow(covars) / 3)))

# Correlation plots
# A blank page with a caption separates the two feature sets.
plot.new()
text(.5, .5, 'Correlation with all expressed genes')
print(corelsP[['sovf']])
print(corelsS[['sovf']])
print(corelsP[['scvf']])
print(corelsS[['scvf']])
print(corelsP[['bee']])
print(corelsS[['bee']])
plot.new()
text(.5, .5, 'Correlation with only the most variable genes')
print(corelsPb[['sovf']])
print(corelsSb[['sovf']])
print(corelsPb[['scvf']])
print(corelsSb[['scvf']])
print(corelsPb[['bee']])
print(corelsSb[['bee']])

dev.off()

# Write all PCA figures to one PDF in the output directory. The file name is the
# input name with a trailing txt/tsv replaced (the dot is kept) by
# "<loopVal>.pca.pdf".
# Page size: scale up when many samples, but don't scale down when few.
# `params$covars` is only the path of the covariates file, so nrow() of it is
# NULL and the page never grew; the loaded `covars` table is used instead
# (assumes one row per sample — confirm against where `covars` is read).
pdf(file.path(params$outdir,
              sub("(txt|tsv)$", paste0(params$loopVal, ".pca.pdf"), basename(params$name))),
    width=max(12, ceiling(nrow(covars) / 3)), height=max(12, ceiling(nrow(covars) / 3)))

# PCA plots
# A blank page with a caption separates the two feature sets.
plot.new()
text(.5, .5, 'PCA with all expressed genes')
print(all_pca$pimp)
for(x in all_pca$pc2d) {  # list of lists
  print(x[[1]])
  print(x[[2]])
}

plot.new()
text(.5, .5, 'PCA with only the most variable genes')
print(all_pcab$pimp)
# The entries of pc2d are printed one by one below; printing the whole list
# first duplicated every page, unlike the all-genes section above.
for(x in all_pcab$pc2d) { # list of lists
  print(x[[1]])
  print(x[[2]])
}

dev.off()
```

# Session Info

```{r}
# Record the R version and the loaded package versions in the report, for reproducibility.
sessionInfo()
```