# selectNormalizeGeneCounts_CEA_pub.R

library(matrixStats)      
library(edgeR)
library(WGCNA)
library(biomaRt)
library(annotables)
library(dplyr)
library(readxl)
library(car)

library(foreach)
library(doMC)
registerDoMC()
library(proxy)
library(sgof)
library(multtest)
library(lawstat)

# Report the number of registered foreach workers before and after the
# "cores" option is set.
# NOTE(review): doMC is documented to consult options("cores") when
# registerDoMC() was called without an explicit core count - confirm that 36
# is appropriate for the machine this runs on.
getDoParWorkers()
options(cores = 36)
getDoParWorkers()

# linux does not work with box so this code has a local data directory

setwd("./")

source("functionDefinitions.R")

# Create the output directory only when it is missing. dir.create() reports an
# existing directory as a warning (not an error), so wrapping it in
# try(silent = T) did not keep re-runs quiet; a genuine creation failure still
# warns here.
if (!dir.exists("resultsCoexpr")) {
  dir.create("resultsCoexpr")
}

# Read the tab-separated count table (header row present). The first column
# supplies the row names; columns 1-6 are kept as per-gene information and
# every remaining column is treated as one sample's read counts.
geneReadsOriginal <- read.table(
  "RNASeq019_CEA_featureCounts_results_with_EnsemblID.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = "\t"
)
rownames(geneReadsOriginal) <- geneReadsOriginal[, 1]

geneInfo <- geneReadsOriginal[, 1:6]
geneReadsRaw <- geneReadsOriginal[, 7:ncol(geneReadsOriginal)]

geneENS <- rownames(geneReadsOriginal)

colnames(geneReadsRaw)

# Let's simplify sample names: split each column name on "_", keep the third
# piece and prefix it with "S".
splitSampleIDs <- setNames(
  strsplit(
    colnames(geneReadsRaw),
    split = "_", fixed = FALSE, perl = FALSE, useBytes = FALSE
  ),
  colnames(geneReadsRaw)
)
sampleNumberPL <- vapply(
  splitSampleIDs,
  function(pieces) pieces[[3]],
  character(1)
)
sampleNumberPL <- paste0("S", sampleNumberPL)
colnames(geneReadsRaw) <- sampleNumberPL

#################################################################################
# remove outliers
# First screening pass: samples are compared through their pairwise Pearson
# correlations (the inter-array correlation, IAC).

IAC <- cor(geneReadsRaw, method = "pearson")
hist(IAC, sub = paste("Mean=", format(mean(IAC[upper.tri(IAC)]), digits = 3)))

# IAC should ideally be 0.8 or more - lower IACs could be outliers

cluster1 <- hclust(as.dist(1 - IAC), method = "average")
plot(cluster1, cex = 0.7, labels = colnames(geneReadsRaw))
##quartz.save("figuresSelect/IACrawDataClust.png", type="png", bg="white")

# Cut-off: samples whose mean IAC lies more than 3 standard deviations below
# the average mean IAC are considered outliers.
sdout <- -3

# Mean correlation of each sample with all samples, expressed as the number of
# standard deviations away from the average of those means.
meanIAC <- apply(IAC, 2, mean)
sdCorr <- sd(meanIAC)
numbersd <- (meanIAC - mean(meanIAC)) / sdCorr
plot(numbersd, ylim = c(-5, 2), main = "Inter Sample Correlations \nnormalized")
abline(h = sdout)

#quartz.save("figuresSelect/meanIACrawData.png", type="png", bg="white")

outliers <- colnames(geneReadsRaw)[numbersd < sdout]
outliers

# Keep only the samples that were not flagged.
geneReadsRaw <- geneReadsRaw[, setdiff(colnames(geneReadsRaw), outliers)]

# NOTE: While there is no sample that is clearly an outlier, the cluster plot above displays
# clear clustering of a set of samples (all female). Here we take a look at the gene counts
# to see if there are genes that show abnormally high or low counts.
#################################################################################################
# summary() of every sample column, arranged with one row per sample.
stats.per.sample <- data.frame(
  t(
    do.call(cbind, lapply(geneReadsRaw, summary))
  )
)
head(stats.per.sample)

barplot(
  stats.per.sample$Max.,
  names.arg = row.names(stats.per.sample),
  las = 2,
  main = "Maximum gene counts found per sample)"
)

# For a hand-picked set of samples, show which gene carries the largest count.
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S778"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S718"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S789"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S814"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S799"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S728"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S710"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S714"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S859"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S708"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S716"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S712"])]
rownames(geneReadsRaw)[which.max(geneReadsRaw[, "S727"])]

# Drop that one gene from both the count table and the gene ID vector so the
# two stay aligned row-for-row.
geneReadsRaw <- geneReadsRaw[rownames(geneReadsRaw) != "ENSMUSG00000083563", ]
geneENS <- geneENS[geneENS != "ENSMUSG00000083563"]

# Now let's run the sample outlier detection again
#################################################################################
# remove outliers
# Second screening pass, repeated on the count table after the gene removal
# above; same procedure and cut-off as the first pass.

IAC <- cor(geneReadsRaw, method = "pearson")
hist(IAC, sub = paste("Mean=", format(mean(IAC[upper.tri(IAC)]), digits = 3)))

cluster1 <- hclust(as.dist(1 - IAC), method = "average")
plot(cluster1, cex = 0.7, labels = colnames(geneReadsRaw))

# Cut-off in standard deviations below the average mean IAC.
sdout <- -3

meanIAC <- apply(IAC, 2, mean)
sdCorr <- sd(meanIAC)
numbersd <- (meanIAC - mean(meanIAC)) / sdCorr
plot(numbersd, ylim = c(-5, 2), main = "Inter Sample Correlations \nnormalized")
abline(h = sdout)

# Note: this overwrites the outlier list produced by the first pass.
outliers <- colnames(geneReadsRaw)[numbersd < sdout]
outliers

geneReadsRaw <- geneReadsRaw[, setdiff(colnames(geneReadsRaw), outliers)]

# NOTE: Samples 801 and 825 were removed

#################################################################################################
# Load the strain information for each sample
sample.data <- read.table(
  "CEA_Phenotype.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Use the same "S<number>" naming as the columns of the count table.
sample.data$LabID <- paste0("S", sample.data$LabID)

# Keep only the samples that are still present in the count table. `outliers`
# holds just the result of the most recent screening pass, so filtering on it
# alone would leave samples removed in an earlier pass in this table and make
# the later column selection by sample name fail.
sample.data <- sample.data[sample.data$LabID %in% colnames(geneReadsRaw), ]

# Sample IDs per group. %in% (rather than ==) keeps rows with a missing Group
# from turning into NA sample names.
# NOTE(review): column 2 is assumed to be the LabID column - confirm against
# the layout of CEA_Phenotype.txt.
samplesHigh <- sample.data[sample.data$Group %in% "H", 2]
samplesLow <- sample.data[sample.data$Group %in% "L", 2]


##################################################################################################################
# Query the Ensembl mouse gene dataset through biomaRt for gene ID, gene name,
# transcript length and Entrez ID, bypassing the local biomaRt cache.
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")

transcriptInfoMouse <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "external_gene_name",
    "transcript_length",
    "entrezgene_id"
  ),
  mart = mart,
  useCache = FALSE
)

# Just make sure ... (spot-check one known gene)
transcriptInfoMouse[transcriptInfoMouse$external_gene_name == "Drd1", ]

# Keep a local copy of the annotation table.
save(transcriptInfoMouse, file = "transcriptInfoMouse.RData")

##################################################################################################################
# divide the data in different groups and find genes with > 1 CPM

H <- geneReadsRaw[, samplesHigh]
L <- geneReadsRaw[, samplesLow]

# Group label per column of cbind(H, L): all "H" samples first, then all "L".
groupSelection <- factor(rep(c("H", "L"), times = c(ncol(H), ncol(L))))

################################################################
# EDGER
################################################################
# Common then tagwise dispersion estimates, followed by the exact test between
# the two groups using the tagwise dispersions.
d <- DGEList(counts = cbind(H, L), group = groupSelection)
d <- estimateTagwiseDisp(estimateCommonDisp(d))
de.tgw <- exactTest(d, dispersion = "tagwise")

# Genes with logCPM > 0; relies on geneENS being in the same order as the rows
# of the count table.
geneENSHighCPM <- geneENS[de.tgw$table$logCPM > 0]

# calculate edgeR normalization factors and normalize the data
UQnormFactors <- calcNormFactors(geneReadsRaw, method = c("upperquartile"))

# Effective library size per sample, and the factor that rescales each sample
# to the mean effective library size.
effectiveLibrarySizes <- UQnormFactors * colSums(geneReadsRaw)
meanEffLibSize <- mean(effectiveLibrarySizes)
countNormFactor <- meanEffLibSize / effectiveLibrarySizes

# Multiply every sample column by its own factor; the result is a numeric
# matrix with the same row and column names as the count table.
normalizedGeneCountsUQ <- sweep(
  as.matrix(geneReadsRaw),
  2,
  countNormFactor,
  "*"
)

# verify that smaller libraries are multiplied by bigger normalization factors
plot(
  colSums(geneReadsRaw), countNormFactor,
  xlab = "Un-normalized library sizes", ylab = "Normalization factors"
)

# plot original library sizes versus normalized
# range of data should be much smaller in normalized data
xylim <- range(colSums(geneReadsRaw), colSums(normalizedGeneCountsUQ))
plot(
  colSums(geneReadsRaw), colSums(normalizedGeneCountsUQ),
  xlim = xylim, ylim = xylim
)

# interquartile range of normalized libraries should be much smaller
IQR(colSums(geneReadsRaw))
IQR(colSums(normalizedGeneCountsUQ))

########################################################################################
# do differential expression on high CPM genes
# Exact-test p-values, labelled by gene ID and restricted to the high-CPM set.
pValues_DE <- setNames(de.tgw$table$PValue, geneENS)
rownames(de.tgw$table)
pValues_DE_highCPM <- pValues_DE[geneENSHighCPM]

#adjPvalues_DE_highCPM_Object=mt.rawp2adjp(pValues_DE_highCPM, proc="BH", alpha = 0.05, na.rm = FALSE)
#adjPvalues_DE_highCPM=adjPvalues_DE_highCPM_Object$adjp[,"BH"]
#names(adjPvalues_DE_highCPM)=geneENSHighCPM[adjPvalues_DE_highCPM_Object$index]

# Gene IDs ordered by ascending raw p-value.
sortIndexes <- sort.int(
  pValues_DE_highCPM,
  decreasing = FALSE, index.return = TRUE
)$ix
sortedGeneNames <- geneENSHighCPM[sortIndexes]

adjustedResults <- SGoF(u = pValues_DE_highCPM)
summary(adjustedResults)

# The adjusted p-values are labelled with the p-value-sorted gene IDs.
# NOTE(review): this presumes SGoF() returns Adjusted.pvalues in ascending
# raw-p-value order - confirm against the sgof documentation.
sortedAdjustedPvals_DE <- setNames(
  adjustedResults$Adjusted.pvalues,
  sortedGeneNames
)

sum(sortedAdjustedPvals_DE <= 0.05)

# Start the summary file afresh ("wt") and record the DE count.
fileConnSummary <- file("resultsCoexpr/SummaryResultsCoexpr.txt", open = "wt")

writeLines(
  paste(
    "Number of genes with 1 CPM or more that are DE at FDR=0.05: ",
    sum(sortedAdjustedPvals_DE <= 0.05),
    sep = " "
  ),
  fileConnSummary
)
close(fileConnSummary)

# some sanity checks

plot(pValues_DE_highCPM, sortedAdjustedPvals_DE[geneENSHighCPM])

###################################################################################
# select genes with logCPM > 0 (equivalent to CPM>1)
# Normalized counts of the high-CPM genes, split by sample group.

H_norm <- normalizedGeneCountsUQ[geneENSHighCPM, samplesHigh]
L_norm <- normalizedGeneCountsUQ[geneENSHighCPM, samplesLow]

# Per-gene mean of the normalized counts within each group.
meanCounts_H <- rowMeans(H_norm)
meanCounts_L <- rowMeans(L_norm)

# Per-gene standard deviation of the normalized counts within each group.
sdCounts_H <- apply(H_norm, 1, sd)
sdCounts_L <- apply(L_norm, 1, sd)

##############################################################################
# find differentially variable genes
# For every high-CPM gene, run Levene's test on its normalized counts across
# the two groups and keep the first p-value of the result table. The counts
# are concatenated H first, then L, matching the order of groupSelection.

pvalVarLevene <- vapply(
  geneENSHighCPM,
  function(gene) {
    geneCounts <- unlist(c(H_norm[gene, ], L_norm[gene, ]))
    leveneTest(y = geneCounts, group = groupSelection)$`Pr(>F)`[1]
  },
  numeric(1)
)

pValues_DV_highCPM <- pvalVarLevene

# Gene IDs ordered by ascending raw p-value.
sortIndexes <- sort.int(
  pValues_DV_highCPM,
  decreasing = FALSE, index.return = TRUE
)$ix
sortedGeneNames <- geneENSHighCPM[sortIndexes]

adjustedResults <- SGoF(u = pValues_DV_highCPM)
summary(adjustedResults)

# NOTE(review): as for the DE results, this presumes SGoF() returns
# Adjusted.pvalues in ascending raw-p-value order - confirm.
sortedAdjustedPvals_DV <- setNames(
  adjustedResults$Adjusted.pvalues,
  sortedGeneNames
)

sum(sortedAdjustedPvals_DV <= 0.05)

# Append ("at") the DV count to the summary file.
fileConnSummary <- file("resultsCoexpr/SummaryResultsCoexpr.txt", open = "at")

writeLines(
  paste(
    "Number of genes with 1 CPM or more that are DV at FDR=0.05 (using SGoF): ",
    sum(sortedAdjustedPvals_DV <= 0.05),
    sep = " "
  ),
  fileConnSummary
)
close(fileConnSummary)

# some sanity checks

plot(pValues_DV_highCPM, sortedAdjustedPvals_DV[geneENSHighCPM])

# need to discuss DV results if number of DV genes is large (>5000)
###########################################################################
# now collect geneNames

#load("transcriptInfoMouse.RData")
# Annotation rows for the high-CPM genes, reduced to the first row per Ensembl
# gene ID and indexed by that ID.
transcriptInfoMouse_present <- transcriptInfoMouse[
  transcriptInfoMouse$ensembl_gene_id %in% geneENSHighCPM,
]
transcriptInfoMouse_uniqueENS <- transcriptInfoMouse_present[
  !duplicated(transcriptInfoMouse_present$ensembl_gene_id),
]
rownames(transcriptInfoMouse_uniqueENS) <-
  transcriptInfoMouse_uniqueENS$ensembl_gene_id

# Gene names that occur under more than one Ensembl ID: how many, and which.
duplicatedGeneIdxs <- which(
  duplicated(transcriptInfoMouse_uniqueENS$external_gene_name)
)
length(duplicatedGeneIdxs)

duplicatedGeneNames <-
  transcriptInfoMouse_uniqueENS$external_gene_name[duplicatedGeneIdxs]

# High-CPM gene IDs that are missing from each annotation source.
setdiff(geneENSHighCPM, rownames(transcriptInfoMouse_uniqueENS))
setdiff(geneENSHighCPM, transcriptInfoMouse$ensembl_gene_id)
setdiff(geneENSHighCPM, pull(grcm38, ensgene))

# Genes present in both the annotation and the high-CPM set, in annotation
# order.
commonENS <- intersect(rownames(transcriptInfoMouse_uniqueENS), geneENSHighCPM)

transcriptInfoMouseEnsGene <- transcriptInfoMouse_uniqueENS[commonENS, ]

# The annotation table is already restricted to commonENS, in that order.
geneEnsTable <- transcriptInfoMouseEnsGene

# Restrict the normalized counts to the annotated genes.
H_norm <- H_norm[commonENS, ]
L_norm <- L_norm[commonENS, ]

geneEnsNameSHighCPM <- geneEnsTable$ensembl_gene_id
geneNameSHighCPM <- geneEnsTable$external_gene_name


###########################################################################
# Assemble the per-gene results for the annotated high-CPM genes: edgeR table,
# adjusted DE p-value, group means and standard deviations, raw and adjusted
# DV p-values. Numeric columns are rounded to 3 decimals before the gene
# symbol is prepended.
results_highCPMgenes <- cbind(
  de.tgw$table[commonENS, ],
  sortedAdjustedPvals_DE[commonENS],
  meanCounts_L[commonENS],
  meanCounts_H[commonENS],
  sdCounts_L[commonENS],
  sdCounts_H[commonENS],
  pValues_DV_highCPM[commonENS],
  sortedAdjustedPvals_DV[commonENS]
)
results_highCPMgenes <- round(results_highCPMgenes, 3)
results_highCPMgenes <- cbind(
  transcriptInfoMouseEnsGene[, "external_gene_name"],
  results_highCPMgenes
)
colnames(results_highCPMgenes) <- c(
  "gene_symbol",
  "logFC", "logCPM", "p_val_DE", "adj_p_DE",
  "mean_counts_L", "mean_counts_H",
  "sd_L", "sd_H",
  "p_val_DV", "adj_p_DV"
)

#this will be collected in Supplemental Table 1
write.csv(results_highCPMgenes, file = "resultsCoexpr/resultsDEDV_highCPM.csv")

#######################################################################################
# Unsigned adjacency (power 6) computed separately for each group, with genes
# as the columns of the transposed count matrices.

adjCoexpr_H <- adjacency(t(H_norm), type = "unsigned", power = 6)
adjCoexpr_L <- adjacency(t(L_norm), type = "unsigned", power = 6)

# the consensus network adjacency is the MAX of the two networks adjacencies!!!!!!!
adjCoexpr <- pmax(adjCoexpr_H, adjCoexpr_L)
adjCoexpr[is.na(adjCoexpr)] <- 0
# Every gene is fully connected to itself.
diag(adjCoexpr) <- 1

#sanity check, should be 0
sum(adjCoexpr < 0, na.rm = TRUE)

# Connectivity of a gene: sum of its adjacencies, excluding the self-adjacency.
connCoexpr <- rowSums(adjCoexpr, na.rm = TRUE) - 1

connCoexpr[is.na(connCoexpr)] <- 0
sortedConn <- sort(connCoexpr, decreasing = TRUE)
sum(connCoexpr < 0, na.rm = TRUE)

# Fraction of the total connectivity captured by the k most connected genes,
# for every k. cumsum() yields the running totals in a single pass (the
# previous loop re-summed the first i elements on every iteration) and keeps
# the gene names.
totalConn <- sum(sortedConn)
# Without positive total connectivity the fractions are undefined and no
# cut-off gene can be found; fail here with a clear message.
stopifnot(is.finite(totalConn), totalConn > 0)
cumulativeConnFraction <- cumsum(sortedConn) / totalConn

# Smallest number of top-ranked genes whose cumulative share exceeds 90%.
lastGene <- min(which(cumulativeConnFraction > .9))

highConnGenes <- names(sortedConn)[seq_len(lastGene)]

plot(cumulativeConnFraction, xlab = "", ylab = "")
title(xlab = "Number of genes \n ranked by connectivity", cex.lab = 1.25, font.lab = 2)
title(ylab = "Fraction total connectivity", cex.lab = 1.25, font.lab = 2)
abline(v = lastGene)
abline(h = 0.9)
# NOTE(review): the label positions below are fixed x coordinates; they may
# fall outside the plot if the number of genes differs much from the run they
# were tuned for.
text(x = 8100, y = 0.8, labels = paste0(lastGene, " genes"))
text(x = 3100, y = 0.95, labels = "90% connectivity captured")
title(main = "Selecting the network size\n starting from genes with > 1 CPM", cex.lab = 1.5, font.lab = 2)

#############################################################################################################
# Adjacency restricted to the highly connected genes (rows and columns).
adjCoexprHighConn <- adjCoexpr[highConnGenes, highConnGenes]

# Normalized counts of both groups side by side, H columns first.
selectedGeneCounts <- cbind(H_norm, L_norm)

# Persist the objects used by the downstream steps.
save(
  list = c(
    "selectedGeneCounts",
    "adjCoexprHighConn",
    "samplesHigh",
    "samplesLow",
    "geneEnsTable"
  ),
  file = "resultsCoexpr/selectedData.RData"
)

###################################################################################