Merge pull request #2 from jjsayleraxio/source-v1.0.0
add source files
Showing 11 changed files with 8,349 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,5 @@
Marine_pheno
Marine_raw
WTC_pheno
WTC_raw
diabetes_data: .Random.seed diabetes_data
@@ -0,0 +1,202 @@
## ---- include = FALSE----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  fig.width = 7,
  fig.height = 5,
  fig.path = 'Figs/',
  fig.align = "left",
  warning = FALSE,
  message = FALSE,
  comment = "#>"
)

## ----setup---------------------------------------------------------------
library(JTIMLmaster)
library(Biobase)
library(DESeq2)
library(edgeR)
library(ggplot2)
library(SmartSVA)
library(pheatmap)
library(data.table)
library(reshape2)   ## provides melt() for the data.frame reshaping below
library(plyr)
library(dplyr)
library(DT)
library(car)
library(randomForest)
library(glmnet)
library(caret)
library(e1071)
library(VSURF)
library(neuralnet)
library(gbm)
library(verification)
library(pROC)

## ----ML application------------------------------------------------------
diabetes_data$Outcome <- factor(diabetes_data$Outcome, levels = c("0", "1"))
## The data set is named diabetes_data
set.seed(415)
K <- 10 ## Number of bootstrap replicates
n.case <- which(diabetes_data$Outcome==1)
n.control <- which(diabetes_data$Outcome==0)

## Stratified bootstrap: resample cases and controls separately, with replacement
train_rows <- lapply(1:K, function(x){c(sample(n.case, length(n.case), replace = TRUE), sample(n.control, length(n.control), replace = TRUE))})

depVar <- "Outcome"

## glmnet mixing parameters: alpha = 1 is the LASSO penalty, alpha < 1 an elastic net
alphaLevel <- c(0.4, 0.7, 1)
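
## For orientation only: a minimal sketch of a single penalized-regression fit at one
## alpha value, using glmnet's cv.glmnet()/coef() directly. PR.boot.err.Func below is the
## JTIMLmaster wrapper and its internals are not shown in this file, so this is an
## assumption about the general approach, not the package's exact code. Assumes all
## predictors in diabetes_data are numeric.
x.sketch <- as.matrix(diabetes_data[, names(diabetes_data) != depVar])
y.sketch <- diabetes_data[[depVar]]
cv.sketch <- cv.glmnet(x.sketch, y.sketch, family = "binomial", alpha = 1)  ## alpha = 1 -> LASSO
coef(cv.sketch, s = "lambda.min")  ## non-zero rows are the selected variables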

ntree <- 100

### Random Forest ###
RFresult <- lapply(train_rows, function(x){RF.boot.err.Func(train=diabetes_data[x, ], test=diabetes_data[-x, ], depVar = depVar, ntree=ntree)})
names(RFresult) <- paste0("RF_iter", 1:K)
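
## A quick structural check (sketch, not part of the original script): the summaries
## further below assume each RF.boot.err.Func result is a list containing an `RF.err`
## matrix (rows AUC, Sensitivity, Specificity, Misclassification; column "overall") and
## an `importance_measure` data frame.
str(RFresult[["RF_iter1"]], max.level = 1)
stopifnot(all(c("RF.err", "importance_measure") %in% names(RFresult[["RF_iter1"]])))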

### Penalized Regression (Elastic Net / LASSO) ###
PRresult <- lapply(train_rows, function(x){PR.boot.err.Func(x.train=diabetes_data[x, (names(diabetes_data)!=depVar)], y.train=diabetes_data[x, depVar], x.test=diabetes_data[-x, (names(diabetes_data)!=depVar)], y.test=diabetes_data[-x, depVar], alphaLevel=alphaLevel, family="binomial", type="class")})
names(PRresult) <- paste0("PR_iter", 1:K)

### SVM ###
SVMresult <- lapply(train_rows, function(x){SVM.boot.err.Func(train=diabetes_data[x, ], test=diabetes_data[-x, ], depVar= depVar, kernel="radial", cost=5)})
names(SVMresult) <- paste0("SVM_iter", 1:K)

### Neural Network ###
NNresult <- lapply(train_rows, function(x){NN.boot.err.Func(train=diabetes_data[x, ], test=diabetes_data[-x, ], depVar= depVar, hidden=10)})
names(NNresult) <- paste0("NN_iter", 1:K)

### Gradient Boosting (gbm R package) ###
### Note: for unknown reasons the predicted values are all 0, so they cannot be compared to the true values.

GBMresult <- lapply(train_rows, function(x){GBM.boot.err.Func(train=diabetes_data[x, ], test=diabetes_data[-x, ], depVar= depVar, distribution="adaboost", n.trees = ntree)})
names(GBMresult) <- paste0("GBM_iter", 1:K)
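
## One possible cause of the all-zero predictions, offered as a hedged sketch only:
## gbm's "adaboost"/"bernoulli" distributions expect a numeric 0/1 response, so if the
## factor Outcome reaches gbm() unchanged the fit can degenerate. Whether
## GBM.boot.err.Func converts the response internally is not shown in this file.
diabetes_data_gbm <- diabetes_data
diabetes_data_gbm$Outcome <- as.numeric(as.character(diabetes_data_gbm$Outcome))  ## factor "0"/"1" -> numeric 0/1
GBMresult_num <- lapply(train_rows, function(x){GBM.boot.err.Func(train=diabetes_data_gbm[x, ], test=diabetes_data_gbm[-x, ], depVar= depVar, distribution="adaboost", n.trees = ntree)})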

## ----RF result summary---------------------------------------------------
RF.err.measure <- data.frame(AUC=unlist(lapply(RFresult, function(x){x$RF.err[1,"overall"]})),
                             Sensitivity= unlist(lapply(RFresult, function(x){x$RF.err[2,"overall"]})),
                             Specificity= unlist(lapply(RFresult, function(x){x$RF.err[3,"overall"]})),
                             Misclassification= unlist(lapply(RFresult, function(x){x$RF.err[4,"overall"]})))
RF.err.measure.melt <- melt(RF.err.measure)
names(RF.err.measure.melt) <- c("accuracy.measure", "value")
RF.err.measure.melt$accuracy.measure <- factor(RF.err.measure.melt$accuracy.measure, levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))

ggplot(RF.err.measure.melt) + geom_boxplot(aes(x=accuracy.measure, y=value*100, fill=accuracy.measure)) + ggtitle(paste0("Random Forest (K=", K, ", ntree=", ntree, ")")) + ylab("percentage(%)")

## ----Distribution of accuracy measures-----------------------------------
tempD <- ddply(RF.err.measure.melt, ~ accuracy.measure, summarise, mean= round(mean(value), digits = 4), sd=round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD, caption = paste0("Mean(SD) of Overall Accuracy Measures using RF (K=", K, ")"), rownames = FALSE)

## ----importance measures for RF------------------------------------------
all.important.measure <- lapply(RFresult, function(x){x$importance_measure[order(rownames(x$importance_measure)),]})
all.important.measure.DF <- do.call("cbind", all.important.measure)

all.important.measure.mean.decrease.acc.mean <- apply(all.important.measure.DF[, grep("Mean.Decrease.Accuracy", names(all.important.measure.DF))], 1, mean)
all.important.measure.mean.Decrease.Gini.mean <- apply(all.important.measure.DF[, grep("Mean.Decrease.Gini", names(all.important.measure.DF))], 1, mean)

important_measure_mean <- data.frame(Mean.Decrease.Accuracy=round(all.important.measure.mean.decrease.acc.mean, 4), Mean.Decrease.Gini=round(all.important.measure.mean.Decrease.Gini.mean, 4))
important_measure_mean <- important_measure_mean[order(important_measure_mean$Mean.Decrease.Accuracy, decreasing = TRUE),]

DT::datatable(important_measure_mean, caption = paste0("Mean of Importance Measures from Random Forest (K=", K, ")"))

## ----PR result, fig.height=6, fig.width=12-------------------------------
## For each iteration, the Penalized Regression run returns a matrix of error measures by model (PR.err.vect)
## 1. Combine them across all K iterations. This gives a Model x K matrix for each error measure
## 2. Plot the error measures AUC, Sensitivity, Specificity, and Misclassification, faceted by Model
PR.err.measure <- list(AUC=do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["AUC.overall",]})),
                       Sensitivity= do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["sensitivity.overall",]})),
                       Specificity= do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["specificity.overall",]})),
                       Misclassification= do.call("rbind", lapply(PRresult, function(x){x$PR.err.vect["misclassification.overall",]})))

PR.err.measure <- lapply(PR.err.measure, function(x){as.data.frame(t(x), row.names = c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)"))})

PR.err.measure.DF <- do.call("cbind", PR.err.measure)
PR.err.measure.DF$Model <- rownames(PR.err.measure.DF)
PR.err.measure.DF.melt <- melt(PR.err.measure.DF, id="Model")
PR.err.measure.DF.melt$variable <- factor(unlist(lapply(strsplit(as.character(PR.err.measure.DF.melt$variable), "[.]"), function(x){x[[1]]})), levels=c("AUC", "Sensitivity", "Specificity", "Misclassification"))
PR.err.measure.DF.melt$Model <- factor(PR.err.measure.DF.melt$Model, levels=c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)"))

ggplot(PR.err.measure.DF.melt) + geom_boxplot(aes(x=variable, y=value*100, fill=variable)) + facet_grid(~ Model) + ylab("percentage(%)") + ggtitle(paste0("Penalized Regressions (K=", K, ", Lasso/EN)"))

## 3. Mean and SD of the error measures for each Model
PR.err.measure.mean <- do.call("cbind", lapply(PR.err.measure, function(x){paste0(round(apply(x, 1, mean), 4), " (sd=", round(apply(x, 1, sd), 4), ")")}))
rownames(PR.err.measure.mean) <- c("Elastic Net(alpha=0.4)", "Elastic Net(alpha=0.7)", "LASSO(alpha=1)")
DT::datatable(PR.err.measure.mean, caption = paste0("Mean (SD) of Accuracy Measures using Penalized Regression (K=", K, ")"), rownames = TRUE)

## ----Non zero coeffi for PR serum----------------------------------------
## 4. Extract the non-zero coefficients and count how many times each variable was selected
## For each model (alpha level), order the selected (non-zero coefficient) variables by their counts.
PR.non0coeff <- list(alphaLevel_04=lapply(PRresult, function(x){x$non0coeff[[1]]}),
                     alphaLevel_07=lapply(PRresult, function(x){x$non0coeff[[2]]}),
                     alphaLevel_1=lapply(PRresult, function(x){x$non0coeff[[3]]}))
PR.non0coeff_variables <- list(alphaLevel_04=lapply(PRresult, function(x){names(x$non0coeff[[1]])}),
                               alphaLevel_07=lapply(PRresult, function(x){names(x$non0coeff[[2]])}),
                               alphaLevel_1=lapply(PRresult, function(x){names(x$non0coeff[[3]])}))
union_non0coeff <- data.frame(Variables=unique(unlist(lapply(PR.non0coeff_variables, function(x){unlist(x)}))))

union.all.non0coeff <- lapply(PR.non0coeff_variables, function(x){table(unlist(x))})
union.all.non0coeff <- lapply(union.all.non0coeff, function(x){merge(union_non0coeff, data.frame(Variables= names(x), Counts=as.vector(x)), by="Variables", all.x=TRUE)})
#union.all.non0coeff <- lapply(union.all.non0coeff, function(x){x[order(x$Counts, decreasing=TRUE),]})

union.all.non0coeff_all <- merge(merge(union.all.non0coeff[[1]], union.all.non0coeff[[2]], by="Variables"), union.all.non0coeff[[3]], by="Variables")
names(union.all.non0coeff_all) <- c("Variables", "alphaLevel=0.4", "alphaLevel=0.7", "alphaLevel=1")
union.all.non0coeff_all <- union.all.non0coeff_all[order(union.all.non0coeff_all$`alphaLevel=0.4`, decreasing = TRUE),]
union.all.non0coeff_all$Variables <- as.character(union.all.non0coeff_all$Variables)

DT::datatable(union.all.non0coeff_all[-1,], rownames = FALSE, caption = paste0("Selected Variables and their counts for each alpha (K=", K, ")"))

## Select top-ranked genes selected in at least 70% of the K iterations (the code below uses the LASSO, alphaLevel=1, counts) ##
## Estimate these genes' coefficients in a logistic model and then use them as a polygenic score for the new test data (Marine data); see the sketch below
polygenicGenes <- union.all.non0coeff_all$Variables[which(union.all.non0coeff_all$`alphaLevel=1`>=round(0.7*K))]
polygenicGenes <- polygenicGenes[polygenicGenes!="(Intercept)"]
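
## A minimal sketch of the scoring step described above (not part of the original file):
## fit a logistic model on the selected genes in the training data and use its linear
## predictor as the polygenic score on new data. `Marine_data` is a placeholder name for
## the new test set and is assumed, not defined in this script.
ps_formula <- as.formula(paste(depVar, "~", paste(polygenicGenes, collapse = " + ")))
ps_fit <- glm(ps_formula, data = diabetes_data, family = binomial)
# Marine_score <- predict(ps_fit, newdata = Marine_data, type = "link")  ## polygenic score = linear predictor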

## ----SVM result summary--------------------------------------------------
SVM.err.measure <- data.frame(AUC=unlist(lapply(SVMresult, function(x){x$SVM.err[1,"overall"]})),
                              Sensitivity= unlist(lapply(SVMresult, function(x){x$SVM.err[2,"overall"]})),
                              Specificity= unlist(lapply(SVMresult, function(x){x$SVM.err[3,"overall"]})),
                              Misclassification= unlist(lapply(SVMresult, function(x){x$SVM.err[4,"overall"]})))
SVM.err.measure.melt <- melt(SVM.err.measure)
names(SVM.err.measure.melt) <- c("accuracy.measure", "value")
SVM.err.measure.melt$accuracy.measure <- factor(SVM.err.measure.melt$accuracy.measure, levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))

ggplot(SVM.err.measure.melt) + geom_boxplot(aes(x=accuracy.measure, y=value*100, fill=accuracy.measure)) + ggtitle(paste0("Support Vector Machine (K=", K, ", Radial Kernel)")) + ylab("percentage(%)")

## ----SVM accuracy measures-----------------------------------------------
tempD <- ddply(SVM.err.measure.melt, ~ accuracy.measure, summarise, mean= round(mean(value), digits = 4), sd=round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD, caption = paste0("Mean(SD) of Overall Accuracy Measures using SVM (K=", K, ")"), rownames = FALSE)

## ----Neural Network result summary---------------------------------------
NN.err.measure <- data.frame(AUC=unlist(lapply(NNresult, function(x){x$NN.err[1,"overall"]})),
                             Sensitivity= unlist(lapply(NNresult, function(x){x$NN.err[2,"overall"]})),
                             Specificity= unlist(lapply(NNresult, function(x){x$NN.err[3,"overall"]})),
                             Misclassification= unlist(lapply(NNresult, function(x){x$NN.err[4,"overall"]})))
NN.err.measure.melt <- melt(NN.err.measure)
names(NN.err.measure.melt) <- c("accuracy.measure", "value")
NN.err.measure.melt$accuracy.measure <- factor(NN.err.measure.melt$accuracy.measure, levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))

ggplot(NN.err.measure.melt) + geom_boxplot(aes(x=accuracy.measure, y=value*100, fill=accuracy.measure)) + ggtitle(paste0("Neural Network (K=", K, ", hidden=10)")) + ylab("percentage(%)")

## ----NN accuracy measures-----------------------------------------------
tempD <- ddply(NN.err.measure.melt, ~ accuracy.measure, summarise, mean= round(mean(value), digits = 4), sd=round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD, caption = paste0("Mean(SD) of Overall Accuracy Measures using Neural Network (K=", K, ")"), rownames = FALSE)

## ----Gradient Boosting result summary------------------------------------
GBM.err.measure <- data.frame(AUC=unlist(lapply(GBMresult, function(x){x$GBM.err[1,"overall"]})),
                              Sensitivity= unlist(lapply(GBMresult, function(x){x$GBM.err[2,"overall"]})),
                              Specificity= unlist(lapply(GBMresult, function(x){x$GBM.err[3,"overall"]})),
                              Misclassification= unlist(lapply(GBMresult, function(x){x$GBM.err[4,"overall"]})))
GBM.err.measure.melt <- melt(GBM.err.measure)
names(GBM.err.measure.melt) <- c("accuracy.measure", "value")
GBM.err.measure.melt$accuracy.measure <- factor(GBM.err.measure.melt$accuracy.measure, levels = c("AUC", "Sensitivity", "Specificity", "Misclassification"))

ggplot(GBM.err.measure.melt) + geom_boxplot(aes(x=accuracy.measure, y=value*100, fill=accuracy.measure)) + ggtitle(paste0("Gradient Boosting (K=", K, ")")) + ylab("percentage(%)")

## ----GBM accuracy measures----------------------------------------------
tempD <- ddply(GBM.err.measure.melt, ~ accuracy.measure, summarise, mean= round(mean(value), digits = 4), sd=round(sd(value), digits = 4))
tempD$value <- paste0(tempD$mean, " (sd=", tempD$sd, ")")
tempD$mean <- tempD$sd <- NULL
DT::datatable(tempD, caption = paste0("Mean(SD) of Overall Accuracy Measures using GBM (K=", K, ")"), rownames = FALSE)