-
Notifications
You must be signed in to change notification settings - Fork 21
Expand file tree
/
Copy pathclassifier_training.rmd
More file actions
182 lines (130 loc) · 5.2 KB
/
classifier_training.rmd
File metadata and controls
182 lines (130 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
---
title: "Immune Subtype Classification"
output: html_document
---
Here we'll be using the new robencla package to train a classifier and do some experiments.
```{r}
devtools::install_github('gibbsdavidl/robencla',force=T)
```
OK then
```{r}
library(robencla)
library(readr)
```
Load in our data and gene features
```{r}
load("E:/Work/GBM_clusters/ImmuneSubtypeClassifier/data/geneSetSymbols.rda")
load("E:/Work/GBM_clusters/ImmuneSubtypeClassifier/data/modelgenes.rda")
load("E:/Work/GBM_clusters/ImmuneSubtypeClassifier/data/ebpp_gene.rda")
label_table <- read_table('inst/data/five_signature_mclust_ensemble_results.tsv.gz')
label_table$ClusterModel1 <- as.character(label_table$ClusterModel1)
```
Load in the expression data:
```{r}
dat <- read_csv("E:/Work/iatlas/Subtype_Classifier_Training_Dir/data/EBpp_pancancer.csv.gz")
# let's randomize columns with zero variance
idx <- which(apply(dat, 2, var) == 0)
for (i in idx) {
dat[,i] <- rnorm(n=9129)
}
#dat$AliquotBarcode <- label_table$AliquotBarcode
dat$Class <- sapply(as.character(label_table$ClusterModel1), FUN = function(A) paste0("C",A))
```
Let's make a mini feature set:
```{r}
# pick out genes that are in the data
genes <- colnames(dat)[colnames(dat) %in% ebpp_genes_sig$Symbol]
pair_features <- list()
for (ci in 1:6) {
print(ci)
gs <- sample(genes, size = 10, replace = F)
pair_features[[ci]] <- gs
}
names(pair_features) <- c('C1','C2','C3','C4','C5','C6')
sig_features <- list()
for (li in names(genesetsymbols)) {
print(li)
gs <- genesetsymbols[[li]][genesetsymbols[[li]] %in% colnames(dat)]
sig_features[[li]] <- sample(gs, size = 10, replace = F)
}
```
```{r}
# XGBoost parameters to pass to each sub-classifier in the ensembles
params <- list(
max_depth=12, # "height" of the tree, 6 is actually default. I think about 12 seems better. (xgboost parameter)
eta=0.3, # this is the learning rate. smaller values slow it down, more conservative (xgboost parameter)
nrounds=24, # number of rounds of training, lower numbers less overfitting (potentially) (xgboost parameter)
early_stopping_rounds=2, # number of rounds without improvment stops the training (xgboost early_stopping_rounds)
nthreads=4, # parallel threads
gamma=0.0, # min loss required to again split a leaf node. higher ~ more conservative (xgboost parameter)
lambda=1.0, # L2 regularization term on weights, higher number ~ more conservative (xgboost parameter)
alpha=0.0, # L1 regularization term on weights. higher number ~ more conservative (xgboost parameter)
size=21, # Size of the ensemble, per binary prediction
sample_prop=0.8, # The percentage of data used to train each ensemble member.
feature_prop=0.8, # The percentage of data used to train each ensemble member.
subsample=0.8, # the xgboost machines subsample at this rate.
combine_function='median', # How the ensemble should be combined. Only median currently.
verbose=0)
```
```{r}
mod <- Robencla$new("Test1")
mod$version
mod$train(data_frame = dat,
label_name = 'Class',
sample_id = 'Barcode',
data_mode = 'pairs',
pair_list = pair_features,
params = params
)
mod$predict(data_frame = dat,
label_name = 'Class',
sample_id = 'Barcode')
df <- mod$results(include_label = T)
# create a confusion matrix
table(Label=df$Label, Pred=df$BestCalls)
# Prediction metrics on the test set predictions
#mod$classification_metrics(use_cv_results = F)
# Get the importance of features in each ensemble member
# these are found by combining the importance measures across the ensemble
#mod$importance()
# Plot the ROC curves for each classpar(mar=c(3,3,3,3))
#plot_roc(mod, 'C3')
#ensemble_rocs(mod)
#plot_pred_final(mod, cluster = T)
colnames(mod$ensbl[[1]]$train_data)
```
```{r}
mod$autocv(
# The data to use for training
data_frame=dat,
# The name of the column containing the training label
label_name='Class',
# The column containing the sample ID
sample_id = 'Barcode',
# The number of cross validation folds to predict on, ignores data_split if >1.
cv_rounds=5,
# The data transformation mode, set using allpairs, pairs,sigpairs, quartiles,tertiles,binarize,ranks,original
data_mode=c('pairs'), # pairs takes LR pairs from the pair_list, allpairs forms all pairs from pair_list
# If using signature pairs, then the list defining the signatures, must be composed of column names
signatures=sig_features,
# Use only these features for the pairs
pair_list=pair_features,
# The XGBoost parameters (and a few extra params)
params=params,
verbose=T
)
mod$train(data_frame=dat,
label_name='Class',
sample_id = 'AliquotBarcode',
data_mode=c('namedpairs'),
pair_list=pair_features,
signatures=NULL,
params=params)
```
```{r}
res0 <- mod$results()
mod$classification_metrics()
table(Label=mod$test_label, Pred=mod$results(include_label = T)$BestCalls)
mod$importance()
plot_pred_final(mod)
```