-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlr_calculations.R
368 lines (345 loc) · 16.1 KB
/
lr_calculations.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
## File created: 30 / 5 / 2014
## Last updated: 15 / 7 / 2015
## Isabel Fenton
## Likelihood ratio plots
##
## Code to calculate likelihood ratios for a set of variables
##
## Functions:
## model.evs - a function to get the list of variables from a model, including poly
## lr.calc - a function that calculates likelihood ratios for a set of variables
## lr.plot - a function to plot the likelihood ratios / comparisons on barplots
##
## To do: Initially set to run with sarlm models
## suspect this only works with one poly variable
## n.b. can't handle log()
library(lmtest)
library(scales)
model.evs <- function(model) {
# function to get the list of variables from a model, including poly
# input - model: the model to work from
# output - a list of the variables
EVs <- all.vars(as.formula(model))[-1] # use the -1 to remove the RV
# if poly is present
if (length(grep("poly", names(model$coefficients), fixed = TRUE)) > 0) {
# identify where poly is present and it's not an interaction
warning("Only tested with one poly variable")
poly.ev <- names(model$coefficients)[setdiff(grep("poly", names(model$coefficients), fixed = TRUE), grep(":", names(model$coefficients), fixed = TRUE))]
tmp.poly <- NULL
full.poly <- NULL
# identify the unique EVs from that list (uni.poly)
for (i in poly.ev){
tmp.poly <- c(tmp.poly, names(unlist(sapply(EVs, grep, i, fixed = TRUE))))
}
uni.poly <- unique(tmp.poly)
# for each unique poly generate a sequence from the max order to 1
for (i in uni.poly) {
# identify the max order of poly
tmp <- poly.ev[grep(i, poly.ev, fixed = TRUE)][1] # given these aren't interactions, these must have the highest order of poly, so arbitraily take the first one
poly.seq <- seq(as.numeric(substr(tmp, nchar(tmp) - 2, nchar(tmp) - 2)), 1)
for (j in poly.seq) {
substr(tmp, nchar(tmp) - 2, nchar(tmp) - 2) <- as.character(j)
full.poly <- c(full.poly, substr(tmp, 1, nchar(tmp) - 1))
}
}
# remove those EVs which are poly from the total EV list
EVs <- setdiff(EVs, uni.poly)
# add the poly values
EVs <- c(EVs, full.poly)
}
# repeat for those with I()
if (length(grep("I(", names(model$coefficients), fixed = TRUE)) > 0) {
# identify where I( is present and it's not an interaction
Ib.ev <- names(model$coefficients)[setdiff(grep("I(", names(model$coefficients), fixed = TRUE), grep(":", names(model$coefficients), fixed = TRUE))]
tmp.Ib <- NULL
# identify EVs without the I() from that list (uni.Ib)
for (i in Ib.ev){
tmp.Ib <- c(tmp.Ib, names(unlist(sapply(EVs, grep, i, fixed = TRUE))))
}
uni.Ib <- unique(tmp.Ib)
# remove those EVs which are Ib from the total EV list
EVs <- setdiff(EVs, uni.Ib)
# add the Ib values
EVs <- c(EVs, Ib.ev)
}
return(EVs)
}
lr.calc <- function(model, EVs = NULL, plots = FALSE, pred.data = NULL, mod.data = NULL, file.nm = as.character(model), multiple.oce = TRUE) {
# function to calculate likelihood ratios
# input - model: the model to work from
# - EVs: dataframe of variables, and the groups they are in (optional, otherwise calculated from model)
# - plots: if TRUE then produce plots by ocean of the predicted values
# - pred.data: the dataframe for predicting
# - mod.data: the dataframe on which the original model was run
# output - dataframe containing variables, p values and significance stars
# obtain a list of variables
if (is.null(EVs)) {
# calculate the list of EVs
EVs <- model.evs(model)
# create a dataframe of these, so it matches the input of groups
EVs <- data.frame(name = EVs, group = EVs)
} else {
# create a standard format
if(!is.data.frame(EVs)) stop("Expecting a dataframe")
if(ncol(EVs) != 2) stop ("Expecting 2 columns")
if(length(unique(EVs[, 1])) < length(unique(EVs[, 2]))) stop ("Order should be name, then group")
colnames(EVs) <- c("name", "group")
}
# if plots, then get predicted values from the main model
if (plots) {
mod.p <- sar.predict(model, olddata = mod.data, newdata = pred.data)
m <- 1
}
# generate a dataframe to store these values
LRs <- data.frame(names = unique(EVs$group), lr = NA, p = NA, stars = NA)
# for each value in group
for (i in unique(EVs$group)) {
# identify the variables to remove
m.EVs <- EVs$name[EVs$group == i]
# if it is poly need to add higher orders
if (length(grep("poly", i, fixed = TRUE)) > 0){
# identify the order
poly.order <- substr(i, regexpr(")", i) - 1, regexpr(")", i) - 1)
# if there are higher orders, then add those to m.EVs
# what polys are there
all.poly <- unique(EVs$name)[grep("poly", unique(EVs$name), fixed = TRUE)]
max.poly <- max(unlist(lapply(all.poly, function(x) substr(x, regexpr(")", x) - 1, regexpr(")", x) - 1))))
# if the maximum is greater than the one we have currently, add these higher ones in
if (poly.order < max.poly)
m.EVs <- c(as.character(m.EVs), as.character(all.poly[substr(all.poly, regexpr(")", all.poly) - 1, regexpr(")", all.poly) - 1) > substr(i, regexpr(")", i) - 1, regexpr(")", i) - 1)]))
}
# create all possible interaction variables
m.all.int <- unlist(lapply(m.EVs, function(x) names(model$coefficients)[grep(x, names(model$coefficients), fixed = TRUE)]))
# make sure these are the same as found in EVs$name
m.all.EVs <- NULL
for (j in 1:length(m.all.int)) {
tmp.split <- strsplit(m.all.int[j], ":")
# for each part, what is the EVs$name that contains that part
tmp.1 <- unlist(lapply(EVs$name, function(x) if (length(grep(x, tmp.split[[1]][1], fixed = TRUE)) > 0) as.character(x)))
if(length(tmp.split[[1]]) == 2) {
tmp.2 <- unlist(lapply(EVs$name, function(x) if (length(grep(x, tmp.split[[1]][2], fixed = TRUE)) > 0) as.character(x)))
tmp.1 <- paste(tmp.1, ":", tmp.2, sep= "")
}
m.all.EVs <- c(m.all.EVs, tmp.1)
}
m.all.EVs <- unique(m.all.EVs)
# if it is poly, need to add some in as well
# if the main variable is poly
if (length(grep("poly", i, fixed = TRUE)) > 0 ) {
# reduce the order by one for all poly
tmp.order <- as.numeric(min(unlist(lapply(m.EVs, function(x) substr(x, regexpr(")", x) - 1, regexpr(")", x) - 1)))))
if (tmp.order > 1) {
poly.add <- NULL
for (k in m.all.EVs) {
tmp.add <- as.numeric(substr(k, regexpr(")", k) - 1, regexpr(")", k) - 1))
# subtract one if the value is less than 1
if (tmp.add > 1)
substr(k, regexpr(")", k) - 1, regexpr(")", k) - 1) <- as.character(tmp.order - 1)
poly.add <- c(poly.add, k)
}
tmp.mod <- eval(parse(text = paste("update(model, ~. -", paste(m.all.EVs, collapse = " - "), " + ", paste(poly.add, collapse = " + "), ")", sep = "")))
} else {
tmp.mod <- eval(parse(text = paste("update(model, ~. -", paste(m.all.EVs, collapse = " - "), ")", sep = "")))
}
} else {
# create a model with that variable(s) removed
tmp.mod <- eval(parse(text = paste("update(model, ~. -", paste(m.all.EVs, collapse = " - "), ")", sep = "")))
}
# obtain the LR and P-value
LRs$lr[LRs$names == i] <- lrtest(model, tmp.mod)$Chisq[2]
LRs$p[LRs$names == i] <- lrtest(model, tmp.mod)$Pr[2]
if (plots) {
# set the choice of colors
color <- c("gray50", "springgreen3", "orchid3")
ifelse(i == "prop2.oxy", span <- 1, span <- 0.75)
tmp.mod.p <- sar.predict(tmp.mod, olddata = mod.data, newdata = pred.data)
# if groups
if (sum(EVs$name != EVs$group) == 0) {
# extract variable name if it is a poly variable
if (length(grep("poly", i)) > 0) {
var <- gsub(",.*", "", gsub(".*\\(", "", i))
m <- m + 1
} else if (length(grep("I(", i, fixed = TRUE)) > 0) {
var <- gsub("/.*", "", gsub(".*\\(", "", i))
} else {
var <- i
}
if (!is.factor(pred.data[, var]) & multiple.oce){
if (min(pred.data[, var]) != max(pred.data[, var])) {
# create a vector containing values for each ocean
p.data <- data.frame(p.x = rep(seq(min(pred.data[, var]), max(pred.data[, var]), length.out = 100), 3), p.oce = rep(levels(pred.data[, grep("Ocean", names(pred.data))]), each = 100), p.y = NA)
# for each of the oceans, run a loess and predict the values
for (l in 1:3) {
tmp.val <- which(pred.data[, grep("Ocean", names(pred.data))] == levels(pred.data[, grep("Ocean", names(pred.data))])[l])
# need to set the span for data where there are limited x values
lo <- loess((mod.p - tmp.mod.p)[tmp.val] ~ pred.data[tmp.val, var], span = span)
p.data$p.y[(1:100) + 100 * (l - 1)] <- predict(lo, newdata = p.data$p.x[(1:100) + 100 * (l - 1)])
}
}
}
# randomise to plot better
samp <- sample(1:nrow(pred.data), nrow(pred.data))
# plot these results
png(paste("Figures/",file.nm, "_", var, "_", m, ".png", sep = ""))
with(pred.data[samp, ], plot(pred.data[samp, var], mod.p[samp] - tmp.mod.p[samp], col = alpha(color[pred.data[samp, grep("Ocean", names(pred.data))]], 0.5), xlab = var, ylab = paste("Relative", eval(model$call[2][[1]])[2][[1]]), bty = "l", las = 1, cex.lab = 1.2, cex.axis = 1.2))
if (!is.factor(pred.data[, var]) & multiple.oce){
if (min(pred.data[, var]) != max(pred.data[, var])) {
with(p.data[1:100, ], lines(p.x, p.y, col = "black", lwd = 2))
with(p.data[101:200, ], lines(p.x, p.y, col = "green4", lwd = 2))
with(p.data[201:300, ], lines(p.x, p.y, col = "magenta4", lwd = 2))
}
}
dev.off()
} else {
k <- 1
for (j in EVs$name[EVs$group == i]) {
if (length(grep("poly", j) > 0)) {
var <- gsub(",.*", "", gsub(".*\\(", "", j))
m <- m + 1
} else {
var <- j
}
if (!is.factor(pred.data[, var]) & multiple.oce){
if (min(pred.data[, var]) != max(pred.data[, var])) {
# create a vector containing values for each ocean
p.data <- data.frame(p.x = rep(seq(min(pred.data[, var]), max(pred.data[, var]), length.out = 100), 3), p.oce = rep(levels(pred.data[, grep("Ocean", names(pred.data))]), each = 100), p.y = NA)
# for each of the oceans, run a loess and predict the values
for (l in 1:3) {
tmp.val <- which(pred.data[, grep("Ocean", names(pred.data))] == levels(pred.data[, grep("Ocean", names(pred.data))])[l])
# need to set the span for data where there are limited x values
lo <- loess((mod.p - tmp.mod.p)[tmp.val] ~ pred.data[tmp.val, var], span = span)
p.data$p.y[(1:100) + 100 * (l - 1)] <- predict(lo, newdata = p.data$p.x[(1:100) + 100 * (l - 1)])
}
}
}
# randomise to plot better
samp <- sample(1:nrow(pred.data), nrow(pred.data))
# plot these results
png(paste("Figures/",file.nm, "_", var, "_", m, ".png", sep = ""))
with(pred.data[samp, ], plot(pred.data[samp, var], mod.p[samp] - tmp.mod.p[samp], col = alpha(color[pred.data[samp, grep("Ocean", names(pred.data))]], 0.5), xlab = var, ylab = paste("Relative", eval(model$call[2][[1]])[2][[1]]), bty = "l", las = 1, cex.lab = 1.2, cex.axis = 1.2, main = paste(i, k, sep = ": ")))
if (!is.factor(pred.data[, var]) & multiple.oce){
if (min(pred.data[, var]) != max(pred.data[, var])) {
with(p.data[1:100, ], lines(p.x, p.y, col = "black", lwd = 2))
with(p.data[101:200, ], lines(p.x, p.y, col = "green4", lwd = 2))
with(p.data[201:300, ], lines(p.x, p.y, col = "magenta4", lwd = 2))
}
}
dev.off()
k <- k + 1
}
}
}
}
# calculate significance stars
stars <- c(0.001, 0.01, 0.05, 0.1)
names(stars) <- c("***", "**", "*", ".")
for (i in 1:length(stars)) {
LRs$stars[which(LRs$p <= stars[i] & is.na(LRs$stars))] <- names(stars)[i]
}
return(LRs)
}
lr.plot <- function(lr.mod1, lr.mod2 = NULL, lr.mod3 = NULL, lr.mod4 = NULL, order = NULL, plt = 0.4, leg.txt = NULL, leg.x = "topright", leg.y = NULL, leg.cex = 1, star.pos = 10, legend = TRUE, ylim = NULL, cex.pts = 1, srt = NULL, cex.axis = 1, se.val = NULL, ...) {
# function to plot the likelihood ratios for up to four models
# input - likelihood ratios from lr.calc (up to four)
# - order: a list of numbers for the order of the code
# - plt: the plot parameter for the x-axis
# - leg.text: the words for the legend, otherwise defaults to the model names
# output - the x-coordinates of the bars
# calculate number of comparisons
# create a list of the models
lr.mods <- c("lr.mod1", "lr.mod2", "lr.mod3", "lr.mod4")
num.mod <- 0
# create the dataframe for analysis
for (i in 1:length(lr.mods)) {
# get each model
tmp <- eval(parse(text = lr.mods[i]))
# if there is a model
if (!is.null(tmp)) {
num.mod <- num.mod + 1
# rename the variables, so they are unique
names(tmp)[2:4] <- paste(names(tmp)[2:4], lr.mods[i], sep = ".")
# if it is the first one
if (i == 1) {
all.lr.mods <- tmp
} else {
all.lr.mods <- merge(all.lr.mods, tmp, all = TRUE)
}
}
}
# make sure order of se and of lr values are the same when multiple models are present
if (!is.null(se.val)) {
if(!is.null(dim(se.val))) {
se.val <- se.val[, as.character(all.lr.mods$names)]
}
}
# order the points, if order is present
if (!is.null(order)) {
if (length(order) != nrow(all.lr.mods)) stop("Different lengths in order and lr dataframe")
all.lr.mods <- all.lr.mods[order,]
if(!is.null(se.val)) {
if (is.null(dim(se.val))) {
se.val <- se.val[order]
} else {
se.val <- se.val[, order]
}
}
}
# generate the data for the barplot
bar.lr.mods <- t(as.matrix(all.lr.mods[grep("^lr", names(all.lr.mods))]))
# plot the absolute coefficients
plt.def <- par("plt")
on.exit(par(plt.def))
par(plt = c(plt.def[1:2], plt, plt.def[4]))
x.axis <- "s"
if (!is.null(srt)) x.axis <- "n"
if (is.null(ylim)) {
if (is.null(se.val)) {
ylim <- c(0, max(bar.lr.mods, na.rm = T) + star.pos + 5)
} else {
ylim.max <- which(bar.lr.mods == max(bar.lr.mods, na.rm = T))
ylim <- c(0, bar.lr.mods[ylim.max] + star.pos + 5 + se.val[ylim.max])
}
}
pts.x <- barplot(bar.lr.mods, names = all.lr.mods$names, beside = T, las = 2, ylim = ylim, xaxt = x.axis, ...)
if (!is.null(srt)) {
text(c(pts.x[2,]), par("usr")[3] - 10, srt = srt, labels = all.lr.mods$names, adj = 1, xpd = TRUE, cex = cex.axis)
}
star.se <- 0
if (!is.null(se.val)) {
# error bar function
error.bars <- function(xv, yv, z) {
g <- (pts.x[2] - pts.x[1]) * 0.2
for (i in 1:length(xv)) {
lines(c(xv[i], xv[i]), c(yv[i] + z[i], yv[i] - z[i]))
lines(c(xv[i] - g, xv[i] + g), c(yv[i] + z[i], yv[i] + z[i]))
lines(c(xv[i] - g, xv[i] + g), c(yv[i] - z[i], yv[i] - z[i]))
}
}
# add them to the plot
error.bars(pts.x, bar.lr.mods, se.val)
}
for (i in 1:num.mod) {
if(!is.null(lr.mods[i])) {
if (!is.null(dim(se.val))) {
star.se <- se.val[i, ]
} else if (!is.null(se.val)) {
star.se <- se.val[i]
} else {
star.se <- 0
}
text(pts.x[i, ], all.lr.mods[, grep("^lr", names(all.lr.mods))[i]] + star.pos + star.se, all.lr.mods[, grep("^stars", names(all.lr.mods))[i]], cex = cex.pts)
}
}
if (legend) {
if (is.null(leg.txt)) {
leg.txt <- lr.mods[1:num.mod]
}
legend(leg.x, leg.y, leg.txt, fill = gray.colors(length(lr.mods))[1:length(lr.mods)], cex = leg.cex)
}
return(pts.x)
}
# lr.calc(mod.sar.opf)
# ms.lr
# tmp <- data.frame(names = model.evs(mod.sar.opf), group = c("Temp", "Strat", "Chla", "Sal", "Sal", "Ocean", "Dissol", "Temp", "Temp", "Temp"))
# lr.calc(mod.sar.opf, tmp)
# ms.lr.group