-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathKORA.prot.preproc.belowlod.R
173 lines (121 loc) · 5.33 KB
/
KORA.prot.preproc.belowlod.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# Script started "Thu Jan 10 15:03:10 2019"
# To run on TRYGYVVE
# Purpose: take INF1 proteomics data from KORA cohort (untruncated NPX values),
# and normalise for subsequent linear regression/GWAS
# Oddities: some individuals without ANY proteomic data are included in the .csv file
# Issues:
# 1) LIF is not in the new dataset - strange as it was in the earlier data censored to LLOD
# 2) # NB 2 proteins appear to have typos 'ORTN' and 'OT3' (probably 'NRTN' and 'NT3').
# The file columns for proteins in this dataset generally begin "UH_O_" followed by protein symbol.
# In the case of these 2 proteins this formatting pattern has been messed up so the names are "UH_ORTN" and "UH_OT3".
# So it looks like the "_N" from the protein name got lost here i.e. they should be "UH_O_NRTN" and "UH_O_NT3".
#-----------------------------------------------#
# 1) Data set-up
#-----------------------------------------------#
options(stringsAsFactors = F)
rm(list=ls())
# this should be the file with untruncated values (ie not censored at 'LLOD')
data.dir <- "/data/andmala/KORA/KORA_updated_olink/"
outdir <- "/data/jampet/KORA/"
setwd(data.dir)
# read in the Olink INF proteomic data
x <- read.csv("K15117g_Malarstig_tra_20181129.csv", head=T)
# important: Olink have withdrawn BDNF: the assay is dodgy
# see: https://www.olink.com/bdnf-info/
x$UH_O_BDNF <- NULL
# initial columns have id and covariate data. Protein columns have prefix "UH_O"
# Find the indices of these columns
prot.ind <- grep("^uh_o", colnames(x), ignore.case = TRUE )
# subset to get just the protein data
pr <- x[ ,prot.ind]
if (ncol(pr) != 91){
stop(paste( "Incorrect number of proteins: there should be 91. There are", ncol(pr)) )
}
total.na <- sum( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
message(paste(total.na, "samples have no protein values whatsoever"))
# indices of samples with no Olink data
total.na.ind <- which( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
# indices of samples with no Axiom id (ie not genotyped)
axiom.na <- which(is.na(x$zz_nr_axiom))
to.cut <- union(total.na.ind, axiom.na)
# cut samples with no protein data
y <- x[-to.cut, ]
# let's investigate whether protein values are truncated at the lower end of the distribution:
# originally we found the data was truncated when we expected it not to be
# here's a simple test: for each protein find the lowest value:
# if there are many ties then likely the data is censored
find.n.llod <- function(x){
# find the minimum value
x.min <- x[which.min(x)]
if (any(x < x.min, na.rm = T)){
stop("Error: vector contains value lower than supposed minimum")
}
n.llod <- sum(x == x.min, na.rm=T)
n.llod
}
# find the number of individuals at the minimum value for each protein
n.llod.by.prot <- sort( apply(y[,prot.ind], 2, FUN= find.n.llod ), decreasing = T )
# see if the minimum value is not unique for any protein (suggesting truncation ...)
if (any( n.llod.by.prot != 1)){
message("there are multiple minimum values for at least one protein")
message("Checking for culprit proteins...")
n.llod.by.prot[n.llod.by.prot != 1]
}
# ucsex: M = 1, F= 2, then colour code by sex
y$ucsex[which(y$ucsex==1)] <- 'M'
y$ucsex[which(y$ucsex==2)] <- 'F'
#-----------------------------------------------#
# 2) Visualisation
#-----------------------------------------------#
pdf(paste0(outdir,"kora.below.llod.npx.by.sex.pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,4,2,1))
for (i in prot.ind){
boxplot( y[,i] ~ y$ucsex, las=1, col= c("blue", "pink"))
mtext(text= colnames(y)[i], side=3)
}
dev.off()
pdf(paste0(outdir, "kora.below.llod.npx.hist.pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,4,2,1))
for (i in prot.ind){
hist( y[,i], freq=F, breaks=50, main="", xlab='', ylab=NA, las=1 )
# add a density curve
curve(
expr= dnorm(x, mean=mean(y[,i], na.rm=TRUE), sd=sd(y[,i], na.rm=T)),
add=TRUE, col='darkblue', lwd=2
)
mtext(text= 'NPX expr', side=1, line=2, cex=0.75)
mtext(text= colnames(y)[i], side=3)
}
dev.off()
pdf(paste0(outdir,"kora.below.llod.npx.distrib.pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,3,2,1))
for (i in prot.ind){
plot(density( y[,i], na.rm=T), main="", ylab=NA, las=1 )
mtext(text= colnames(y)[i], side=3)
}
dev.off()
#-----------------------------------------------#
# 3) Normalisation
#-----------------------------------------------#
# Define the inverse normalisation function
#https://github.com/antagomir/scripts/blob/master/R/inverse.normalization.R
inverse_normal_transform <- function (x) { qnorm((rank(x,na.last="keep")-0.5)/sum(!is.na(x))) }
# Apply invnt transformation to proteins
# nb apply transposes result: proteins become cols
pr.transformed <- apply(y[,prot.ind], 2, FUN=inverse_normal_transform)
#
if (identical( row.names(pr.transformed), row.names(y[,-prot.ind]))){
z <- cbind(y[,-prot.ind], pr.transformed)
} else {
stop("row names don't match")
}
colnames(z)[which(colnames(z)=="utalter")] <- "age"
colnames(z)[grep("sex", colnames(z))] <- "sex"
ind.keep <- grep("zz_nr_axiom|age|sex|^uh_o_", colnames(z), ignore.case = T)
# just keep ids, age, sex, protein levels
final <- z[,ind.keep]
#-----------------------------------------------#
# 4) Write out
#-----------------------------------------------#
write.table(final, file=paste0(outdir,"kora.below.llod.normalised.prot.txt"),
sep="\t", quote=F, row.names = F)