-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathKORA.prot.preproc.belowlod.v2.R
182 lines (128 loc) · 5.82 KB
/
KORA.prot.preproc.belowlod.v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Script modified "Thu Mar 7 16:52:15 2019"
# To run on TRYGYVVE
# Purpose: take INF1 proteomics data from KORA cohort (untruncated NPX values),
# and normalise for subsequent linear regression/GWAS
# Oddities: some individuals without ANY proteomic data are included in the .csv file
# Issues:
# 1) LIF is not in the original KORA dataset provided by Anders "K15117g_Malarstig_tra_20181129.csv": this was supplied later
# as "K15117g_Malarstig_tra_20190219.csv" and needs to be merged in. There are also different nrow for these csv files
# 2) # NB 2 protein names have typos 'ORTN' and 'OT3' ('NRTN' and 'NT3').
# The file columns for proteins in this dataset generally begin "UH_O_" followed by protein symbol.
# In the case of these 2 proteins this formatting pattern has been messed up so the names are "UH_ORTN" and "UH_OT3".
# So it looks like the "_N" from the protein name got lost here i.e. they should be "UH_O_NRTN" and "UH_O_NT3".
#-----------------------------------------------#
# 1) Data set-up
#-----------------------------------------------#
options(stringsAsFactors = F)
rm(list=ls())
# this should be the file with untruncated values (ie not censored at 'LLOD')
data.dir <- "/data/andmala/KORA/KORA_updated_olink/"
outdir <- "/data/jampet/KORA/"
setwd(data.dir)
# read in the Olink INF proteomic data
x <- read.csv("K15117g_Malarstig_tra_20181129.csv", head=T)
# important: Olink have withdrawn BDNF: the assay is dodgy
# see: https://www.olink.com/bdnf-info/
x$UH_O_BDNF <- NULL
# fix colnames
colnames(x) <- gsub("UH_ORTN", "UH_O_NRTN", colnames(x))
colnames(x) <- gsub("UH_OT3", "UH_O_NT3", colnames(x))
# Protein LIF was omitted from the first data upload
lif <- read.csv("/data/jinhua/data/KORA/K15117g_Malarstig_tra_20190219.csv")
x <- merge(x, lif, by="ZZ_nr", all.x=T)
# initial columns have id and covariate data. Protein columns have prefix "UH_O"
# Find the indices of these columns
prot.ind <- grep("^uh_o", colnames(x), ignore.case = TRUE )
# subset to get just the protein data
pr <- x[ ,prot.ind]
if (ncol(pr) != 91){
stop(paste( "Incorrect number of proteins: there should be 91. There are", ncol(pr)) )
}
total.na <- sum( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
message(paste(total.na, "samples have no protein values whatsoever"))
# indices of samples with no Olink data
total.na.ind <- which( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
# indices of samples with no Axiom id (ie not genotyped)
axiom.na <- which(is.na(x$zz_nr_axiom))
to.cut <- union(total.na.ind, axiom.na)
# cut samples with no protein data
y <- x[-to.cut, ]
# let's investigate whether protein values are truncated at the lower end of the distribution:
# originally we found the data was truncated when we expected it not to be
# here's a simple test: for each protein find the lowest value:
# if there are many ties then likely the data is censored
find.n.llod <- function(x){
# find the minimum value
x.min <- x[which.min(x)]
if (any(x < x.min, na.rm = T)){
stop("Error: vector contains value lower than supposed minimum")
}
n.llod <- sum(x == x.min, na.rm=T)
n.llod
}
# find the number of individuals at the minimum value for each protein
n.llod.by.prot <- sort( apply(y[,prot.ind], 2, FUN= find.n.llod ), decreasing = T )
# see if the minimum value is not unique for any protein (suggesting truncation ...)
if (any( n.llod.by.prot != 1)){
message("there are multiple minimum values for at least one protein")
message("Checking for culprit proteins...")
n.llod.by.prot[n.llod.by.prot != 1]
}
# ucsex: M = 1, F= 2, then colour code by sex
y$ucsex[which(y$ucsex==1)] <- 'M'
y$ucsex[which(y$ucsex==2)] <- 'F'
#-----------------------------------------------#
# 2) Visualisation
#-----------------------------------------------#
pdf(paste0(outdir,"kora.below.llod.npx.by.sex.",Sys.Date(),".pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,4,2,1))
for (i in prot.ind){
boxplot( y[,i] ~ y$ucsex, las=1, col= c("blue", "pink"))
mtext(text= colnames(y)[i], side=3)
}
dev.off()
pdf(paste0(outdir, "kora.below.llod.npx.hist.",Sys.Date(),".pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,4,2,1))
for (i in prot.ind){
hist( y[,i], freq=F, breaks=50, main="", xlab='', ylab=NA, las=1 )
# add a density curve
curve(
expr= dnorm(x, mean=mean(y[,i], na.rm=TRUE), sd=sd(y[,i], na.rm=T)),
add=TRUE, col='darkblue', lwd=2
)
mtext(text= 'NPX expr', side=1, line=2, cex=0.75)
mtext(text= colnames(y)[i], side=3)
}
dev.off()
pdf(paste0(outdir,"kora.below.llod.npx.distrib.",Sys.Date(),".pdf"), height=7, width = 6)
par(mfrow=c(4,3), mar=c(4,3,2,1))
for (i in prot.ind){
plot(density( y[,i], na.rm=T), main="", ylab=NA, las=1 )
mtext(text= colnames(y)[i], side=3)
}
dev.off()
#-----------------------------------------------#
# 3) Normalisation
#-----------------------------------------------#
# Define the inverse normalisation function
#https://github.com/antagomir/scripts/blob/master/R/inverse.normalization.R
inverse_normal_transform <- function (x) { qnorm((rank(x,na.last="keep")-0.5)/sum(!is.na(x))) }
# Apply invnt transformation to proteins
# nb apply transposes result: proteins become cols
pr.transformed <- apply(y[,prot.ind], 2, FUN=inverse_normal_transform)
#
if (identical( row.names(pr.transformed), row.names(y[,-prot.ind]))){
z <- cbind(y[,-prot.ind], pr.transformed)
} else {
stop("row names don't match")
}
colnames(z)[which(colnames(z)=="utalter")] <- "age"
colnames(z)[grep("sex", colnames(z))] <- "sex"
ind.keep <- grep("zz_nr_axiom|age|sex|^uh_o_", colnames(z), ignore.case = T)
# just keep ids, age, sex, protein levels
final <- z[,ind.keep]
#-----------------------------------------------#
# 4) Write out
#-----------------------------------------------#
write.table(final, file=paste0(outdir,"kora.below.llod.normalised.prot.",Sys.Date(),".txt"),
sep="\t", quote=F, row.names = F)