-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathKORA.prot.preproc.R
99 lines (70 loc) · 3.03 KB
/
KORA.prot.preproc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Script started "Thu Oct 4 11:09:34 2018"
# To run on TRYGYVVE
# Purpose: take INF1 proteomics data from KORA cohort, and normalise for subsequent linear regression/GWAS
# Oddities: some individuals without ANY proteomic data are included in the .csv file
rm(list=ls())
setwd("/data/andmala/KORA/")
# read in the Olink INF proteomic data
x <- read.csv("K15117g_Malarstig_tra_20180502.csv", head=T)
# initial columns have id and covariate data. Protein columns have prefix "uh_o"
# Find the indices of these columns
prot.ind <- grep("^uh_o_", colnames(x) )
# subset to get just the protein data
pr <- x[ ,prot.ind]
if (ncol(pr) != 92){
stop("Incorrect number of proteins: there should be 92")
}
total.na <- sum( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
message(paste(total.na, "samples have no protein values whatsoever"))
# indices of samples with no Olink data
total.na.ind <- which( apply(pr, 1, FUN= function(x) all(is.na(x)) ) )
# indices of samples with no Axiom id (ie not genotyped)
axiom.na <- which(is.na(x$zz_nr_axiom))
to.cut <- union(total.na.ind, axiom.na)
# cut samples with no protein data
y <- x[-to.cut, ]
# let's investigate whether protein values are truncated at the lower end of the distribution:
# Olink set values below lower limit of detection (LLOD) to LLOD,
# although the untruncated data can be provided
find.n.llod <- function(x){
# find the minimum value
x.min <-x[which.min(x)]
if (any(x < x.min)){
stop("Error: vector contains value lower than supposed minimum")
}
n.llod <- sum(x == x.min)
n.llod
}
# find the number of individuals at LLOD for each protein
n.llod.by.prot <- sort( apply(y[,prot.ind], 2, FUN= find.n.llod ), decreasing = T )
pc.llod.by.prot <- 100*n.llod.by.prot/nrow(y)
# set a % threshold for proportion individuals below LLOD, above which to plot proteins
thr <- 20
pdf("~/KORA.pc.below.llod.pdf")
barplot(height= pc.llod.by.prot[which(pc.llod.by.prot>thr)],
names.arg= toupper( gsub("^uh_o_", "", names(pc.llod.by.prot[which(pc.llod.by.prot>thr)])) ),
ylim=c(0,100),
xlab="",
ylab= "% individuals below LLOD",
main= "KORA cohort: proteins with high % of individuals below LLOD",
las=2)
dev.off()
# Define the inverse normalisation function
#https://github.com/antagomir/scripts/blob/master/R/inverse.normalization.R
inverse_normal_transform <- function (x) { qnorm((rank(x,na.last="keep")-0.5)/sum(!is.na(x))) }
# Apply invnt transformation to proteins
# nb apply transposes result: proteins become cols
pr.transformed <- apply(y[,prot.ind], 2, FUN=inverse_normal_transform)
#
if (identical( row.names(pr.transformed), row.names(y[,-prot.ind]))){
z <- cbind(y[,-prot.ind], pr.transformed)
} else {
stop("row names don't match")
}
colnames(z)[which(colnames(z)=="utalter")] <- "age"
colnames(z)[grep("sex", colnames(z))] <- "sex"
ind.keep <- grep("zz_nr_axiom|age|sex|^uh_o_", colnames(z))
# just keep ids, age, sex, protein levels
final <- z[,ind.keep]
write.table(final, file="~/kora.normalised.prot.txt",
sep="\t", quote=F, row.names = F)