-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcoloc.sb
executable file
·187 lines (171 loc) · 5.62 KB
/
coloc.sb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/bash
#SBATCH --job-name=_coloc
#SBATCH --account CARDIO-SL0-CPU
#SBATCH --partition cardio
#SBATCH --qos=cardio
#SBATCH --array=4,6
#SBATCH --mem=28800
#SBATCH --time=5-00:00:00
#SBATCH --error=/rds/user/jhz22/hpc-work/work/_coloc_%A_%a.err
#SBATCH --output=/rds/user/jhz22/hpc-work/work/_coloc_%A_%a.out
#SBATCH --export ALL
function coloc()
# GTEx, eQTL-Catalogue
{
export r=${SLURM_ARRAY_TASK_ID}
export cvt=${INF}/work/INF1.merge.cis.vs.trans
read prot MarkerName < \
<(awk -vFS="," '$14=="cis"' ${cvt} | \
awk -vFS="," -vr=${r} 'NR==r{print $2,$5}')
echo ${r} - ${prot} - ${MarkerName}
export prot=${prot}
export MarkerName=${MarkerName}
if [ ! -f ${INF}/coloc/${prot}-${MarkerName}.pdf ] || \
[ ! -f ${INF}/coloc/${prot}-${MarkerName}.RDS ]; then
cd ${INF}/coloc
R --no-save < ${INF}/rsid/coloc.R 2>&1 | \
tee ${prot}-${MarkerName}.log
cd -
fi
}
coloc
# CCL23 CCL25 CCL4 CD6 FGF.5 MIP.1.alpha TNFSF14
# 4 6 20 22 34 50 52
# --- legacy ---
function coloc_gene()
{
Rscript -e '
suppressMessages(library(dplyr))
library(openxlsx)
INF <- Sys.getenv("INF")
metal <- read.delim(file.path(INF,"work","INF1.METAL"))
pqtls <- left_join(metal, pQTLdata::inf1)
xlsx <- file.path(INF,"NG","SCALLOP-INF-ST.xlsx")
plist <- function(sheet)
read.xlsx(xlsx,sheet=sheet,startRow=2) %>%
filter(flag=="x") %>%
select(Protein) %>%
distinct()
gtex <- plist("ST4-GTEx coloc")
eqtlgen <- plist("EQTLGen_coloc")
eqtlcat <- plist("EQTL-Catalogue_coloc")
pall <- unique(bind_rows(gtex,eqtlgen,eqtlcat)) %>%
rename(target.short=Protein) %>%
left_join(pQTLdata::inf1) %>%
select(target.short,uniprot,prot,gene,ensembl_gene_id)
genes <- pull(pall,gene)
write.table(genes,row.names=FALSE,col.names=FALSE,quote=FALSE)
'
}
function recollect()
{
export dir=${1}
export out=${2}
(
ls ${dir}/*RDS | \
parallel -C' ' '
export rds=$(basename {});
Rscript -e "
options(width=200)
suppressMessages(library(dplyr))
dir <- Sys.getenv(\"dir\")
rds <- Sys.getenv(\"rds\")
l <- unlist(strsplit(gsub(\".RDS\",\"\",rds),\"-\"))
d <- readRDS(file.path(dir,rds)) %>%
mutate(prot=l[1],snpid=l[2]) %>%
data.frame
n <- names(d)
pp <- grepl(\"PP\",n)
d[pp] <- round(d[pp],2)
write.table(d,row.names=FALSE,col.names=FALSE,quote=FALSE,sep=\"\t\")
"
'
) > ${dir}/${out}-all.tsv
}
# recollect coloc GTEx
# recollect eQTLCatalogue eQTLCatalogue
function setdiff()
{
Rscript -e '
l <- read.delim("~/INF/eQTLCatalogue/eQTLCatalogue-all.tsv")
p <- unique(l$prot)
METAL <- read.delim("~/INF/work/INF1.METAL")
p2 <- subset(METAL,cis.trans=="cis")$prot
df <- setdiff(p2,p)
df <- c("CD6","CCL23","MIP.1.alpha","CCL4","TNFSF14","CCL25","FGF.5")
write.table(pQTLdata::inf1[pQTLdata::inf1$prot%in%df,c("prot","gene","ensembl_gene_id")],quote=FALSE,row.names=FALSE,sep="\t")
h <- c("variant","r2","pvalue","molecular_trait_object_id","molecular_trait_id",
"maf","gene_id","median_tpm","beta","se","an","ac","chromosome","position",
"ref","alt","type","rsid")
'
}
function rough_check()
{
export csv=~/rds/public_databases/GTEx/csv
for tissue in $(ls ${csv}/*gz | xargs -l -I{} basename {} .tsv.gz)
do
export tissue=${tissue}
echo ${tissue}
(
echo "variant r2 pvalue molecular_trait_object_id molecular_trait_id maf gene_id median_tpm beta se an ac chromosome position ref alt type rsid"
cat << 'EOL' | parallel -j10 -C' ' --env csv --env tissue 'awk -vgene_id={3} "\$7==gene_id" ${csv}/${tissue}.tsv.gz'
CCL23 CCL23 ENSG00000167236
CCL25 CCL25 ENSG00000131142
CCL4 CCL4 ENSG00000129277
CD6 CD6 ENSG00000138675
FGF.5 FGF5 ENSG00000013725
MIP.1.alpha CCL3 ENSG00000006075
TNFSF14 TNFSF14 ENSG00000125735
EOL
) | gzip -f > ${tissue}.gz
done
}
function fusion()
{
cd work
export FUSION=${HPC_WORK}/fusion_twas
export trait=$(awk -v 'NR==ENVIRON["SLURM_ARRAY_TASK_ID"] {print $1}' ${INF}/work/inf1.tmp)
(
awk -v OFS='\t' 'BEGIN{print "SNP","A1","A2","N","CHISQ","Z"}'
gunzip -c ${INF}/METAL/${trait}-1.tbl.gz | \
awk -v OFS='\t' 'NR>1{print $3,toupper($4),toupper($5),$18,($10/$11)^2,$10/$11,$1,$2}' | \
sort -k1,1 | \
join <(cat INTERVAL.rsid | tr ' ' '\t') - -t$'\t' | \
sort -k8,8n -k9,9n | \
cut -f2-7
) > ${trait}.sumstats
ln -sf ${FUSION}/WEIGHTS WEIGHTS
module load gcc/6
for tissue in Whole_Blood
do
export tissue=${tissue}
(
awk -v OFS='\t' 'BEGIN{print "PANEL","FILE","ID","CHR","P0","P1","HSQ","BEST.GWAS.ID","BEST.GWAS.Z",
"EQTL.ID","EQTL.R2","EQTL.Z","EQTL.GWAS.Z","NSNP","NWGT",
"MODEL","MODELCV.R2","MODELCV.PV","TWAS.Z","TWAS.P",
"COLOC.PP0","COLOC.PP1","COLOC.PP2","COLOC.PP3","COLOC.PP4"}'
(
seq 22 | \
parallel -j1 --env FUSION --env trait --env tissue -C' ' '
Rscript ${FUSION}/FUSION.assoc_test.R \
--sumstats ${trait}.sumstats \
--weights ${FUSION}/WEIGHTS/${tissue}.P01.pos \
--weights_dir WEIGHTS \
--ref_ld_chr ${FUSION}/LDREF/1000G.EUR. \
--chr {} \
--coloc_P 5e-8 \
--GWASN 15150 \
--out ${trait}-${tissue}-{}.dat
cat ${trait}-${tissue}-{}.dat | \
if [ {} -eq 1 ]; then cat; else awk "NR>1"; fi
rm ${trait}-${tissue}-{}.dat
'
) | \
grep -v -e WARNING -e skipped -e complete -e consider -e MHC -e TWAS | \
sed "s|WEIGHTS/${tissue}/${tissue}.||g;s/.wgt.RDat//g" | \
sort -k4,4n -k5,5n
) | \
awk '$NF!="NA"' > ${trait}-${tissue}-coloc.dat
done
cd -
}