-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path_ukbb_h2_select_topline.Rmd
1113 lines (867 loc) · 49 KB
/
_ukbb_h2_select_topline.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Selecting Primary UKB Round 2 Phenotypes for LDSR Analysis"
date: "Last updated `r format(Sys.Date())`"
author: "Results from the [Neale Lab](credits.html)"
output:
html_document:
toc: true
toc_float: true
number_sections: false
params:
h2file: "../results/round2_final/ukb31063.h2.baseline1-1.gwas_v2.full_results.02Oct2019.tsv.gz"
cormat_bothsex: "../reference/both_sexes_resid_corrmat.csv"
ns_bothsex: "../reference/both_sexes_pairwise_complete_ns.csv"
outdir: "../results/round2_raw"
---
```{r child = '_toc_fix.Rmd'}
```
```{r child = '_code_highlight_fix.Rmd'}
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align="center")
# devtools::install_github("ropensci/plotly")
require(plotly)
require(DT)
require(crosstalk)
require(crosstool)
require(Rmpfr)
plotly_colors <- c(
'#1f77b4', # muted blue
'#ff7f0e', # safety orange
'#2ca02c', # cooked asparagus green
'#d62728', # brick red
'#9467bd', # muted purple
'#8c564b', # chestnut brown
'#e377c2', # raspberry yogurt pink
'#7f7f7f', # middle gray
'#bcbd22', # curry yellow-green
'#17becf' # blue-teal
) # https://stackoverflow.com/questions/40673490/how-to-get-plotly-js-default-colors-list
# params$h2file <- normalizePath(params$h2file)
# params$cormat_bothsex <- normalizePath(params$cormat_bothsex)
# params$ns_bothsex <- normalizePath(params$ns_bothsex)
```
<style type="text/css">
div.main-container {
max-width: 1800px;
margin-left: auto;
margin-right: auto;
}
</style>
```{r data_load, include=FALSE}
# load full h2 results
dat_load <- read.delim(params$h2file,sep = '\t', quote = "", header=T, stringsAsFactors = F)
# for reverse matching the raw vs irnt codes
dat_load$phen_stem <- as.character(dat_load$phenotype)
dat_load$phen_stem[is.na(dat_load$n_cases)] <- as.character(sapply(dat_load$phenotype[is.na(dat_load$n_cases)],function(a) strsplit(a, "_")[[1]][1]))
# convenience additions
dat_load$isBinary <- !is.na(dat_load$n_cases)
dat_load$Neff <- dat_load$n
dat_load$Neff[dat_load$isBinary] <- round( (4/((1/dat_load$n_cases)+(1/dat_load$n_controls)))[dat_load$isBinary], 2)
dat_load$prevalence <- NA
dat_load$prevalence[dat_load$isBinary] <- (dat_load$n_cases/dat_load$n)[dat_load$isBinary]
# load phenotypic correlations, pairwise sample size
# (need early for finngen redundant phenotypes)
rrmat_both <- read.table(params$cormat_bothsex,header=T,row.names=1,stringsAsFactors=F,sep=',')
rownames(rrmat_both)[rownames(rrmat_both)=="isFemale"] <- "is_female"
colnames(rrmat_both) <- paste0("phen_",rownames(rrmat_both))
nnmat_both <- read.table(params$ns_bothsex,header=T,sep=',',stringsAsFactors=F,row.names=1)
```
```{r split_sex_dat, include=FALSE}
# split sex
dat <- dat_load[dat_load$sex == "both_sexes",]
datf <- dat_load[dat_load$sex == "female",]
datm <- dat_load[dat_load$sex == "male",]
```
<br>
***
# Overview
The [UKB Round 2 GWAS](https://github.com/Nealelab/UK_Biobank_GWAS) contains `r nrow(dat_load)` GWAS of `r length(unique(dat_load$phen_stem))` unique phenotype codes (`r length(unique(dat_load$phen_stem[dat_load$source=="phesant"]))` [PHESANT](https://github.com/astheeggeggs/PHESANT) + `r length(unique(dat_load$phen_stem[dat_load$source=="finngen"]))` [FinnGen](https://www.finngen.fi/en) + `r length(unique(dat_load$phen_stem[dat_load$source=="icd10"]))` ICD10 + `r length(unique(dat_load$phen_stem[dat_load$source=="biomarkers"]))` biomarkers + `r length(unique(dat_load$phen_stem[dat_load$source=="covariate"]))` covariates). For many of these phenotypes, however, there are multiple GWAS, due to:
* Consideration of both [inverse rank-normal transformed (IRNT)](#continuous_phenotype_normalization) and raw untransformed versions of `r length(unique(dat_load$phen_stem[dat_load$variable_type == "continuous_irnt"]))` continuous phenotypes
* Consideration of whether to include the [dilution factor](#biomarker_dilution_factor) as a covariate for the `r length(unique(dat_load$phen_stem[dat_load$source=="biomarkers"]))` biomarker phenotypes
* Available [sex-stratified](#sex-specific_phenotypes) results for most phenotypes
* Redundancy among the [FinnGen codes](#redundant_finngen_codes) constructed from ICD diagnoses
Our first task for the LD Score regression analyses, then, is to get a "primary" result for each phenotype.
***
<br>
# Biomarker dilution factor
For the biomarker assays, UK Biobank has reported that some samples were unintentionally diluted ([pdf](http://biobank.ctsu.ox.ac.uk/crystal/crystal/docs/biomarker_issues.pdf)) during processing. Although an effort has been made to estimate the dilution fraction and correct assay values accordingly, the estimated dilution fraction has also been reported for potential use in additional modelling. For instance, [other initial analyses of the biomarker data](https://www.biorxiv.org/content/10.1101/660506v1) have opted to include the dilution fraction as a covariate in regression analyses.
For the Neale Lab GWAS, the analysis was performed both with and without the dilution fraction as a covariate (along with the same GWAS covariates for age, sex, and PCs used for all phenotypes). We evaluate here whether to use the GWAS results with or without the dilution fraction covariate as the primary GWAS for the purposes of the ldsc $h^2_g$ analyses here.
<br>
## Heritability
If the dilution fraction covariate controls for substantial noise in the phenotype, we may anticipate stronger $h^2_g$ results (higher point estimate, more significant) when the covariate is included.
<div class="well">
```{r dilute, fig.align='center', echo=FALSE}
###########
# biomarker dilution fraction comparisons
###########
# merge pairs
biom <- dat_load[!is.na(dat_load$dilute),]
biodat <- merge(biom[biom$dilute==F & !startsWith(biom$phenotype,"30897_"),c("phenotype","description","h2_liability","h2_z","h2_p","intercept","intercept_z","intercept_p","sex")],
biom[biom$dilute==T & !startsWith(biom$phenotype,"30897_"),c("phenotype","description","h2_liability","h2_z","h2_p","intercept","intercept_z","intercept_p","sex")],
by=c("phenotype","description","sex"))
rm(biom)
```
```{r plotly_dummy, echo=F, warnings=F, message=F,include=F}
# to catch initial plotly package messages
plot_ly(x=rnorm(2),y=rnorm(2),type="scatter",mode="markers")
```
```{r dilute_h2_plot, fig.align='center', echo=FALSE}
# x=no dilution factor
# qref for diagonal reference line
qref1 <- seq(min(0,min(biodat$h2_liability.x,na.rm=T)),max(biodat$h2_liability.x,na.rm=T),.01)
pp1 <- plot_ly(biodat,
x=~h2_liability.x,
y=~h2_liability.y,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref1,
y=qref1,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="without dilution covariate"),
yaxis = list(title="with dilution covariate"),
title = "SNP h2 estimate (liability)",
margin=list(b=65)
)
qref2 <- seq(min(0,min(-log10(biodat$h2_p.x),na.rm=T)),max(-log10(biodat$h2_p.x),na.rm=T),.01)
pp2 <- plot_ly(biodat,
x=~-log10(h2_p.x),
y=~-log10(h2_p.y),
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref2,
y=qref2,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="without dilution covariate"),
yaxis = list(title="with dilution covariate"),
title = "-log10 SNP h2 p-value",
margin=list(b=65)
)
htmltools::div(
style = "display: flex; flex-wrap: wrap; justify-content: center",
htmltools::div(pp1),
htmltools::div(pp2)
)
```
*Takeaway:* $h^2_g$ estimate and significance are not meaningfully affected by the addition of the dilution factor covariate.
</div>
## Intercept
If the dilution fraction is somehow correlated with the genetic data in a way that would lead to overall inflation of the GWAS results (e.g. some correlation with residual population structure), we may anticipate genome-wide inflation to be evident in the intercept results (higher point estimate, more significant) when the covariate is omitted.
<div class="well">
```{r dilute_int_plot, fig.align='center', echo=FALSE}
# x=no dilution factor
# qref for diagonal reference line
qref1 <- seq(min(1,min(biodat$intercept.x,na.rm=T)),max(biodat$intercept.x,na.rm=T),.01)
pp1 <- plot_ly(biodat,
x=~intercept.x,
y=~intercept.y,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref1,
y=qref1,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="without dilution covariate"),
yaxis = list(title="with dilution covariate"),
title = "intercept estimate",
margin=list(b=65)
)
qref2 <- seq(min(0,min(-log10(biodat$intercept_p.x),na.rm=T)),max(-log10(biodat$intercept_p.x),na.rm=T),.01)
pp2 <- plot_ly(biodat,
x=~-log10(intercept_p.x),
y=~-log10(intercept_p.y),
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref2,
y=qref2,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="without dilution covariate"),
yaxis = list(title="with dilution covariate"),
title = "-log10 intercept p-value",
margin=list(b=65)
)
htmltools::div(
style = "display: flex; flex-wrap: wrap; justify-content: center",
htmltools::div(pp1),
htmltools::div(pp2)
)
```
*Takeaway:* The intercept estimate and significance are not substantially affected by the addition of the dilution factor covariate. If anything, adding the dilution factor covariate reduces the stability of the intercept (lower significance) without affecting the point estimate.
</div>
## Conclusion
Overall, the inclusion of the dilution factor covariate has minimal impact on the results for the biomarkers. Therefore in the interest of simplicity we treat the GWAS without the dilution factor covariate as the primary analysis for the biomarker phenotypes. (Results for the GWAS with the dilution factor covariate still appear in the complete results file though.) The only remaining variation in covariates across the analyses is that sex is omitted as a covariate in sex-specific GWAS.
```{r drop_dilute_irnt, echo=FALSE}
dat$keep <- rep(T, nrow(dat))
dat$notes <- rep("", nrow(dat))
dat$keep[dat$source=="biomarkers" & dat$dilute] <- F
dat$notes[dat$source=="biomarkers" & dat$dilute] <- "drop_bio_dilution"
dat_full <- dat
dat <- dat_full[dat$keep,]
rm(biodat)
```
<br>
# Continuous phenotype normalization
The Round 1 GWAS rank-normalized (IRNT) all continuous phenotypes. In Round 2, untransformed copies of the continuous phenotypes were also GWASed for the purposes of evaluating whether rank-normalizing was beneficial. Here we compare the raw and IRNT versions of all of the continuous phenotypes from [PHESANT](https://github.com/astheeggeggs/PHESANT) that were GWASed in `both_sexes` (i.e. that aren't sex-specific).
Specifically, we evaluate whether:
* IRNT vs. raw provide better $h^2_g$ results (i.e. higher $h^2_g$, smaller $h^2_g$ SE, and/or stronger $h^2_g$ significance)
* IRNT vs. raw provide better control of stratification (i.e. higher intercept, and/or stronger intercept significance)
<br>
## Heritability
We first look at heritability:
<div class="well">
```{r irnt_h2, fig.align='center', echo=FALSE}
###########
# raw vs irnt comparisons
###########
# merge of raw, irnt pairs
cont_dat <- dat[dat$phen_stem %in% dat$phen_stem[duplicated(dat$phen_stem)],]
foo_irnt <- cont_dat[grep("_irnt",cont_dat$phenotype),]
foo_raw <- cont_dat[grep("_raw",cont_dat$phenotype),]
#
cont_merge <- merge(x=foo_irnt[,c("phenotype","description","phen_stem","h2_liability","h2_liability_se","h2_p","intercept","intercept_z","intercept_p","ratio")],
y=foo_raw[,c("phenotype","phen_stem","h2_liability","h2_liability_se","h2_p","intercept","intercept_z","intercept_p","ratio")],
by="phen_stem")
rm(foo_irnt)
rm(foo_raw)
rm(cont_dat)
```
```{r irnt_h2_plot, fig.align='center', echo=FALSE}
# x=irnt
# qref for diagonal reference line
qref <- seq(min(0,min(cont_merge$h2_liability.x,na.rm=T)),max(cont_merge$h2_liability.x,na.rm=T),.01)
pp <- plot_ly(cont_merge,
x=~h2_liability.x,
y=~h2_liability.y,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref,
y=qref,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="IRNT SNP h2"),
yaxis = list(title="raw SNP h2"),
margin=list(b=65)
)
htmltools::div( pp, align="center")
```
*Takeaway:* $h^2_g$ results are generally consistent, but with higher $h^2_g$ for the IRNT versions of each phenotype.
<br>
```{r irnt_h2_p, echo=FALSE}
# h2 p
qref <- seq(min(0,min(-log10(cont_merge$h2_p.x),na.rm=T)),max(-log10(cont_merge$h2_p.x),na.rm=T),.01)
pp <- plot_ly(cont_merge,
x=~-log10(h2_p.x),
y=~-log10(h2_p.y),
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref,
y=qref,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="-log10(IRNT SNP h2 p)"),
yaxis = list(title="-log10(raw SNP h2 p)"),
margin=list(b=65)
)
htmltools::div( pp, align="center")
```
*Takeaway:* p-values for testing $h^2_g$ are mostly consistent between scalings, but IRNT does average more significant $h^2_g$ results, especially among the phenotypes that have high $h^2_g$. Compared to the observed differences in $h^2_g$, the moderate change in p-values here reflects that the SEs for $h^2_g$ are often also nominally larger for IRNT (not shown).
</div>
Taken together, these seem to point towards IRNT providing a net benefit to the LDSR $h^2_g$ results.
<br>
## Intercept
Before adopting that conclusion, however, we also look at the results for the intercept term:
<div class="well">
```{r irnt_int, echo=FALSE}
# int
qref <- seq(min(1,min(cont_merge$intercept.x,na.rm=T)),max(cont_merge$intercept.x,na.rm=T),.01)
pp <- plot_ly(cont_merge,
x=~intercept.x,
y=~intercept.y,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref,
y=qref,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="IRNT intercept",range=c(.95,1.38)),
yaxis = list(title="raw intercept",range=c(.95,1.38)),
autosize = T,
margin=list(b=65)
)
htmltools::div( pp, align="center" )
```
*Takeaway:* Intercepts are maybe slightly larger on average for IRNT versions of each phenotype (especially for intercepts 1.05-1.20; zoom plot out for additional outliers), but the differences are marginal. The largest differences seem to occur among the biomarkers and haemotology measures. Comparing these estimates with the mean $\chi^2$ values, the estimated intercept ratios remain mostly unchanged between the IRNT and raw versions (not shown).
<br>
```{r irnt_int_p, echo=FALSE}
# for p-values beyond base r precision
cont_merge$int_nlogp_x_tmp <- -log10(cont_merge$intercept_p.x)
cont_merge$int_nlogp_y_tmp <- -log10(cont_merge$intercept_p.y)
cont_merge$int_nlogp_x_tmp[!is.finite(cont_merge$int_nlogp_x_tmp)] <- as.numeric(-log10(mpfr(0.5,64)*erfc(mpfr(cont_merge$intercept_z.x[!is.finite(cont_merge$int_nlogp_x_tmp)],64)/sqrt(mpfr(2,64)))))
cont_merge$int_nlogp_y_tmp[!is.finite(cont_merge$int_nlogp_y_tmp)] <- as.numeric(-log10(mpfr(0.5,64)*erfc(mpfr(cont_merge$intercept_z.y[!is.finite(cont_merge$int_nlogp_y_tmp)],64)/sqrt(mpfr(2,64)))))
# intercept p
qref <- seq(min(0,min(cont_merge$int_nlogp_x_tmp,na.rm=T)),max(max(cont_merge$int_nlogp_x_tmp,na.rm=T),300),.01)
pp <- plot_ly(cont_merge,
x=~int_nlogp_x_tmp,
y=~int_nlogp_y_tmp,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~description,
width=400, height=400
) %>% add_trace(
x=qref,
y=qref,
mode="lines",
showlegend=F,
hoverinfo="text",
text=""
) %>% layout(
xaxis = list(title="-log10(IRNT intercept p)",range=c(0,16)),
yaxis = list(title="-log10(raw intercept p)",range=c(0,16)),
margin=list(b=65)
)
htmltools::div( pp, align="center" )
rm(qref)
rm(cont_merge)
```
*Takeaway:* Focusing here on the majority of phenotypes with moderate/nominally significant intercept results (zoom out for p-values out to 1e-450), the p-values are strongly consistent between the IRNT and raw untransformed phenotypes. IRNT intercepts are maybe marginally more significant on average.
One noteworthy outlier is the estimated dilution fraction for the biomarker data (code 30897), which has a much higher intercept in the IRNT version of the GWAS (mean $\chi^2 = `r signif(dat_load$mean_chi2[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],4)`$, intercept $= `r signif(dat_load$intercept[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],4)`$, SE $= `r signif(dat_load$intercept_se[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],2)`$, $p = `r signif(dat_load$intercept_p[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$) than in the GWAS of the raw value (mean $\chi^2 = `r signif(dat_load$mean_chi2[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],4)`$, intercept $= `r signif(dat_load$intercept[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],4)`$, SE $= `r signif(dat_load$intercept_se[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],2)`$, $p = `r signif(dat_load$intercept_p[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$). The reason for this strong difference is unclear, though it may relate to the [bimodal, left-skewed distribution of the dilution fraction estimate](http://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=30897). The SNP heritability result is reassuringly null in either case (IRNT: $h^2_g = `r signif(dat_load$h2_liability[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$, $p = `r signif(dat_load$h2_p[dat_load$phenotype=="30897_irnt" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$; raw: $h^2_g = `r signif(dat_load$h2_liability[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$, $p = `r signif(dat_load$h2_p[dat_load$phenotype=="30897_raw" & !dat_load$dilute & dat_load$sex=="both_sexes"],3)`$), as would be expected for GWAS of an estimate of sample contamination in the lab. As a result we give limited weight to this outlier in evaluating the choice of IRNT vs. raw versions of the GWAS.
</div>
## Conclusion
Overall, the results are largely consistent regardless of the choice of IRNT or raw untransformed phenotypes. Since IRNT does appear to provide a marginal boost to the $h^2_g$ results, especially in terms of significance, we choose to treat the IRNT version as the primary analysis for continuous phenotypes. (Results for the raw, untransformed versions will still appear in the complete results file though.)
```{r keep_irnt, echo=FALSE}
dat_full$notes[dat_full$keep & grepl("_raw",dat_full$phenotype)] <- "drop_cont_raw"
dat_full$keep[dat_full$keep & grepl("_raw",dat_full$phenotype)] <- F
dat$keep[grepl("_raw",dat$phenotype)] <- F
dat <- dat[dat$keep,]
```
<br>
# Redundant FinnGen codes
During review of the GWAS phenotypes, we identified some instances where a [FinnGen](https://www.finngen.fi/en) code corresponds to a phenotype that is identical to another GWASed FinnGen code and in thus redundant. The most common case is pairs with codes `C_*` and `C3_*` with the same name, description,and sample size. For example, both `C_OTHER_SKIN` and `C3_OTHER_SKIN` are phenotypes for "Other malignant neoplasms of skin", and both have 14,402 cases and 346,792 controls. We identify and mark as redundant those phenotypes here.
<div class="well">
We can systematically identify these redundant pairs by confirming that the phenotypic correlation between the codes is 1 and that the phenotype is observed in the same individuals with the same number of cases. From that process, we identify the following phenotype pairs:
```{r duplicate_finngen, echo=F}
fg_rr <- rrmat_both[rownames(rrmat_both) %in% dat$phenotype[dat$source=="finngen"],
rownames(rrmat_both) %in% dat$phenotype[dat$source=="finngen"]]
fg_nn <- nnmat_both[rownames(nnmat_both) %in% dat$phenotype[dat$source=="finngen"],
rownames(nnmat_both) %in% dat$phenotype[dat$source=="finngen"]]
# get perfect correlation
fg_cor1 <- which(fg_rr==1, arr.ind = T)
# remove diagonal elements and lower triangle (to prevent symmetric duplicates)
fg_cor1 <- fg_cor1[!(fg_cor1[,1]>=fg_cor1[,2]),]
# get just phenotypes where sample size overlap matches full N
fg_cor1_n1 <- diag(as.matrix(fg_nn[match(rownames(fg_rr)[as.numeric(fg_cor1[,1])], rownames(fg_nn)), match(rownames(fg_rr)[as.numeric(fg_cor1[,1])], rownames(fg_nn))]))
fg_cor1_n2 <- diag(as.matrix(fg_nn[match(rownames(fg_rr)[as.numeric(fg_cor1[,2])], rownames(fg_nn)), match(rownames(fg_rr)[as.numeric(fg_cor1[,2])], rownames(fg_nn))]))
fg_cor1_n12 <- diag(as.matrix(fg_nn[match(rownames(fg_rr)[as.numeric(fg_cor1[,1])], rownames(fg_nn)), match(rownames(fg_rr)[as.numeric(fg_cor1[,2])], rownames(fg_nn))]))
fg_cor1 <- fg_cor1[(fg_cor1_n1 == fg_cor1_n2) & (fg_cor1_n2 == fg_cor1_n12),]
# confirm case count also matches
fg_cor1_case1 <- dat$n_cases[match(rownames(fg_rr)[as.numeric(fg_cor1[,1])], dat$phenotype)]
fg_cor1_case2 <- dat$n_cases[match(rownames(fg_rr)[as.numeric(fg_cor1[,2])], dat$phenotype)]
fg_cor1 <- fg_cor1[(fg_cor1_case1==fg_cor1_case2),]
# print pairs
fg_pairs <- cbind(rownames(fg_rr)[fg_cor1[,1]], rownames(fg_rr)[fg_cor1[,2]])
datatable(fg_pairs,
rownames = F,
colnames = c("Phenotype 1","Phenotype 2"),
# extensions='FixedHeader',
selection="none",
style="bootstrap",
class="nowrap display",
escape=F,
options = list(autowidth=F, scrollY="400px", scrollX='400px', pageLength=nrow(fg_pairs), dom='t')#, fixedColumns=TRUE),
)
```
<br>
</div>
## Conclusion
For these pairs, we drop the phenotype listed in the second column as redundant. This leads to the removal of `r length(unique(rownames(fg_rr)[fg_cor1[,2]]))` [FinnGen](https://www.finngen.fi/en) phenotype codes, leaving `r length(unique(dat_load$phen_stem[!(dat_load$phen_stem %in% fg_pairs[,2])]))` unique phenotype codes among the GWAS.
```{r drop_finngen_dupe, echo=F}
dat_full$notes[dat_full$keep & (dat_full$phenotype %in% fg_pairs[,2])] <- "drop_finngen_dupe"
dat_full$keep[dat_full$phenotype %in% fg_pairs[,2]] <- F
dat$keep[dat$phenotype %in% fg_pairs[,2]] <- F
dat <- dat[dat$keep,]
rm(fg_nn)
rm(fg_rr)
```
<br>
# Sex-specific phenotypes
UKB includes a number of sex-specific phenotypes, meaning that not all phenotypes have a `both_sexes` version of the analysis. In addition, although some of these are specified by UKB and thus coded as such (e.g. all members of the non-applicable sex are marked as missing), others do not exclude non-applicable sex, for example treating them as controls. The Round 2 GWAS included strong efforts to address phenotypes with this issue, particularly among the [PHESANT](https://github.com/astheeggeggs/PHESANT) phenotypes, but we still have some GWAS where there's a `both_sexes` analysis of a sex-specific phenotype. Therefore the goal here is to verify that we have the appropriate version of each phenotype respecting sex-specificity where applicable.
Our process is as follows:
* For phenotypes where only a `male` or `female` version of the analysis exists (with no `both_sexes`), take that version.
* For phenotypes where both a `male` and `female` version of the analysis exists, verify that both sex-specific analyses have a non-trivial proportion of the sample size and of the number of cases. As long as that is true, take the `both_sexes` analysis, otherwise take the appropriate sex-specific analysis.
* For phenotypes with a `both_sexes` analysis but only one sex-stratified analysis, compare sample size and case count of the sex-specific analysis to the `both_sexes` analysis. If the sex-specific analysis is the dominant source of samples then use the sex-specific analysis, otherwise use the `both_sexes` analysis.
We inspect the results of this process to ensure face validity of the concluded sex-specificity of the phenotypes.
```{r sex_spec_setup, echo=FALSE}
# setup male/female data
datf$phen_stem <- as.character(datf$phenotype)
datf$phen_stem[is.na(datf$n_cases)] <- as.character(sapply(datf$phenotype[is.na(datf$n_cases)],function(a) strsplit(a, "_")[[1]][1]))
datf$keep <- rep(T, nrow(datf))
datf$notes <- rep("", nrow(datf))
datf$keep[datf$source=="biomarkers" & datf$dilute] <- F
datf$notes[datf$source=="biomarkers" & datf$dilute] <- "drop_bio_dilution"
datf$notes[datf$keep & grepl("_raw",datf$phenotype)] <- "drop_cont_raw"
datf$keep[datf$keep & grepl("_raw",datf$phenotype)] <- F
datf$notes[datf$keep & (datf$phenotype %in% fg_pairs[,2])] <- "drop_finngen_dupe"
datf$keep[datf$phenotype %in% fg_pairs[,2]] <- F
datf_full <- datf
datf <- datf_full[datf$keep,]
datm$phen_stem <- as.character(datm$phenotype)
datm$phen_stem[is.na(datm$n_cases)] <- as.character(sapply(datm$phenotype[is.na(datm$n_cases)],function(a) strsplit(a, "_")[[1]][1]))
datm$keep <- rep(T, nrow(datm))
datm$notes <- rep("", nrow(datm))
datm$keep[datm$source=="biomarkers" & datm$dilute] <- F
datm$notes[datm$source=="biomarkers" & datm$dilute] <- "drop_bio_dilution"
datm$notes[datm$keep & grepl("_raw",datm$phenotype)] <- "drop_cont_raw"
datm$keep[datm$keep & grepl("_raw",datm$phenotype)] <- F
datm$notes[datm$keep & (datm$phenotype %in% fg_pairs[,2])] <- "drop_finngen_dupe"
datm$keep[datm$phenotype %in% fg_pairs[,2]] <- F
datm_full <- datm
datm <- datm_full[datm$keep,c("phenotype","phen_stem","description","n","n_cases","n_controls","keep","notes")]
```
<br>
## Single sex GWAS only
<div class="well">
```{r single_sex, echo=FALSE}
# female only
femonly <- datf[!(datf$phen_stem %in% dat$phen_stem),]
# male only
malonly <- datm[!(datm$phen_stem %in% dat$phen_stem),]
```
**Female-only** <button class="btn btn-outline-primary btn-sm" data-toggle="collapse" data-target="#FemaleOnly"> Show/Hide </button>
<div id="FemaleOnly" class="collapse">
```{r femonly_list, echo=F, comment=NA}
paste0(femonly$description, " [", femonly$phenotype, "]")
rm(femonly)
```
</div>
<br>
**Male-only** <button class="btn btn-outline-primary btn-sm" data-toggle="collapse" data-target="#MaleOnly"> Show/Hide </button>
<div id="MaleOnly" class="collapse">
```{r malonly_list, echo=F, comment=NA}
print(paste0(malonly$description, " [", malonly$phenotype, "]"))
rm(malonly)
```
</div>
<br>
*Takeaway:* Very strong face validity for each of these lists.
</div>
### Conclusion
Take these lists as-is.
<br>
## GWASed within each sex separately
<div class="well">
```{r both_sex_present, echo=F, comment=NA}
# in both male and female
phens_bothsex <- datf$phenotype[datf$phenotype %in% datm$phenotype]
phens_bothsex_idxf <- match(phens_bothsex, datf$phenotype)
phens_bothsex_idxm <- match(phens_bothsex, datm$phenotype)
# compare Ns
sex_ns <- data.frame(phenotype=phens_bothsex,
description=datf$description[phens_bothsex_idxf],
n_fem=datf$n[phens_bothsex_idxf],
n_case_fem=datf$n_cases[phens_bothsex_idxf],
n_control_fem=datf$n_controls[phens_bothsex_idxf],
n_mal=datm$n[phens_bothsex_idxm],
n_case_mal=datm$n_cases[phens_bothsex_idxm],
n_control_mal=datm$n_controls[phens_bothsex_idxm])
pp <- plot_ly(sex_ns,
x=~n_fem,
y=~n_mal,
type="scatter",
mode="markers",
showlegend=F,
hoverinfo="text",
text=~paste0(
"Phenotype: ", description,
"<br>Female N: ", n_fem,
"<br>Male N: ", n_mal,
"<br>Proportion Female: ", round(n_fem/(n_fem+n_mal),3)),
width=400, height=400
) %>% layout(
xaxis = list(title="N in Female GWAS"),
yaxis = list(title="N in Male GWAS"),
margin=list(b=65)
)
htmltools::div( pp, align="center" )
```
*Takeaway:* Sample size is nicely divided male/female for this set of phenotypes.
<br>
```{r both_sex_present_cases, echo=F, comment=NA}
pp1 <- plot_ly(sex_ns[!(is.na(sex_ns$n_case_fem) & is.na(sex_ns$n_case_mal)),],
x=~n_case_fem/(n_case_fem+n_case_mal),
type="histogram",
showlegend=F,
hoverinfo="none",
width=400, height=400
) %>% layout(
xaxis = list(title="Proportion of cases who are female", range=c(0,1)),
title="cases",
margin=list(b=65)
)
pp2 <- plot_ly(sex_ns[!(is.na(sex_ns$n_case_fem) & is.na(sex_ns$n_case_mal)),],
x=~n_control_fem/(n_control_fem+n_control_mal),
type="histogram",
showlegend=F,
hoverinfo="none",
width=400, height=400
) %>% layout(
xaxis = list(title="Proportion of controls who are female", range=c(0,1)),
title="controls",
margin=list(b=65)
)
htmltools::div(
style = "display: flex; flex-wrap: wrap; justify-content: center;",
htmltools::div(pp1),
htmltools::div(pp2)
)
```
*Takeaway:* Cases and controls are also nicely balanced between males and females for most of these phenotypes.
<br>
We can examine more closely the phenotypes in the tails of these distributions, with $>85\%$ of cases or controls coming from one sex:
<style>
table { white-space: nowrap; }
</style>
```{r both_sex_present_cases_outlier, echo=F, comment=NA}
shared_dat <-
sex_ns[ (sex_ns$n_control_fem/(sex_ns$n_control_fem+sex_ns$n_control_mal) > .85 |
sex_ns$n_control_fem/(sex_ns$n_control_fem+sex_ns$n_control_mal) < .15 |
sex_ns$n_case_fem/(sex_ns$n_case_fem+sex_ns$n_case_mal) > .85 |
sex_ns$n_case_fem/(sex_ns$n_case_fem+sex_ns$n_case_mal) < .15) &
!is.na(sex_ns$n_case_fem),]
datatable(shared_dat,
rownames = F,
colnames = c("Code","Phenotype","N Female","N Case (F)","N Con. (F)","N Male","N Case (M)","N Con. (M)"),
# extensions='FixedHeader',
selection="none",
style="bootstrap",
class="nowrap display",
escape=F,
options = list(autowidth=F, scrollY="400px", scrollX='400px', pageLength=50, dom='t')#, fixedColumns=TRUE),
)
```
<!-- </div> -->
*Note:* Table can be scrolled to the right for sample sizes.
*Takeaway:* We note that although these are strongly sex-biased, they are not necessarily sex-specific. For example, this list contains a large number of codes for jobs with a history of being strongly gendered (e.g. nursing) and treatments most commonly recommended for strongly sex-biased phenotypes (e.g. calcium supplements for osteoporosis).
[*NB:* The majority of these phenotypes will also end up below our effective sample size threshold for high confidence results, as described below, and thus will not be a primary contributor to the top level heritability results regardless of the treatment of these phenotypes here.]
</div>
### Conclusion
Although a handful of these phenotypes are strongly sex-biased, none appear to be fully sex-specific. Therefore we take the `both_sexes` GWAS as the primary result for all phenotypes in this set.
<br>
## GWASed in `both_sexes` and one sex
```{r one_sex, echo=F}
dat_one_f <- dat[(dat$phenotype %in% datf$phenotype & !(dat$phenotype %in% datm$phenotype)),]
dat_one_m <- dat[(dat$phenotype %in% datm$phenotype & !(dat$phenotype %in% datf$phenotype)),]
```
There are `r nrow(dat_one_f)+nrow(dat_one_m)` phenotypes with a GWAS in `both_sexes` and one sex but not the other (specifically, `r nrow(dat_one_m)` in males and `r nrow(dat_one_f)` in females).
For all phenotypes in this category, we evaluate what proportion of their total sample size comes from the GWASed sex.
<div class="well">
```{r one_sex_n, echo=F}
phens_onesex_idxf <- match(dat_one_f$phenotype, datf$phenotype)
phens_onesex_idxm <- match(dat_one_m$phenotype, datm$phenotype)
# compare Ns
onesexf_ns <- data.frame(phenotype=dat_one_f$phenotype,
description=datf$description[phens_onesex_idxf],
n=dat_one_f$n,
n_case=dat_one_f$n_case,
n_control=dat_one_f$n_controls,
n_sex=datf$n[phens_onesex_idxf],
n_case_sex=datf$n_cases[phens_onesex_idxf],
n_control_sex=datf$n_controls[phens_onesex_idxf])
onesexm_ns <- data.frame(phenotype=dat_one_m$phenotype,
description=datm$description[phens_onesex_idxm],
n=dat_one_m$n,
n_case=dat_one_m$n_case,
n_control=dat_one_m$n_controls,
n_sex=datm$n[phens_onesex_idxm],
n_case_sex=datm$n_cases[phens_onesex_idxm],
n_control_sex=datm$n_controls[phens_onesex_idxm])
onesex_ns <- rbind(onesexf_ns, onesexm_ns)
rm(onesexf_ns)
rm(onesexm_ns)
pp <- plot_ly(onesex_ns,
x=~n_sex/n,
type="histogram",
showlegend=F,
hoverinfo="none",
width=400, height=400
) %>% layout(
xaxis = list(title="Prop. samples from GWASed sex", range=c(0,1)),
margin=list(b=65)
)
htmltools::div( pp, align="center" )
rm(dat_one_f)
rm(dat_one_m)
rm(datf)
rm(datm)
```
The visible outliers from this distribution ($>85\%$ of samples coming from the GWASed sex) are:
```{r one_sex_n_outlier, echo=F, comment=NA}
shared_dat <- onesex_ns[onesex_ns$n_sex/onesex_ns$n > 0.85,c("phenotype","description","n","n_sex")]
datatable(shared_dat,
rownames = F,
colnames = c("Code","Phenotype","Total N","GWAS Sex N"),
# extensions='FixedHeader',
selection="none",
style="bootstrap",
class="nowrap display",
escape=F,
options = list(autowidth=F, scrollY="400px", scrollX='400px', pageLength=nrow(onesex_ns), dom='t')#, fixedColumns=TRUE),
)
```
*Note:* Table can be scrolled to the right for sample sizes.
*Takeaway:* The majority of these outlying phenotypes clearly either are sex-specific (and just have a redundant `both_sexes` GWAS) or should be (i.e. the collection of traits here where only 2 individuals aren't in the primary sex). Note that this includes items that aren't on their face sex-specific but were asked as part of a sex-specific questionnaire (e.g. 6153 and 6177, administered to self-reported females and males, respectively).
The one exception is 5959 ("Previously smoked cigarettes on most/all days") which is predominantly male ($96.6\%$) but not exclusively gendered. The strong sex bias comes from it being a follow-up question only administered to individuals who report currently smoking "on most or all days" and who "mainly" smoke "cigars or pipes".
</div>
For binary phenotypes on this class (excluding the ones clearly indicated as sex-specific above), we additionally look at the proportion of cases and controls coming from the GWASed sex:
<div class="well">
```{r one_sex_case, echo=F}
pp1 <- plot_ly(onesex_ns[!is.na(onesex_ns$n_case) & !(onesex_ns$n_sex/onesex_ns$n > 0.97),],
x=~n_case_sex/n_case,
type="histogram",
showlegend=F,
hoverinfo="none",
width=400, height=400
) %>% layout(
xaxis = list(title="Prop. cases from GWASed sex", range=c(0,1)),
title = "cases",
margin=list(b=65)
)
pp2 <- plot_ly(onesex_ns[!is.na(onesex_ns$n_case) & !(onesex_ns$n_sex/onesex_ns$n > 0.97),],
x=~n_control_sex/n_control,
type="histogram",
showlegend=F,
hoverinfo="none",
width=400, height=400
) %>% layout(
xaxis = list(title="Prop. controls from GWASed sex", range=c(0,1)),
title = "controls",
margin=list(b=65)
)
htmltools::div(
style = "display: flex; flex-wrap: wrap; justify-content: center",
htmltools::div(pp1),
htmltools::div(pp2)
)
```
The outliers from these two distributions, again using the $>85\%$ threshold, are:
```{r one_sex_ncase_outlier, echo=F, comment=NA}
shared_dat <- onesex_ns[
!is.na(onesex_ns$n_case) &
!(onesex_ns$n_sex/onesex_ns$n > 0.97) &
((onesex_ns$n_case_sex/onesex_ns$n_case > 0.85) | (onesex_ns$n_control_sex/onesex_ns$n_control > 0.85)),
c("phenotype","description","n_case","n_case_sex","n_control","n_control_sex")]
datatable(shared_dat,
rownames = F,
colnames = c("Code","Phenotype","Total Case","GWAS Sex Case","Total Control","GWAS Sex Control"),
# extensions='FixedHeader',
selection="none",
style="bootstrap",
class="nowrap display",
escape=F,
options = list(autowidth=F, scrollY="400px", scrollX='400px', pageLength=nrow(onesex_ns), dom='t')#, fixedColumns=TRUE),
)
```
*Note:* Table can be scrolled to the right for sample sizes.
*Takeaway:* This list of phenotypes with cases and/or controls primarily from one sex is dominanted by job codes and medical outcomes that are either strongly sex biased or (in the case of some medical phenotypes, e.g. pregnancy-related ICD codes) sex-specific. Considering the $97\%$ threshold used to evaluate total sample size above, none of the phenotypes with $85-97\%$ of cases or controls from a single sex are definitionally-sex specific, instead they are all instances where cases/controls from the rarer sex are entirely plausible. The `both_sexes` analysis of these phenotypes therefore seems appropriate the treat as the "primary" analysis (though we revisit expectations about the stability of these results below).
For the remaining phenotypes with >$97\%$ of cases and/or controls coming from the GWASed sex (relisted below), we see three clear scenarios:
* job codes with a very strong sex bias, up to and including jobs only reported in a single sex (e.g. dinner lady, midwife, bricklayer, pipe fitter)
* medical ICD codes that are sex-specific (e.g. disorders of the prostate or overies)
* medical ICD codes that are very strongly sex-biased but can occur in either sex (e.g. breast cancer)
Following the standard applied above, we opt to use the sex-specific GWAS as the primary result for the truely sex-specific medical phenotypes, but keep the `both_sexes` analysis for the other two scenarios where cases/controls from both sexes are reasonable despite their rarity.
```{r one_sex_ncase_outlier97, echo=F, comment=NA}
shared_dat <- onesex_ns[
!is.na(onesex_ns$n_case) &
!(onesex_ns$n_sex/onesex_ns$n > 0.97) &
((onesex_ns$n_case_sex/onesex_ns$n_case > 0.97) | (onesex_ns$n_control_sex/onesex_ns$n_control > 0.97)),
c("phenotype","description","n_case","n_case_sex","n_control","n_control_sex")]
datatable(shared_dat,
rownames = F,
colnames = c("Code","Phenotype","Total Case","GWAS Sex Case","Total Control","GWAS Sex Control"),
# extensions='FixedHeader',
selection="none",
style="bootstrap",
class="nowrap display",
escape=F,
options = list(autowidth=F, scrollY="400px", scrollX='400px', pageLength=nrow(onesex_ns), dom='t')#, fixedColumns=TRUE),
)
```
*Note:* Table can be scrolled to the right for sample sizes.
</div>
```{r resolve_sex_spec, echo=F, comment=NA}
# previously have dat, datf, datm
# with _full versions of each that additionally have "keep" and "notes"
# here: assemble kept versions from sex checks
# no both_sex = keep the sex-specific
# = no existing entries that get dropped
# both sexes have a separate gwas = keep both_sex
# = drop the sex-specific entries
datm_full$keep[datm_full$phenotype %in% datf_full$phenotype] <- F
datm_full$notes[(datm_full$phenotype %in% datf_full$phenotype) & datm_full$notes == ""] <- "both_sexes_gwased"
datf_full$keep[datf_full$phenotype %in% datm_full$phenotype] <- F
datf_full$notes[(datf_full$phenotype %in% datm_full$phenotype) & datf_full$notes == ""] <- "both_sexes_gwased"
# gwas in one sex+both, and <97% n/case/control from one = keep both_sexes
idxm <- match(onesex_ns$phenotype[
(onesex_ns$n_sex / onesex_ns$n < 0.97) &
(is.na(onesex_ns$n_case) |
(!is.na(onesex_ns$n_case) & (onesex_ns$n_case_sex/onesex_ns$n_case < 0.97) &
(onesex_ns$n_control_sex/onesex_ns$n_control < 0.97)))
],
datm_full$phenotype)
idxm <- idxm[!is.na(idxm)]
idxf <- match(onesex_ns$phenotype[
(onesex_ns$n_sex / onesex_ns$n < 0.97) &
(is.na(onesex_ns$n_case) |
(!is.na(onesex_ns$n_case) & (onesex_ns$n_case_sex/onesex_ns$n_case < 0.97) &
(onesex_ns$n_control_sex/onesex_ns$n_control < 0.97)))
],
datf_full$phenotype)
idxf <- idxf[!is.na(idxf)]
datm_full$keep[idxm] <- F
datm_full$notes[idxm[datm_full$notes[idxm]==""]] <- "not_sex_spec"
datf_full$keep[idxf] <- F
datf_full$notes[idxf[datf_full$notes[idxf]==""]] <- "not_sex_spec"
# gwas in one sex+both, and n > 97% from one = single sex
idx <- match(onesex_ns$phenotype[onesex_ns$n_sex / onesex_ns$n >= 0.97], dat_full$phenotype)
dat_full$keep[idx] <- F
dat_full$notes[idx[dat_full$notes[idx]==""]] <- "sex_spec_n"
# gwas in one sex+both, and ncas/con sex spec (>.997) and not job codes = use single sex gwas
idx <- match(onesex_ns$phenotype[(onesex_ns$n_sex / onesex_ns$n < 0.97) &