correcting minor issues and adding annotation to phenotypes file

MRCIEU · Jan 13, 2025 · bcd8ea9 · bcd8ea9
1 parent a85410e
commit bcd8ea9
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 23 deletions.
diff --git a/resources/phenotypes/organise_phenotypes.rmd b/resources/phenotypes/organise_phenotypes.rmd
@@ -8,11 +8,12 @@ Files needed to input into this:
  - configuration environment
  - phenotype description (name, id, units, min, max etc..)
  - genetic covariates (id, PC's, sex*)
- - phenotype file (id, age, phenotype value, covariates)
+ - phenotype file (id, age, phenotype value, age varying covariates)
 
-Minimum phenotype columns required: 'FID' 'IID' 'age' 'value' then any covariates. Value is the value of the trait, column name should be 'value' and the file name the id of the trait (e.g. 'bmi.txt' for body mass index). Phenotypic covariates (and column names) required for each trait are described in "phenotypes_list" csv file. 
+Minimum phenotype columns required: 'FID' 'IID' 'age' 'value' then any required covariates**. Value is the value of the trait, column name should be 'value' and the file name the id of the trait (e.g. 'bmi.txt' for body mass index). Phenotypic covariates (and column names) required for each trait are described in "phenotypes_list" csv file. 
 
-*Sex can be input in either genetic covariates file or phenotype data file but is assumed to be coded as 1 - male, 2 - female.
+*Sex is assumed to be coded as 1 - male, 2 - female.
+**If observations for either blood pressure or cholesterol medication are missing they should be included as 0. If this information has not been recorded (or is assumed to be 0 for all individuals) please include a column of 0's with the appropriate heading in the phenotypic data. 
 
 Other notes:
 

diff --git a/resources/phenotypes/phenotype_functions.R b/resources/phenotypes/phenotype_functions.R
@@ -22,9 +22,6 @@ read_phenotype_data <- function(phecode, input_dir, agebins, covlist=NULL) {
     stop("expect 'FID', 'IID', 'age', 'value' columns to be present")
   }
 
-  # Keep only required columns
-  #phen <- phen %>% dplyr::select(all_of(column_names))
-
   # Remove duplicates by FID,IID,age
   phen <- subset(phen, !duplicated(paste(FID, IID, age)))
 
@@ -46,8 +43,6 @@ read_phenotype_data <- function(phecode, input_dir, agebins, covlist=NULL) {
   phen <- subset(phen, select=c("FID", "IID", "age", "value", covlist))
 
   # Cut into age bins
-  # phen$agebin <- cut(phen$age+1, breaks=c(0:19, seq(20, 120, by=5)))
-  # levels(phen$agebin) <- levels(phen$agebin) %>% gsub("\\(", "", .) %>% gsub("]", "", .) %>% gsub(",", "-", .)
   phen <- make_agebin(phen, agebins)
 
   return(phen)
@@ -176,7 +171,9 @@ summarise_phen <- function(data) {
       m_age=mean(age, na.rm=T),
       sd_age=sd(age, na.rm=T),
       m_age2=mean(age^2,na.rm=T),
-      m_age3=mean(age^3,na.rm=T)
+      sd_age2=sd(age^2, na.rm=T),
+      m_age3=mean(age^3,na.rm=T),
+      sd_age3=sd(age^3, na.rm=T)
     )
 }
 
@@ -317,19 +314,44 @@ phenoplot <- function(Yvbl, Xvbl, Quantiles=c(0.25,0.5,0.75), knots=NA, Nknots=0
   if (plotname!="") dev.off()
 }
 
+
+###############################
+##Function to convert the phenotype data into what we need for the GWAS
+#includes:
+#-importing the phenotype data
+#-categorising age groups
+#-removing outliers
+#-data transformation
+#-dividing the data into the approprate age groups
+#-checking sample size for each group
+#-saving the file for the GWAS for each group that passes the threshold
+
+
+##For most phenotypes outlier removal and transformation is done based on the phenotypes list csv file
+##Pulse pressure and bmiz are generated from other data provided and the outlier removal values and transformations are set in this code. 
+
+
 organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebins, pl=TRUE) {
 
+  ##read in the data and any age varying covariates (that are in the same file as the phenotypic data)
+
   if(phecode != "pp" & phecode != "bmiz"){
     type <- filter(df, pheno_id == phecode)$var_type
     str(filter(df, pheno_id == phecode))
     cs <- list()
     adjust_covs <- NULL
     if(phecode == "ldl"){
       adjust_covs <- "cholesterol_med"
-      }
+    }
+    if(phecode == "sbp" | phecode == "dbp"){
+      adjust_covs <- "bp_med"
+    }
     phen <- read_phenotype_data(phecode, Sys.getenv("phenotype_input_dir"), agebins, adjust_covs)
   }
 
+  ##read in data for dpb and sbp and generate pulse pressure
+  #exclude outliers based on dbp, sbp or pp
+
   if(phecode == "pp") {
     type <- "cont"
     cs <- list()
@@ -353,6 +375,7 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
     phen2 <- remove_outliers(phen2, phecode2)
     phen2 <- rename(phen2, dbp = value)
 
+    #join together the dbp and sbp data - these should only match if all of the other variables match (i.e. ID value, age and bp_med need to match for the variables to be merged)
     phen <- inner_join(phen1, phen2)
     phen$value <- phen$sbp - phen$dbp
     phen <- phen %>%
@@ -367,6 +390,9 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
 
   }
 
+  ##read in bmi data and generate bmiz for children - only keeps data for <= 18, adults are excluded for bmiz
+  #outliers for bmiz are excluded based on having a bmiz <-5 or >5
+
   if(phecode=="bmiz"){
     type <- "cont"
     cs <- list()
@@ -406,6 +432,7 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
 
   }
 
+  ##return null and move on to the next trait if this one is missing
 
   if(is.null(phen)) {
     return(NULL)
@@ -414,7 +441,7 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
   # Only one genetic observation per person
   setequal(unique(gen_covs$IID), gen_covs$IID)
 
-  # Merge covariates with data
+  # Merge genetic covariates with phenotypic data
   dat <- phen %>%
     left_join(gen_covs, by = c("FID", "IID")) %>%
     na.exclude
@@ -452,8 +479,11 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
 
 
   #remove outliers and plot the remaining data
-  #The Jitter applied to the plot so no actual observations are plotted
+  #A jitter is applied to the plot so no actual observations are plotted
 
+  #remove outliers based on values in the phenotypes description for every trait except pp and bmiz
+  #outliers for pp and bmiz are removed at the point the data is imported. 
+
   if(phecode != "pp" & phecode != "bmiz"){
   outliers <- detect_outliers(dat, phecode)
   analysis_data <- remove_outliers(dat, phecode)
@@ -462,16 +492,21 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
   }
 
 
+  ##This will break the code if all values are removed as outliers - this breaks rather than moving on to the next trait as it should only happen if there is an error
+  #e.g. incorrectly defined outliers, phenotype values entered in the wrong units etc. 
+
   if(length(analysis_data$value) == 0){
     stop(paste(phecode,"All observations removed as outliers"))
   }
 
+  #generate the plot 
   age_range = max(analysis_data$age) - min(analysis_data$age)
   y_sd = sd(analysis_data$value)
 
   phenoplot(analysis_data$value, analysis_data$age, Quantiles=c(0.05,0.25,0.5,0.75,0.95), knots=NA, Nknots=5, jitter_X=0.5, jitter_Y=0.1, plotname=ifelse(pl, "", phecode), title=phecode)
 
-  # medication adjustment - note this assumes that the value in the phenotype covariates is the same as the column name
+
+  # medication adjustment for ldl, sbp and dbp
 
   if(phecode=="ldl"){
     analysis_data$value <- analysis_data$value + (0.40 * analysis_data$value * analysis_data$cholesterol_med)
@@ -485,6 +520,10 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
     analysis_data$value <- analysis_data$value + (10 * analysis_data$bp_med)
   }
 
+
+  ##state the transformation that will be applied
+  #no transformation is applied to bmiz as its already a standardised variable
+
   if(phecode != "pp" & phecode != "bmiz"){
   if (filter(df, pheno_id == phecode)$transformation=="log") {
     print("log transformed")
@@ -507,7 +546,10 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
   } 
 
 
-  # Counts of sample size in each age group
+  ####By age group (1 year <20, 5 years >= 20)
+  ############################################
+
+  # Generate counts of sample size in each age group by sex and combined
 
   cats_sexspec <- analysis_data %>% 
     group_by(agebin, sex) %>%
@@ -526,18 +568,21 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
   cats <- bind_rows(cats_sexspec, cats)
   cs$categories <- cats 
 
+  #filter so know which age group have a large enough sample and make the sex combined sex=3
   cats <- filter(cats, n >= as.numeric(Sys.getenv("env_minumum_strat_n"))) %>%
     replace_na(list(sex=3))
 
   summary_output <- list()
 
-
+  ###this section operates on each age/sex (including sex combined) group individually
   if(length(cats$n>=1)){
     for(k in 1:length(cats$n)){
 
       age_group <- cats$agebin[k]
       sex_group <- cats$sex[k]
 
+      #select the individual observations from the phenotypes file that are the desired age/sex
+      #check and remove one of any duplicated individuals in that group
       if(sex_group < 3) {
         pheno_out <- filter(analysis_data, (agebin == cats$agebin[k])) %>% 
           filter(sex == cats$sex[k]) %>%
@@ -549,10 +594,12 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
           select(FID, IID, value, age, sex)
       }
 
+
       #summarise the pretransformed variable 
       sumstats <- summarise_phen(pheno_out)
 
       ## apply transformation and standardisation if required
+      #applied to all traits except pp and bmiz based on phenotypes description file
       if(phecode != "pp" & phecode != "bmiz"){
       if (filter(df, pheno_id == phecode)$transformation=="log") {
         pheno_out$value <- log(pheno_out$value)
@@ -570,10 +617,13 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
         pheno_out$value <- pheno_out$value
       } 
       }
+
+      #pp is standardised bit not transformed
       if(phecode == "pp") {
         pheno_out$value <- pheno_out$value/sd(pheno_out$value)
       } 
 
+      #define the sex groups for saving 
       if(sex_group == 1){
         sex_out = "m"
       } else if (sex_group == 2){
@@ -582,18 +632,26 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
         sex_out = "both"
       }
 
+      #save the (transformed) phenotype values for the GWAS
       write.table(subset(pheno_out, select=c("FID", "IID", "value")), file=file.path(Sys.getenv("phenotype_processed_dir"), paste0(phecode, "_", age_group, "_", sex_out, ".phen")), row=FALSE, col=FALSE, qu=FALSE)
 
+      #define the covariates that are saved as adjustments for the GWAS
       if(phecode != "pp" & phecode != "bmiz"){
       cov_ids <- subset(df, pheno_id == phecode)$covs %>% strsplit(., ":") %>% {.[[1]]}
       }
       if(phecode == "pp"){
-        cov_ids <- "sex"
+        cov_ids <- c("sex", "bp_med")
       }
       if(phecode == "bmiz"){
         cov_ids <- "sex"
       }
-      covs <- dat %>% select(all_of(c(names(gen_covs), cov_ids))) %>% filter(IID %in% pheno_out$IID) %>% filter(!duplicated(paste(FID, IID)))
+
+      #select these observations from the original data and remove 
+      covs <- analysis_dat %>% 
+        select(all_of(c(names(gen_covs), cov_ids))) %>% 
+        filter(agebin == age_group) %>%
+        filter(IID %in% pheno_out$IID) %>% 
+        filter(!duplicated(paste(FID, IID)))
 
       write.table(covs, file=file.path(Sys.getenv("phenotype_processed_dir"), paste0(phecode, "_", age_group, "_", sex_out, ".covs")), row=FALSE, col=FALSE, qu=FALSE)
 
@@ -605,7 +663,13 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
     print(cs$sums)
   }
 
-  # counts of sample size in each lifestage group
+
+  ####By lifestage category
+  ############################################
+  #This section repeats the section above but for each lifestage category rather than each age group
+
+
+  #generate counts of sample size in each lifestage group by sex and combined
 
   catsls_ss <- analysis_data %>% 
     group_by(lsbin, sex) %>%
@@ -621,18 +685,23 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
 
   catsls <- bind_rows(catsls_ss, catsls)
   cs$categories_ls <- catsls
-
+
+  #keep the categories where the sample size is greater than the minimum
   catsls <- filter(catsls, n >= as.numeric(Sys.getenv("env_minumum_strat_n"))) %>%
    replace_na(list(sex=3))
 
   summary_output <- list()
 
+
+  ##From here the code is applied to keep age bin and sex combination
+
   if(length(catsls$n>=1)){
     for(k in 1:length(catsls$n)){
 
       age_group <- catsls$lsbin[k]    
       sex_group <- catsls$sex[k] 
 
+      #keep the data from the main dataset that is part of this group and ensure that no individuals are repeated. 
       if(sex_group < 3) {
         pheno_out <- filter(analysis_data, (lsbin == catsls$lsbin[k])) %>% 
         filter(sex == catsls$sex[k]) %>%
@@ -644,6 +713,8 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
         select(FID, IID, value, age, sex)
       }
 
+      #apply the required transformation to the data based on the phenotypes info file 
+      #pp is standardised below, nothing is done to bmiz as it is already standardised
       if(phecode != "pp" & phecode != "bmiz"){
       if (filter(df, pheno_id == phecode)$transformation=="log") {
         pheno_out$value <- log(pheno_out$value)
@@ -654,18 +725,20 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
       }
       }
 
-      ## apply standardisation if required
+      ## apply standardisation if required for traits other than bmiz and pp
       if(phecode != "pp" & phecode != "bmiz"){
       if (filter(df, pheno_id == phecode)$standardisation=="yes") {
         pheno_out$value <- pheno_out$value/sd(pheno_out$value)
       } else if(filter(df, pheno_id == phecode)$standardisation=="no") {
         pheno_out$value <- pheno_out$value
       }
       }
+      #for pp the data is standardised but not transformed
       if(phecode == "pp"){
         pheno_out$value <- pheno_out$value/sd(pheno_out$value)
       }
 
+      #define the sex based on the code 
       if(sex_group == 1){
         sex_out = "m"
       } else if (sex_group == 2){
@@ -674,20 +747,25 @@ organise_phenotype <- function(phecode, phenotypes, df, gen_covs, covdat, agebin
         sex_out = "both"
       }
 
-
+      ##save the phenotype data for the GWAS
       write.table(subset(pheno_out, select=c("FID", "IID", "value")), file=file.path(Sys.getenv("phenotype_processed_dir"), paste0(phecode, "_", age_group, "_", sex_out, ".phen")), row=FALSE, col=FALSE, qu=FALSE)
 
       if(phecode != "pp" & phecode != "bmiz"){
       cov_ids <- subset(df, pheno_id == phecode)$covs %>% strsplit(., ":") %>% {.[[1]]}
       }
       if(phecode == "pp"){
-        cov_ids <- "sex"
+        cov_ids <- c("sex", "bp_med")
       }
       if(phecode == "bmiz"){
         cov_ids <- "sex"
       }
-      covs <- dat %>% select(all_of(c(names(gen_covs), cov_ids, "age"))) %>% filter(IID %in% pheno_out$IID) %>% filter(!duplicated(paste(FID, IID)))
+      covs <- analysis_dat %>% 
+        select(all_of(c(names(gen_covs), cov_ids, "age"))) %>% 
+        filter(agebin == age_group) %>%
+        filter(IID %in% pheno_out$IID) %>% 
+        filter(!duplicated(paste(FID, IID)))
 
+      #save the covariates for the GWAS
       write.table(covs, file=file.path(Sys.getenv("phenotype_processed_dir"), paste0(phecode, "_", age_group, "_", sex_out, ".covs")), row=FALSE, col=FALSE, qu=FALSE)
 
       sumstats <- summarise_phen(pheno_out)