-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTutorial-for-Phenotype-File.py
318 lines (148 loc) · 14.3 KB
/
Tutorial-for-Phenotype-File.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', 120)
##################################################
# This tutorial uses the WHI dataset as an example for generating phenotype files. Users are advised to make a copy of this script rather than edit directly.
# Users should replace the file path and file name variables to reflect that of their own system.
# Load in the necessary files which will be used for building the phenotype dataset #
# We will load in the files which contain the phenotype data we want to inlcude in the analysis.
# First, we will define the directories which contain the phenotype files as c1_pheno_data_dir and c2_pheno_data_dir, where the 'c' in c1 and c2 represents consent group.
# Therefore, you can add additional lines if there is more than 2 consent groups (or delete the second consent group) depending on your study.
c1_pheno_data_dir = "/usr/path/to/files/consentgroup1"
c2_pheno_data_dir = "/usr/path/to/files/consentgroup2"
# In the step above, we defined the directories which contain the phenotype files as c1_pheno_data_dir etc.
# In the code below, you can rename the dataframe name (currently c1_form2) with any name you wish to use.
# Then, you can simple replace the file name within the quotation marks ("") with the name of the file you wish to use.
c1_form2 = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht000998.v6.p3.c1.f2_rel1.HMB-IRB.txt"), sep="\t", comment="#")
c2_form2 = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht000998.v6.p3.c2.f2_rel1.HMB-IRB-NPU.txt"), sep="\t", comment="#")
form2 = pd.concat([c1_form2, c2_form2], axis=0, ignore_index=True)
c1_form30 = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht000999.v6.p3.c1.f30_rel2.HMB-IRB.txt"),sep="\t", comment="#")
c2_form30 = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht000999.v6.p3.c2.f30_rel2.HMB-IRB-NPU.txt"), sep="\t", comment="#")
form30 = pd.concat([c1_form30, c2_form30], axis=0, ignore_index=True)
c1_form41 = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht001009.v6.p3.c1.f41_rel1.HMB-IRB.txt"), sep="\t", comment="#")
c2_form41 = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht001009.v6.p3.c2.f41_rel1.HMB-IRB-NPU.txt"), sep="\t", comment="#")
form41 = pd.concat([c1_form41, c2_form41], axis=0, ignore_index=True)
c1_form80 = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht001019.v6.p3.c1.f80_rel1.HMB-IRB.txt"), sep="\t",comment="#")
c2_form80 = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht001019.v6.p3.c2.f80_rel1.HMB-IRB-NPU.txt"), sep="\t",comment="#")
form80 = pd.concat([c1_form80, c2_form80], axis=0, ignore_index=True)
c1_outc_ct_os = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht003407.v3.p3.c1.outc_ct_os_rel4.HMB-IRB.txt"), sep="\t", comment="#")
c2_outc_ct_os = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht003407.v3.p3.c2.outc_ct_os_rel4.HMB-IRB-NPU.txt"), sep="\t", comment="#")
outc_ct_os = pd.concat([c1_outc_ct_os, c2_outc_ct_os], axis=0, ignore_index=True)
c1_outc_cancer_rel4 = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht003404.v3.p3.c1.outc_cancer_rel4.HMB-IRB.txt"), sep="\t", comment="#")
c2_outc_cancer_rel4 = pd.read_csv(os.path.join(c2_pheno_data_dir, "phs000200.v11.pht003404.v3.p3.c2.outc_cancer_rel4.HMB-IRB-NPU.txt"), sep="\t", comment="#")
outc_cancer_rel4 = pd.concat([c1_outc_cancer_rel4, c2_outc_cancer_rel4], axis=0, ignore_index=True)
# These files are not part of the phenotype data that you are interested in per se, but are important for extracting the samples which only are present in the study you are investigating (i.e. the harmonized and imputed data)
WHI_Sample = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht001032.v8.p3.WHI_Sample.MULTI.txt"), sep="\t", comment="#")
WHI_Subject = pd.read_csv(os.path.join(c1_pheno_data_dir, "phs000200.v11.pht000982.v8.p3.WHI_Subject.MULTI.txt"), sep="\t", comment="#")
# Loading the phenotype file for genotype data #
# The next set of files we will use contain information about the biological sample that was taken for conducting the genotyping and are stored within the phenotype subdirectory of the genotype directory.
# We will use this files to extract the biological samples which are from the study we are interested in (i.e. SHARE or WHIMS)
c1_genopheno_data_dir = "/home/anbennett2/scratch/datasets/raw_data/dbgap_data/WHI/746/c1/PhenotypeFiles"
c2_genopheno_data_dir = "/home/anbennett2/scratch/datasets/raw_data/dbgap_data/WHI/746/c2/PhenotypeFiles"
c1_746_pheno = pd.read_csv(os.path.join(c1_genopheno_data_dir, "phs000746.v2.pht004719.v1.p3.c1.WHI_Imputation_Sample_Attributes.HMB-IRB.txt"), sep="\t", comment="#")
c2_746_pheno = pd.read_csv(os.path.join(c2_genopheno_data_dir, "phs000746.v2.pht004719.v1.p3.c2.WHI_Imputation_Sample_Attributes.HMB-IRB-NPU.txt"), sep="\t", comment="#")
phenogeno = pd.concat([c1_746_pheno, c2_746_pheno], axis=0, ignore_index=True)
# Loading the samples in the genotype dataset #
# Here we will define the directory that contains the genotype files. This is because we will use the _allchr.tsv file produced in the convert-mac-module.sh script.
GARNET_genodir= "/home/anbennett2/scratch/datasets/processed_data/dbgap/WHI/GARNET_dataset"
WHIMS_genodir= "/home/anbennett2/scratch/datasets/processed_data/dbgap/WHI/WHIMS_dataset"
GARNET_geno_samples = pd.read_csv(os.path.join(GARNET_genodir, "GARNET_allchr.tsv"), sep="\t", comment="#", names=["FID", "IID", "In-Fam-ID-Dad", "In-Fam-ID-Mum", "SEX", "Pheno"])
WHIMS_geno_samples = pd.read_csv(os.path.join(WHIMS_genodir, "WHIMS_allchr.tsv"), sep="\t", comment="#", names=["FID", "IID", "In-Fam-ID-Dad", "In-Fam-ID-Mum", "SEX", "Pheno"])
WHI_Sample_filt_746 = WHI_Sample[WHI_Sample.STUDY.str.match('phs000746')]
GARNET_dataset_temp1= pd.merge(left=GARNET_geno_samples, right=WHI_Sample_filt_746[['SAMPLE_ID', 'dbGaP_Subject_ID']], how='left', left_on='FID', right_on='SAMPLE_ID')
WHIMS_dataset_temp1= pd.merge(left=WHIMS_geno_samples, right=WHI_Sample_filt_746[['SAMPLE_ID', 'dbGaP_Subject_ID']], how='left', left_on='FID', right_on='SAMPLE_ID')
# check for duplicates:
GARNET_dataset_temp1.drop_duplicates(subset=['SAMPLE_ID'])
WHIMS_dataset_temp1.drop_duplicates(subset=['SAMPLE_ID'])
WHI_Sample_filt_746.drop_duplicates(subset=['SAMPLE_ID'])
#######
# Create a new dataframe which contains the phenotype data relevant to the GWAS. First, an empty df is created and db_GaP_Subject_ID is added using form 2, as it contains basic information about the patients,
# without which we cannot control for key confounders - hence this information is essential:
pancan_occ = outc_cancer_rel4[outc_cancer_rel4['PANCREAS'] == 1.0].copy()
pancan_occ.reset_index(inplace=True,drop=True)
pancan_occ['case'] = 2
pancan_occ.drop_duplicates(subset=['dbGaP_Subject_ID'])
form80_BMI=form80.groupby(by='dbGaP_Subject_ID')['BMIX'].mean()
form80_BMI=pd.DataFrame(form80_BMI)
form80_BMI.reset_index(inplace=True)
form80_WHRX=form80.groupby(by='dbGaP_Subject_ID')['WHRX'].mean()
form80_WHRX=pd.DataFrame(form80_WHRX)
form80_WHRX.reset_index(inplace=True)
# Create empty DF with subject ID column based on form 2
subject_ID = pd.DataFrame()
subject_ID['dbGaP_Subject_ID'] = form2['dbGaP_Subject_ID'].copy()
# - Merge the necessary columns to the above dfs:
# Extract and merge the columns needed for my phenotype file
my_pheno_file = pd.DataFrame()
my_pheno_file['dbGaP_Subject_ID'] = subject_ID['dbGaP_Subject_ID']
my_pheno_file = pd.merge(left=my_pheno_file, right=form2[['dbGaP_Subject_ID', 'AGE', 'RACE', 'DIAB']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=form41[['dbGaP_Subject_ID','SPANISH', 'BLACK']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=form30[['dbGaP_Subject_ID', 'PANCREAT']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=outc_ct_os[['dbGaP_Subject_ID', 'PANCREASDY', 'DEATH','DEATHDY', 'ENDWHIDY', 'ENDFOLLOWDY']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=pancan_occ[['dbGaP_Subject_ID', 'PANCREAS', 'ICDCODE', 'BEHAVIOR', 'DIAGSTAT', 'LATERAL', 'MRPHHISTB', 'GRADING', 'SIZE', 'STAGE', 'case']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=form80_BMI, how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
my_pheno_file = pd.merge(left=my_pheno_file, right=form80_WHRX[['dbGaP_Subject_ID', 'WHRX']], how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
#can check with :
my_pheno_file[my_pheno_file['case'] == 2]
my_pheno_file.drop_duplicates(subset=['dbGaP_Subject_ID'])
###############################################################################################################################################################################
## Add column denoting sex:
my_pheno_file['sex'] = 2
# In plink, the sex is denoted as: 1=male/2=female/{0,NA}. Therefore, replace the values accordingly:
#my_pheno_file[my_pheno_file['sex'] == 'M'] = 1
#my_pheno_file[my_pheno_file['sex'] == 'F'] = 2
#my_pheno_file['sex'].loc[my_pheno_file['sex-temp'] == 'F'] = 2
###################################################################################################################################################################################################
# Clean and transform the datasets
## Rename the columns
my_pheno_file = my_pheno_file.rename(columns={"RACE": "race", "DIAB": "HISTORY_DIABETES", "SPANISH": "SPANISH_RACE", "PANCREAT": "PANCREATITIS", "PANCREASDY": "AGE_DIAGNOSED", "DEATH": "DECEASED", "DEATHDY": "AGE_DEATH", "ENDFOLLOWDY": "AGE_END_FOLLOW", "PANCREAS": "PAN_CAN","ICDCODE": "PAN_CAN_LOCATION","BEHAVIOR": "TUMOUR_BEHAVIOUR", "DIAGSTAT": "DIAGNOSIS_STATUS", "LATERAL": "LATERALITY", "MRPHHISTB": "TUMOUR_MORPHOLOGY", "GRADING": "TUMOUR_GRADE", "SIZE": "TUMOUR_SIZE", "STAGE": "TUMOUR_STAGE", "BMIX": "MEAN_BMI", "WHRX": "MEAN_WHR"})
# Transform the columns as required for the analysis. Further information can be found within the WHI_SHARE_phenovariables_plan.xml file in the project directory.
## Convert age days enrolled from diagnosis to age diagnosed
my_pheno_file.AGE_DIAGNOSED = my_pheno_file.AGE + (my_pheno_file.AGE_DIAGNOSED / 365)
##Convert days till death to age of death in years
my_pheno_file.AGE_DEATH = my_pheno_file.AGE + (my_pheno_file.AGE_DEATH / 365)
## Calculate number of months survived with pancreatic cancer
my_pheno_file['PAN_CAN_MONTHS_SURVIVED'] = (my_pheno_file.AGE_DEATH - my_pheno_file.AGE_DIAGNOSED) * 12
## Convert the number of days till end follow up to their age at end of follow up
my_pheno_file.AGE_END_FOLLOW = my_pheno_file.AGE + (my_pheno_file.AGE_END_FOLLOW / 365)
GARNET_dataset_temp2 = pd.merge(left=GARNET_dataset_temp1, right=my_pheno_file, how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
WHIMS_dataset_temp2 = pd.merge(left=WHIMS_dataset_temp1, right=my_pheno_file, how='left', left_on='dbGaP_Subject_ID', right_on='dbGaP_Subject_ID')
GARNET_dataset_temp2['case']
GARNET_dataset_temp2[GARNET_dataset_temp2['case'] == 2]
WHIMS_dataset_temp2['case']
WHIMS_dataset_temp2[WHIMS_dataset_temp2['case'] == 2]
GARNET_dataset_pfam = GARNET_dataset_temp2[['FID', 'IID', 'In-Fam-ID-Dad', 'In-Fam-ID-Mum', 'sex', 'case']].copy()
GARNET_dataset_pfam['case'] = GARNET_dataset_pfam ['case'].fillna(1)
WHIMS_dataset_pfam = WHIMS_dataset_temp2[['FID', 'IID', 'In-Fam-ID-Dad', 'In-Fam-ID-Mum', 'sex', 'case']].copy()
WHIMS_dataset_pfam['case'] = WHIMS_dataset_pfam ['case'].fillna(1)
GARNET_dataset_pheno=GARNET_dataset_pfam[['FID', 'IID', 'case']].copy()
WHIMS_dataset_pheno=WHIMS_dataset_pfam[['FID', 'IID', 'case']].copy()
## Generate Covariate File ##
# Make a covariate file by extracting the required variables.
# Copy covariates to the covariate file so that any changes to this dataframe will not affect the original dataframe
GARNET_dataset_covar_temp1 = GARNET_dataset_temp2[['FID', 'IID', 'AGE', 'race', 'HISTORY_DIABETES', 'MEAN_BMI', 'MEAN_WHR', 'PANCREATITIS', 'PAN_CAN_MONTHS_SURVIVED', 'DECEASED']].copy()
WHIMS_dataset_covar_temp1 = WHIMS_dataset_temp2[['FID', 'IID','AGE', 'race', 'HISTORY_DIABETES', 'MEAN_BMI', 'MEAN_WHR', 'PANCREATITIS', 'PAN_CAN_MONTHS_SURVIVED', 'DECEASED']].copy()
# Make another temp dataset which has the principle components
GARNET_dataset_covar_temp2 = pd.merge(left=GARNET_dataset_covar_temp1, right=phenogeno[['SAMPLE_ID','PC1', 'PC2', 'PC3']], how='left', left_on='IID', right_on='SAMPLE_ID')
WHIMS_dataset_covar_temp2 = pd.merge(left=WHIMS_dataset_covar_temp1, right=phenogeno[['SAMPLE_ID','PC1', 'PC2', 'PC3']], how='left', left_on='IID', right_on='SAMPLE_ID')
# Make the final covariate file with the columns in the order required by PLINK
GARNET_dataset_covar = GARNET_dataset_covar_temp2[['FID', 'IID', 'PC1', 'PC2', 'PC3', 'AGE', 'race', 'HISTORY_DIABETES', 'MEAN_BMI', 'MEAN_WHR', 'PANCREATITIS', 'PAN_CAN_MONTHS_SURVIVED', 'DECEASED']]
WHIMS_dataset_covar = WHIMS_dataset_covar_temp2[['FID', 'IID', 'PC1', 'PC2', 'PC3', 'AGE', 'race', 'HISTORY_DIABETES', 'MEAN_BMI', 'MEAN_WHR', 'PANCREATITIS', 'PAN_CAN_MONTHS_SURVIVED', 'DECEASED']]
# For plink compatibility, fill NA with -9
GARNET_dataset_covar = GARNET_dataset_covar.fillna(-9)
WHIMS_dataset_covar = WHIMS_dataset_covar.fillna(-9)
# Set output directory
# replace accordingly
GARNET_out_dir = "/usr/path/to/files/GARNET_dataset"
# Output the files
GARNET_dataset_pheno.to_csv(os.path.join(GARNET_out_dir, "GARNET_dataset_pheno.tsv"), sep='\t', index=False)
GARNET_dataset_covar.to_csv(os.path.join(GARNET_out_dir, "GARNET_dataset_covar.tsv"), sep='\t', index=False)
GARNET_dataset_pfam.to_csv(os.path.join(GARNET_out_dir, "GARNET_dataset_allchr.pfam"), sep='\t', index=False, header=False)
# Set output directory
# replace accordingly
WHIMS_out_dir = "/usr/path/to/files/WHIMS_dataset"
# Output the files
WHIMS_dataset_pheno.to_csv(os.path.join(WHIMS_out_dir, "WHIMS_dataset_pheno.tsv"), sep='\t', index=False)
WHIMS_dataset_covar.to_csv(os.path.join(WHIMS_out_dir, "WHIMS_dataset_covar.tsv"), sep='\t', index=False)
WHIMS_dataset_pfam.to_csv(os.path.join(WHIMS_out_dir, "WHIMS_dataset_allchr.pfam"), sep='\t', index=False, header=False)