forked from ds763/PRoGeM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProGeM_commands.R
215 lines (142 loc) · 9.65 KB
/
ProGeM_commands.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
######################################## CODE FOR EXECUTION #######################################
### IMPORT RELEVANT PACKAGES
library(GenomicAlignments)
library(GenomicFeatures)
library(GenomicRanges)
library(biomaRt)
library(data.table)
cat("\nImported required packages.\n")
### READ IN SENTINEL AND PROXY DATA FILES AND FORMAT APPROPRIATELY
# read in sentinel and proxy data files
sentinel_data <- data.table(read.table(file = file.path(base_dir, sentinel_filename), header = TRUE,
quote = NULL, sep = "\t",
colClasses = c("character", "character", "numeric", "numeric")))
proxy_data <- data.table(read.table(file = file.path(base_dir, proxy_filename), header = TRUE,
quote = NULL, sep = "\t", stringsAsFactors = FALSE,
colClasses = c("character", "character", "numeric", "numeric",
"character", "numeric")))
# filter proxies based on LD (if required) and remove sentinel records from proxy_data
if(filtering_required == TRUE) {
proxy_data <- proxy_data[proxy_data$r2 >= r2_thresh,]
}
proxy_data <- proxy_data[which(proxy_data$PROXY_rsID != proxy_data$LEAD_rsID),]
# add a new column containing chromsome and position information in the format: "chr_position_" (this
# will be a search term for GTEX cis-eQTL data further down the script)
GTEx_search_term <- paste(sentinel_data$CHR, "_", sentinel_data$START, "_", sep = "")
sentinel_data <- cbind(sentinel_data, GTEx_search_term)
sentinel_data$GTEx_search_term <- as.character(sentinel_data$GTEx_search_term)
GTEx_search_term <- paste(proxy_data$PROXY_CHR, "_", proxy_data$PROXY_START, "_", sep = "")
proxy_data <- cbind(proxy_data, GTEx_search_term)
proxy_data$GTEx_search_term <- as.character(proxy_data$GTEx_search_term)
cat("Sentinel and proxy information read and formatted.\n\n")
### PULL OUT ALL GENES THAT RESIDE WITHIN +/-"X"kb FROM EACH SENTINEL VARIANT
# create a genomic ranges object for the sentinel variants
sentinel_ranges <- GR_object_creator(sentinel_data$CHR, sentinel_data$START, sentinel_data$END,
sentinel_data$rsID)
# load a TxDb object containing ensembl genes according to build GRCh37 - this is based on the following
# GTF file: "Homo_sapiens.GRCh37.82.gtf"
load(file.path(base_dir, "GRCh37_genes.RData"))
# identify all genes that reside +/-"X"kb from the sentinel variants
local_genes <- find_overlapping_genes(sentinel_ranges, ensembl_genes, interval_kb)
# determine distance between each local gene and the corresponding sentinel variant - rank accordingly
local_genes <- gene_variant_distance_finder(sentinel_ranges, local_genes)
# annotate the genes in the above local_genes object
local_genes_annotation <- gene_annotator(unique(names(local_genes)))
### IDENTIFY ALL BOTTOM-UP CANDIDATE GENES
cat(paste("-------------------------------------------------------------------\n",
"Beginning bottom-up annotation...\n\n", sep = ""))
## pull out all genes from "GRCh37_genes" that overlap the LD region around each sentinel variant
# create a genomic ranges object containing LD regions
LD_region_ranges <- LD_region_range_finder(sentinel_data, proxy_data)
# identify all genes that overlap the LD regions (+/-Xkb)
LD_region_genes <- find_overlapping_genes(LD_region_ranges, ensembl_genes,
LD_region_overhang_kb)
# annotate the genes in the above LD_region_overlapping_genes object
LD_region_genes_annotated <- gene_annotator(unique(names(LD_region_genes)))
cat("\t- Genes overlapping an LD region have been identified.\n")
## pull out the nearest genes to each sentinel variant
nearest_genes <- nearest_genes_selector(local_genes, local_genes_annotation, biotype_of_interest,
number_of_nearest)
cat(paste("\t- The ", number_of_nearest, " genes nearest to each sentinel have been identified.\n",
sep = ""))
## identify all cis-eQTL targets of the sentinel and proxy variants from GTEx data v6p
cat("\t- Searching for cis-eQTL targets...\n")
# search cis-eQTL files (NOTE: will take approx 10-15 minutes if all tissues have been chosen)
cis_eQTL_hits <- cis_eQTL_target_finder(eQTLdata_dir, tissues_of_interest, sentinel_data, proxy_data)
# format the cis_eQTL_hits list; i.e., add rsids and remove .* from ensembl_ids
cis_eQTL_hits <- cis_eQTL_formatter(sentinel_data, proxy_data, cis_eQTL_hits)
cis_eQTL_targets <- cis_eQTL_gene_extractor(sentinel_data, proxy_data, cis_eQTL_hits)
cis_eQTL_target_annotation <- gene_annotator(unique(cis_eQTL_targets$ensembl_id))
cat("\t- Cis-eQTL targets have been identified.\n")
### SUMARISE BOTTOM-UP INFORMATION
bottom_up_summary <- bottom_up_summariser(sentinel_data, nearest_genes, LD_region_genes, LD_region_genes_annotated, cis_eQTL_targets,
cis_eQTL_target_annotation)
# filter by biotype_of_interest
bottom_up_summary <- bottom_up_summary[bottom_up_summary$gene_biotype == biotype_of_interest,]
### IDENTIFY GENES WITH INCREASED LIKELIHOOD OF HAVING A FUNCTIONAL IMPACT USING VEP ANNOTATION
# note: to save resources with particularly large datasets it may be worthwhile filtering for only coding variants when running VEP,
# however, bear in mind that this would limit your variant consequences of interest; i.e., the moderate impact consequence "regulatory
# region ablation" would be lost
# read in VEP output file
VEP_annotations <- data.table(read.table(file = file.path(base_dir, VEP_filename), header = TRUE,
quote = NULL, sep = "\t", stringsAsFactors = FALSE))
if(filtering_required == TRUE & length(r2_column) > 0) {
VEP_annotations <- VEP_annotations[VEP_annotations[[r2_column]] >= r2_thresh,]
}
# filter out all records that do not have either a "HIGH" or "MODERATE" IMPACT
VEP_IMPACTS_of_interest <- c("HIGH", "MODERATE")
VEP_annotations <- VEP_annotations[VEP_annotations[[IMPACT_column]] %in% VEP_IMPACTS_of_interest,]
# identify leads and proxies most likely to have a functional impact as per VEP annotation
SNP_IMPACT_annotations <- VEP_integrator(sentinel_data, bottom_up_summary, VEP_annotations, sentinel_rsID_column, proxy_rsID_column,
ensembl_gene_id_column, IMPACT_column)
# combine bottom_up_summary with the SNP_IMPACT_annotations
bottom_up_summary <- cbind(bottom_up_summary, SNP_IMPACT_annotations)
cat("\t- VEP annotations have been integrated.\n\n")
# classify bottom_up evidence according to source
# 1 = gene contains a lead or proxy with either a HIGH or MODERATE IMPACT (proximity and cis-eQTL evidence may also be present)
# 2 = gene is proximal to lead; i.e., nearest or LD overlapping (cis-eQTL evidence may also be present)
# 3 = gene is a cis-eQTL target of either the lead or a proxy variant
bottom_up_summary <- bottom_up_classifier(bottom_up_summary)
cat("Bottom-up annotation complete!\n\n")
### IDENTIFY ALL TOP-DOWN CANDIDATES
cat(paste("-------------------------------------------------------------------\n",
"Beginning top-down annotation...\n\n", sep = ""))
# load in annotated metabolic genes list of genomicRanges objects
load(file.path(base_dir, filename_metabolic_genes))
# identify top-down candidates and score each candidate accordingly
metabolic_overlapping_genes <- top_down_candidate_identifier(sentinel_ranges, annotated_metabolic_genes,
interval_kb)
top_down_scores <- top_down_scorer(sentinel_data, metabolic_overlapping_genes)
cat("\t- Top-down candidates have been identified and scored.\n\n")
# filter by biotype of interest
top_down_scores <- top_down_scores[top_down_scores$gene_biotype == biotype_of_interest,]
cat("Top-down annotation complete.\n\n")
### IDENTIFY CO-OCCURRING GENES
cooccurring_candidates <- converging_candidate_finder(sentinel_data, bottom_up_summary, top_down_scores)
cat(paste("-------------------------------------------------------------------\n",
"Cooccurring candidates identified.\n", sep = ""))
### WRITE OUT PRIMARY INFORMATION TO FILE
# bottom-up summary
write.table(bottom_up_summary, file = file.path(output_dir, "OUTPUT_bottom_up_summary.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# top-down scores
write.table(top_down_scores, file = file.path(output_dir, "OUTPUT_top_down_scores.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# cooccurring candidates
write.table(cooccurring_candidates, file = file.path(output_dir, "OUTPUT_cooccurring_candidates.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# HIGH and MODERATE VEP dataframe
write.table(VEP_annotations, file = file.path(output_dir, "OUTPUT_VEP_IMPACT_annotations.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# lead eQTLs
write.table(cis_eQTL_hits[[1]], file = file.path(output_dir, "OUTPUT_lead_cis_eQTLs.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# proxy eQTLs
write.table(cis_eQTL_hits[[2]], file = file.path(output_dir, "OUTPUT_proxy_cis_eQTLs.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
# nearest genes
write.table(nearest_genes, file = file.path(output_dir, "OUTPUT_nearest_genes.txt"),
quote = FALSE, row.names = FALSE, sep = "\t")
cat("Output files written to base_dir.\n")
cat("Annotation complete!!\n\n")
#######################################################################################################