-
Notifications
You must be signed in to change notification settings - Fork 27
/
synteny_2d_fishers_test.R
152 lines (120 loc) · 5.69 KB
/
synteny_2d_fishers_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# calculation of probabilities of matching genes between scaffolds, as having derived from a random distribution
# by WRF 2020-06-15
# last modified 2022-12-27
# analysis is modeled from this explanation from Srivastava 2008
# "Scaffold_8 has genes belonging to 174 of the 5,487 ancestral gene clusters, while human chromosome segment 10.4
# has genes belonging to 165 of them. 54 ancestral gene clusters have genes from both Scaffold_8 and human
# chromosome segment 10.4. Fisher's exact test rejects the null model that the gene complements of Trichoplax
# scaffold_8 and human chromosome segment 10.4 are drawn independently from the set of ancestral genes, and
# that all ancestral genes have an equal probability of having a descendant on a given segment"
#s2008_matrix = matrix(data=c(54, 174, 165, 5487), nrow=2)
#s2008_ft = fisher.test(test_matrix)
#s2008_ft$p.value
# [1] 4.75784e-30
# define function to calculate p values for each scaffold pair across both genomes
calculate_ftest_pvalues <- function(freq_matrix) {
# start from empty vector
p_values = c()
nscaf_g1 = dim(freq_matrix)[1]
nscaf_g2 = dim(freq_matrix)[2]
match_total = sum(freq_matrix)
# loop over each matrix position, calculate Fishers test
for ( i in 1:nscaf_g1) {
for (j in 1:nscaf_g2) {
h1h1 = freq_matrix[i,j]
h1h0 = sum(freq_matrix[i])
h0h1 = sum(freq_matrix[j])
h0h0 = match_total - h1h0 - h0h1 + h1h1
test_matrix = matrix( data=c(h1h1,h1h0,h0h1,h0h0), nrow=2)
ft = fisher.test(test_matrix)
p_values = c(p_values, ft$p.value)
}
}
return(p_values)
}
args = commandArgs(trailingOnly=TRUE)
# read data file from scaffold_synteny.py
all2Dfile = args[1]
# read optional species names
genome1_arg = args[2]
genome2_arg = args[3]
all2Ddata = read.table(all2Dfile, sep="\t", stringsAsFactors=FALSE)
categories = all2Ddata[,1]
is_scaf1 = which(categories=="s1")
scafdata1 = all2Ddata[is_scaf1,]
longestscaf1 = max(scafdata1[,6])
is_longscafs1 = which(as.numeric(scafdata1[,5]) > 0.0001)
is_scaf2 = which(categories=="s2")
scafdata2 = all2Ddata[is_scaf2,]
longestscaf2 = max(scafdata2[,6])
is_longscafs2 = which(as.numeric(scafdata2[,5]) > 0.0001)
is_points = which(categories=="g")
pointsdata = all2Ddata[is_points,]
if (sum(is.na(pointsdata[,3])) > 0 | sum(is.na(pointsdata[,5])) > 0){
print("WARNING: NAs DETECTED IN POINTS, OUTPUT MAY NOT WORK")}
scaffold_data = data.frame( sc1 = pointsdata[,3], sc2 = pointsdata[,5])
match_freq_table = table(scaffold_data)
# must na.omit in case some scaffold has no matches
scaf1_names = scafdata1[,2]
sc1_reorder_index = na.omit( match(scaf1_names,row.names(match_freq_table))[is_longscafs1] )
scaf2_names = scafdata2[,2]
sc2_reorder_index = na.omit( match(scaf2_names,colnames(match_freq_table))[is_longscafs2] )
# maybe need to switch indexing
#match(row.names(match_freq_table), scaf1_names[is_longscafs1])
reordered_table = match_freq_table[sc1_reorder_index,sc2_reorder_index]
match_freq_matrix = as.matrix(reordered_table)
nscaf_g1 = dim(match_freq_matrix)[1]
nscaf_g2 = dim(match_freq_matrix)[2]
match_total = sum(match_freq_matrix)
ntests = nscaf_g1 * nscaf_g2
ftest_p_values = calculate_ftest_pvalues(match_freq_matrix)
ftest_p_value_adj = ftest_p_values * ntests
#range(ftest_p_values)
ftest_p_value_log = floor(-log10(ftest_p_value_adj))
ftest_hist = hist(ftest_p_value_log, breaks=20, plot=FALSE) # ylim=c(0,500),
ftest_p_value_log[ftest_p_value_log<1] = 1
ftest_p_value_log[ftest_p_value_log>100] = 100
# final matrix for graph
ftest_p_value_mat = matrix( data=ftest_p_value_log, nrow=nscaf_g1, byrow=TRUE )
# raw p value matrix for csv
raw_p_value_mat = matrix( data=ftest_p_values, nrow=nscaf_g1, byrow=TRUE )
colnames(raw_p_value_mat) = scaf2_names[sc2_reorder_index]
# write csv
output_csv = gsub("([\\w/]+)\\....$","\\1.pvalue.csv",gsub(".gz$","",all2Dfile,perl=TRUE),perl=TRUE)
print( paste("writing csv to",output_csv) )
write.csv(raw_p_value_mat, file=output_csv, row.names=scaf1_names[sc1_reorder_index] )
#
# count blocks that are significant against the randomized dataset
has_randomized = FALSE
if (has_randomized == TRUE) {
# Nb with lower value than the global min of the randomized set
length( ftest_p_values[ftest_p_values<(min(ftest_p_values)/ntests)] )
# NBcS, Bonferroni corrected scaffolds, each relative to the randomized
length( ftest_p_values[(ftest_p_values / ftest_p_values) < (0.01/ntests)] )
}
# make color slices for the scalebar
scalebar_range = c(1,rev(pretty(ftest_p_value_log))[1])
scale_slice_width = 0.4/scalebar_range[2]
scale_x_starts = seq(0.5, 0.9-scale_slice_width, scale_slice_width)
scale_x_endss = seq(0.5+scale_slice_width, 0.9, scale_slice_width)
colorscheme = colorRampPalette(c("#f1eef6","#66a4c2","#4d004b"))(scalebar_range[2])
outputfile = gsub("([\\w/]+)\\....$","\\1.pvalue.pdf",gsub(".gz$","",all2Dfile,perl=TRUE),perl=TRUE)
print( paste("creating pdf",outputfile) )
main_label = paste("Fisher's exact test P values\n", genome1_arg, "vs", genome2_arg)
# create pdf as full page on a4
pdf(file=outputfile, width=8, height=11, paper="a4")
par( mar=c(5.5,4.5,4,1) )
# draw heatmap
image(z=ftest_p_value_mat, col=colorscheme, axes=FALSE, zlim=scalebar_range, main=main_label)
# label scaffolds
mtext( scaf1_names, side=1, at=seq(0,1,1/(length(scaf1_names)-1)), las=2, cex=0.95)
mtext( scaf2_names,side=2,at=seq(0,1,1/(length(scaf2_names)-1)), las=1, cex=0.75)
# make scalebar
rect( scale_x_starts, rep(0.9,scalebar_range[2]+1), scale_x_endss , rep(0.99,scalebar_range[2]+1), col=colorscheme, border=FALSE)
text(0.5,0.85, "p=0.1", cex=2)
text(0.9,0.85, paste0("p<1e-",scalebar_range[2]), cex=2)
# notes at bottom of page
range_text = paste("Min corrected P-value:", formatC(min(ftest_p_value_adj),format="e", digits=4 ) )
mtext(range_text,side=1, line=3, at=0.1)
dev.off()
#