-
Notifications
You must be signed in to change notification settings - Fork 0
/
Benchmark2.R
107 lines (82 loc) · 3.43 KB
/
Benchmark2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#####
#Libraries loading
library(tidyverse)
library(factoextra)
#####
par(mfrow=c(2,2))
#IntaRNA
#separate antisense from intergenic
inta_AS <- inta_s %>% filter(Relative.position.to.CDSs == "AS")
inta_OT <- inta_s %>% filter(Relative.position.to.CDSs != "AS")
#AS
hist(inta_AS$E, breaks = 25,
main = "A",
xlab = "Hybridization energy") #before normalization
hist(inta_AS$E / inta_AS$`sRNA length`, breaks = 100,
main = "A",
xlab = "Hybridization energy") #after normalization
#Other
hist(inta_OT$E, breaks = 25,
main = "Intergenic & RBS sRNAs: hybridization energy (pre-normalized) values - IntaRNA",
xlab = "Hybridization energy") #before normalization
hist(inta_OT$E / inta_OT$`sRNA length`, breaks = 100,
main = "B",
xlab = "Hybridization energy") #after normalization
#####
#sTar
#separate antisense from intergenic
sTar_AS <- sTar_s %>% filter(Relative.position.to.CDSs == "AS")
sTar_OT <- sTar_s %>% filter(Relative.position.to.CDSs != "AS")
#AS
hist(sTar_AS$E, breaks = 25,
main = "Antisense sRNAs: hybridization energy (pre-normalized) values - IntaRNAsTar",
xlab = "Hybridization energy") #before normalization
hist(sTar_AS$E / sTar_AS$`sRNA length`, breaks = 100,
main = "A",
xlab = "Hybridization energy") #after normalization
#Other
hist(sTar_OT$E, breaks = 25,
main = "Intergenic & RBS sRNAs: hybridization energy (pre-normalized) values - IntaRNAsTar",
xlab = "Hybridization energy") #before normalization
hist(sTar_OT$E / sTar_OT$`sRNA length`, breaks = 100,
main = "B",
xlab = "Hybridization energy") #after normalization
#####
#sRNAfTarget
#separate antisense from intergenic
srnar_AS <- srnar_s %>% filter(Relative.position.to.CDSs == "AS")
srnar_OT <- srnar_s %>% filter(Relative.position.to.CDSs != "AS")
#AS
hist(srnar_AS$E, breaks = 40,
main = "Hybridization energy before normalization - sRNAfTarget",
xlab = "Hybridization energy")#before normalization
hist(srnar_AS$E / srnar$`sRNA length`, breaks = 40,
main = "A",
xlab = "Hybridization energy")#after normalization
#Other sRNAs
hist(srnar_OT$E, breaks = 100,
main = "Hybridization energy before normalization - sRNAfTarget",
xlab = "Hybridization energy") #before normalization
hist(srnar_OT$E / srnar_OT$`sRNA length`, breaks = 100,
main = "B",
xlab = "Hybridization energy") #after normalization: as expected there's a significant variation
#####
#Clustering analysis, using unsupervised k-means on
#top40 data, by taking a sample of 5000 records
set.seed(400)
inta_kmeans <- inta_s %>% select("id2","E", "sRNA length")
inta_kmeans <- inta_kmeans[sample(nrow(inta_kmeans), 5000),] #taking 5000 samples
inta_kmeans <- inta_kmeans[,-1]
inta_kmeans_scale <- scale(inta_kmeans) #scale data
inta_kmeans_data <- dist(inta_kmeans_scale)#distance matrix
fit <- kmeans(inta_kmeans_scale, centers = 3, nstart = 100)
fviz_cluster(fit, data = inta_kmeans_scale, geom = "point",
main = "B") #first try on pre-assumption of three clusters
#calculate how many clusters
fviz_nbclust(inta_kmeans_scale, kmeans, method = "wss") +
labs(subtitle = "Elbow method") +
geom_vline(xintercept = 4, linetype = 2) +
geom_vline(xintercept = 3, linetype = 1, color = "red") +
ggtitle("A. Optimal number of clusters") #we selected 5 centroids
fit2 <- kmeans(inta_kmeans_scale, centers = 5, nstart = 100) #fitting model
fviz_cluster(fit2, data = inta_kmeans_scale, geom = "point", main = "C")