-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathsp_pretteyHeatmap.sh
executable file
·248 lines (227 loc) · 6.74 KB
/
sp_pretteyHeatmap.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/bin/bash
#############
#CT##########
#############
#set -x
set -e
set -u
usage()
{
cat <<EOF
${txtcyn}
Usage:
$0 options${txtrst}
${bldblu}Function${txtrst}:
This script is used to plot heatmap.
First it will use kmeans to cluster rows and search for correct order.
Then the heatmap will be genrated on ordered data.
${txtred}Warning${txtrst}: This script can not deal well with one element cluster.
Please check it yourself.
${txtbld}OPTIONS${txtrst}:
-f Data file (with header line, the first column is the
rowname, tab seperated)${bldred}[NECESSARY]${txtrst}
-c The number of cluster wanted.${bldred}[NECESSARY]${txtrst}
-P The way to preprocess data before clustering.
[${txtred}
0 : means no preprocess
1 : means scale data
2 : means using the difference value between current
: column and the one before
3 : means values in each row are divided by the
: sqrt(sum(squares of values in this row))
4 (default): means first scale data and then use the difference value between current
: column and the one before
Both 2, 3, 4 are designed for clustering genes with same
changing trend together. Only affect data order steps.
${txtrst}]
#-p Select the optimum cluster number by elbow algorithm.
# [${txtred}Default FALSE.${txtrst}]
# Accept TRUE.
# When this is set to TRUE, a plot of within groups
# sum of squares is made. Acroread will open it, you choose the
# cluster number before which there is a sharp decline. Then
# close the opened pdf, you will be asked to input the chosed
# number. When this is TRUE, the number after -c is the expected
# maximum cluster number. All number of clusters ranges from 2
# to maximum cluster number will be calculated.
-t Give the maximum try number to run kmeans multiple times and
choose the one with the least withinss.
[${txtred}Default 10.${txtrst}]
-i Install needed packages.[default FALSE, given TRUE once if there is
error like <there is no package>]
-l Parameters for s-plot heatmapS (-f is not needed).
One can add new parameters to override remaining one or set new attributes.
[Default: '-A 90 -T 1.5 -V 0.5 -l top -v 22 -u 15 -F 9
-j TRUE -M yellow -x green -y red -Z TRUE']
EOF
}
file=
center=
preprocess=4
plotwithinss='FALSE'
try=10
evaluation='FALSE'
xlab='Value'
ylab='Variable'
mainT=''
ly_default='-A 90 -T 1.5 -V 0.5 -l top -v 22 -u 15 -F 9 -j TRUE -M yellow -x green -y red -Z TRUE'
ly=''
ist='FALSE'
while getopts "hf:c:P:p:t:e:x:y:m:l:i:" OPTION
do
case $OPTION in
h)
usage
exit 1
;;
f)
file=$OPTARG
;;
c)
center=$OPTARG
;;
P)
preprocess=$OPTARG
;;
p)
plotwithinss=$OPTARG
;;
t)
try=$OPTARG
;;
e)
evaluation=$OPTARG
;;
x)
xlab=$OPTARG
;;
y)
ylab=$OPTARG
;;
m)
mainT=$OPTARG
;;
i)
ist=$OPTARG
;;
l)
ly=$OPTARG
;;
?)
usage
exit 1
;;
esac
done
if [ -z $file ]; then
usage
exit 1
fi
mid=''
cat <<EOF >${file}${mid}.kmeans.r
if (${ist}){
install.packages("cluster", repo="http://cran.us.r-project.org")
install.packages("psych", repo="http://cran.us.r-project.org")
install.packages("fpc", repo="http://cran.us.r-project.org")
}
library(cluster)
library(psych)
library(fpc)
data <- read.table(file="$file", sep='\t', header=T, row.names=1,
check.names=FALSE)
data <- as.matrix(data)
#Delete rows containing only zero
data <- data[rowSums(data==0)<ncol(data),]
if($preprocess == 0){
kdata <- data
}else if(${preprocess} == 1){
kdata <- t(apply(data,1,scale))
}else if(${preprocess} == 2){
kdata <- t(apply(data,1,diff))
}else if(${preprocess} == 3){
norm_factors_for_each_row <- sqrt(apply(data^2, 1, sum))
kdata <- data / norm_factors_for_each_row
}else if(${preprocess} == 4){
kdata <- t(apply(data,1,scale))
kdata <- t(apply(kdata,1,diff))
}
#print("Try kmeans for the first time.")
fit <- kmeans(kdata, centers=$center, iter.max=100, nstart=25)
withinss <- sum(fit\$withinss)#
#print(paste("Get withinss for the first run", withinss))
for (i in 1:$try) {
tmpfit <- kmeans(kdata, centers=$center, iter.max=100, nstart=25)
tmpwithinss <- sum(tmpfit\$withinss)
#print(paste(("The additional "), i, 'run, withinss', tmpwithinss))
if (tmpwithinss < withinss){
withins <- tmpwithinss
fit <- tmpfit
}
}
cluster <- fit\$cluster
cluster <- as.data.frame(cluster)
dataWithClu <- cbind(ID=rownames(data), data, cluster)
dataWithClu <- dataWithClu[order(dataWithClu\$cluster),]
write.table(as.data.frame(dataWithClu),
file="${file}${mid}.kmeans.xls",
sep="\t", row.names=F, col.names=T, quote=F)
EOF
Rscript ${file}${mid}.kmeans.r
parseHeatmapSoutput.2.py -i ${file}${mid}.kmeans.xls >${file}${mid}.kmeans.sort
s-plot heatmapS -f ${file}${mid}.kmeans.sort ${ly_default} ${ly}
/bin/rm -f ${file}${mid}.kmeans.sort ${file}${mid}.kmeans.r
#if [ "$plotwithinss" == 'TRUE' ]; then
# cat <<EOF >${file}${mid}.$center.kmeans.chooseClusterNumber.r
#data <- read.table(file="$file", sep='\t', header=T, row.names=1,
#check.names=FALSE)
#data <- as.matrix(data)
##Delate rows containing only zero
#data <- data[rowSums(data==0)<ncol(data),]
#if($preprocess == 0){
# kdata <- data
#}else if(${preprocess} == 1){
# kdata <- t(apply(data,1,scale))
#}else if(${preprocess} == 2){
# kdata <- t(apply(data,1,diff))
#}else if(${preprocess} == 3){
# norm_factors_for_each_row <- sqrt(apply(data^2, 1, sum))
# kdata <- data / norm_factors_for_each_row
#}
#
#wss <- (nrow(kdata)-1)*sum(apply(kdata,2,var))
#
#for (i in 2:$center) wss[i] <- sum(kmeans(kdata, centers=i,
# iter.max=100, nstart=25)\$withinss)
#
#pdf("${file}${mid}.$center.kmeans.chooseClusterNumber.pdf")
#plot(1:$center, wss, type="b", xlab="Number of Clusters", ylab="Within
#groups sum of squares")
#dev.off()
#
#EOF
#
#Rscript --save ${file}${mid}.$center.kmeans.chooseClusterNumber.r
#if [ $? == 0 ]; then
#acroread ${file}${mid}.$center.kmeans.chooseClusterNumber.pdf
#
#read -p ">>>Input the cluster number : " center
#else
# echo "Wrong Rscript"
#fi
#fi
#print("Output the mean value of cluster")
#cluster.mean <- aggregate(data, by=list(fit\$cluster), FUN=mean)
#cluster.mean.colnames <- colnames(cluster.mean)
#cluster.mean.colnames[1] = paste('#',cluster.mean.colnames[1], sep='')
#colnames(cluster.mean) <- cluster.mean.colnames
#write.table(t(cluster.mean), file="${file}${mid}.$center.kmeans.cluster.mean.xls", sep='\t',col.names=F, row.names=T, quote=F)
#print("Output the total sorted cluster name")
#clust.out <- fit\$cluster
#kclust <- as.matrix(clust.out)
#kclust.out <- cbind(kclust, data)
#means of n points in each cluster
#mns <- sapply(split(data, fit\$cluster), function(x) mean(unlist(x)))
#order the data
#data.order <- data[order(order(mns)[fit\$cluster]),]
#write.table(data.order, file="${file}${mid}.$center.kmeans.result",
#sep="\t", row.names=T, col.names=T, quote=F)