-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmeans.py
268 lines (256 loc) · 9.35 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 25 19:13:05 2016
@author: solan
"""
#Import the dependencies
import csv
import math
import random
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#The dtastructure which representsa point in 3d plane.Having points X , Y and Z
class Point:
def get_x(self):
return self.x
def get_y(self):
return self.y
def get_z(self):
return self.z
def set_x(self,x):
self.x=x
def set_y(self,y):
self.y=y
def set_z(self,z):
self.z=z
#This is the structure for each cluster.Each of them has a the list of points in it, the SSE
#calculated and centroid value.
class Cluster:
def get_points(self):
return self.points
def set_points(self,points):
self.points= points
#self.set_sse_val()
def set_centroid(self,centroid):
self.centroid = centroid
def get_centroid(self):
return self.centroid
def get_sse(self):
return self.sse
def set_sse(self,sse):
self.sse = sse
#this method calculates the SSE based on the current points in the cluster
def get_sse_val(self):
sse=0
points=self.get_points()
centroid = self.get_centroid()
for i in range(len(points)):
sse= sse+math.pow(centroid.get_x()-points[i].get_x(),2)+math.pow(centroid.get_y()-points[i].get_y(),2)+math.pow(centroid.get_z()-points[i].get_z(),2)
self.set_sse(sse)
return self.sse
#Reads the data from the file, converts each value into points andd return a list of points
def get_data(name):
points=[]
with open(name) as file:
readFile = csv.reader(file,delimiter=',')
for row in readFile:
localPoint = Point()
localPoint.set_x(float(row[0]))
localPoint.set_y(float(row[1]))
localPoint.set_z(float(row[2]))
points.append(localPoint)
return points
#calculate the distance between two points.
def getDistance(centroid,point):
distance = math.sqrt(math.pow(centroid.get_x()-point.get_x(),2)+math.pow(centroid.get_y()-point.get_y(),2)+math.pow(centroid.get_z()-point.get_z(),2))
return distance
#this is the method assign the new seed points too a cluster
def assignToClusters(points,clusters):
#alculate the distance between each point and centroid of each cluster, and assign
#each point to a cluster closest to the clusters centroid
for i in range(len(points)):
minDistance =100000
clusterIndex=-1
for j in range(len(clusters)):
#check the distance between two points
distance= getDistance(clusters[j].get_centroid(),points[i])
if distance <minDistance:
minDistance =distance
clusterIndex=j
someList =clusters[clusterIndex].get_points()
someList.append(points[i])
clusters[clusterIndex].set_points(someList)
return clusters
#this is the actualk means algorithm,
def run_k_means(points,k):
#sampling random points from initial given data set.
randomPoints = random.sample(points, k)
clusters =[]
#Creating empty clusters and empty list of points
for i in range(k):
newCluster =Cluster()
listPoints=[]
newCluster.set_points(listPoints)
#newCluster.set_points(newCluster.get_points().append(Point()))
newCluster.set_centroid(randomPoints[i])
clusters.append(newCluster)
#Assign the seed points to certein cluster
clusters = assignToClusters(points,clusters)
#this iteration will continue, till stopping criteria met.
while True:
#Storing previous centroid
oldCentroids=[]
for i in range(len(clusters)):
oldCentroids.append(clusters[i].get_centroid())
#No update the clusters based on the new centroid
clusters =updateClusters(clusters,points)
newCentroids =[]
for i in range(len(clusters)):
newCentroids.append(clusters[i].get_centroid())
check = False
for i in range(len(newCentroids)):
if getDistance(newCentroids[i],oldCentroids[i])< 0.01:
check=True
else:
check=False
break
if check==True:
break
return clusters
#empty the clusters and only keep the centroid and re calculate the distance between each point and centroid
def redoPoints(clusters,points):
#print("Points"+str(len(clusters)))
for i in range(len(clusters)):
temp =[]
clusters[i].set_points(temp)
clusters=assignToClusters(points,clusters)
return clusters
#Calculate the new Centroid and assign to cluster
def updateClusters(clusters,points):
for i in range(len(clusters)):
newCentroid = calculateCentroid(clusters[i].get_points())
#assign the new centroid to cluster.
clusters[i].set_centroid(newCentroid)
clusters =redoPoints(clusters,points)
return clusters
#this method calculates the new centroidand returns a Point
def calculateCentroid(cluster):
x=0
y=0
z=0
length = len(cluster)
for i in range(len(cluster)):
x= x+cluster[i].get_x()
y = y+cluster[i].get_y()
z=z+cluster[i].get_z()
point =Point()
point.set_x(x/length)
point.set_y(y/length)
point.set_z(z/length)
return point
#this metho prints the clusters
def printClusters(clusters):
for i in range(len(clusters)):
print("Cluster "+str(i+1)+" contains:"+str(len(clusters[i].get_points())))
#printPoints(clusters[i].get_points())
#this metho prints the points
def printPoints(points):
for i in range(len(points)):
print(str(points[i].get_x())+","+str(points[i].get_y())+","+str(points[i].get_z()))
#this method sorts the clusters based on the number of points in a given cluster.
def sort_clusters(clusters):
for i in range(len(clusters)):
for j in range(len(clusters)):
if len(clusters[j].get_points())>len(clusters[i].get_points()):
temp = clusters[i]
clusters[i]=clusters[j]
clusters[j]=temp
printClusters(clusters)
#this method plots the 3d scatter plot of differnt clusters.
def plotClusters(clusters):
x=[]
y=[]
z=[]
fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
ax = Axes3D(fig)
#define the colors.
colors =['g','r','b','y','m','k','0.45','#f4a460','#ffa500','#ffc0cb','c']
c_color=[]
m_mark=[]
markers=['o', '8','D','v', '^', '<', '>', 's', 'p', '*', 'h', 'H', 'd']
for i in range(len(clusters)):
points=clusters[i].get_points()
color=colors[i]
mark=markers[i]
for j in range(len(points)):
x.append(points[j].get_x())
y.append(points[j].get_y())
z.append(points[j].get_z())
c_color.append(color)
m_mark.append(mark)
for i in range(len(x)):
ax.scatter(x[i],y[i],z[i],s=40,c=c_color[i],marker=m_mark[i])
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.show()
#This method calculates the sum of all SSEs of given list o clusters
def getSumSSE(clusters):
sumVal=0.0
#check if the cluster has 0 points
if len(clusters[0].get_points())==0:
return 1000000
for i in range(len(clusters)):
sumVal=sumVal+clusters[i].get_sse_val()
return sumVal
#This is the main method
def main():
points =get_data('data.csv')
#this is the sse for each cluster
sse=[]
#this is value of K
ks=[]
#List of final cluster. Each item in this list is bunch of clusters for given i, where i=k
final_clusters=[]
#assign empty clusters to final list with infinite sse and empty points
for i in range(13):
list2=[]
cluster =Cluster()
some=[]
cluster.set_points(some)
cluster.set_sse(100000)
list2.append(cluster)
final_clusters.append(list2)
#Iterate N times to rremove the factor of randomness
for n in range(10):
#for each value of K from 1 to 12
for i in range(13):
#run K means for given set of points and k
clusters= run_k_means(points,i+1)
sumSSECurrent=getSumSSE(clusters)
sumSSEPrevious=getSumSSE(final_clusters[i])
#compare the sse for given K and get the best cluster , having least SSE for the given K
if sumSSECurrent<sumSSEPrevious :
final_clusters[i]=clusters
#this is the final list of clusters for given K
for i in range(len(final_clusters)):
print("K="+str(i+1))
print("#############################")
clusters=final_clusters[i]
sort_clusters(clusters)
sum_sse=0
for j in range(len(clusters)):
sum_sse= sum_sse+clusters[j].get_sse_val()
sse.append(sum_sse)
ks.append(i+1)
#plot k vs SSE
plt.plot(ks,sse,lw=2)
plt.xlabel("k")
plt.ylabel("SSE")
plt.title("K vs SSE")
plt.show()
print("SSE at K =8 is :"+str(getSumSSE(final_clusters[7])))
#chosen K =8, least inflection point an plot the cluster based on the
plotClusters(final_clusters[7])
main()