Skip to content

Commit d76081e

Browse files
committed
Merge branch 'boruvka_algorithm'
# Conflicts: # hdbscan/hdbscan_.py # notebooks/Benchmarking scalability of clustering implementations.ipynb # setup.py
2 parents 1abf7c8 + 064deb5 commit d76081e

13 files changed

+82987
-3207
lines changed

hdbscan/_hdbscan_boruvka.c

Lines changed: 31956 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

hdbscan/_hdbscan_boruvka.pyx

Lines changed: 666 additions & 128 deletions
Large diffs are not rendered by default.

hdbscan/_hdbscan_linkage.c

Lines changed: 15653 additions & 2321 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

hdbscan/_hdbscan_linkage.pyx

Lines changed: 48 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ cimport numpy as np
1111
from libc.float cimport DBL_MAX
1212

1313
from scipy.spatial.distance import cdist, pdist
14+
from dist_metrics import DistanceMetric
15+
from dist_metrics cimport DistanceMetric
1416

1517
cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
1618
np.ndarray[np.double_t, ndim=2] distance_matrix):
@@ -80,7 +82,6 @@ cdef void select_distances(
8082

8183
result_buffer[i:n_labels] = pdist_matrix[current_labels[i:] - (row_num + 1) + row_start]
8284
return
83-
8485

8586
cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_pdist(
8687
np.ndarray[np.double_t, ndim=1] pdist_matrix):
@@ -127,29 +128,33 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_pdist(
127128

128129
return result
129130

131+
130132
cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_cdist(
131-
np.ndarray raw_data,
133+
np.ndarray[np.double_t, ndim=2] raw_data,
132134
np.ndarray[np.double_t, ndim=1] core_distances,
133-
object metric,
134-
int p):
135+
DistanceMetric dist_metric):
135136

136-
cdef np.ndarray[np.int64_t, ndim=1] node_labels
137-
cdef np.ndarray[np.int64_t, ndim=1] current_labels
138-
cdef np.ndarray[np.double_t, ndim=1] current_distances
139-
cdef np.ndarray[np.double_t, ndim=1] current_core_distances
140-
cdef np.ndarray[np.double_t, ndim=1] left
141-
# cdef np.ndarray[np.double_t, ndim=1] right
142-
cdef np.ndarray[np.double_t, ndim=2] result
137+
cdef np.ndarray[np.double_t, ndim=1] current_distances_arr
138+
# cdef np.ndarray[np.double_t, ndim=1] left_arr
139+
cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr
140+
cdef np.ndarray[np.double_t, ndim=2] result_arr
141+
142+
cdef np.double_t * current_distances
143+
cdef np.double_t * current_core_distances
144+
cdef np.double_t * raw_data_ptr
145+
# cdef np.double_t * left
146+
cdef np.int8_t * in_tree
147+
cdef np.double_t[:, ::1] raw_data_view
148+
cdef np.double_t[:, ::1] result
143149

144150
cdef np.ndarray label_filter
145151

146152
cdef long long current_node
147-
cdef long long comparison_node
148-
cdef long long new_node_index
149153
cdef long long new_node
150154
cdef long long i
151155
cdef long long j
152156
cdef long long dim
157+
cdef long long num_features
153158

154159
cdef double current_node_core_distance
155160
cdef double right_value
@@ -158,39 +163,49 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_cdist(
158163
cdef double new_distance
159164

160165
dim = raw_data.shape[0]
166+
num_features = raw_data.shape[1]
161167

162-
result = np.zeros((dim - 1, 3))
163-
node_labels = np.arange(dim, dtype=np.int64)
168+
raw_data_view = (<np.double_t [:raw_data.shape[0], :raw_data.shape[1]:1]> (<np.double_t *> raw_data.data))
169+
raw_data_ptr = (<np.double_t *> &raw_data_view[0, 0])
170+
171+
result_arr = np.zeros((dim - 1, 3))
172+
in_tree_arr = np.zeros(dim, dtype=np.int8)
164173
current_node = 0
165-
current_distances = np.infty * np.ones(dim)
166-
current_labels = node_labels
167-
current_core_distances = core_distances
174+
current_distances_arr = np.infty * np.ones(dim)
168175

169-
masked = 0
176+
result = (<np.double_t [:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
177+
in_tree = (<np.int8_t *> in_tree_arr.data)
178+
current_distances = (<np.double_t *> current_distances_arr.data)
179+
current_core_distances = (<np.double_t *> core_distances.data)
180+
#raw_data_view = (<np.double_t [:raw_data.shape[0], :raw_data.shape[1]:1]> (<np.double_t *> raw_data.data))
170181

171182
for i in range(1, dim):
172183

173-
label_filter = current_labels != current_node
174-
current_labels = current_labels[label_filter]
175-
current_core_distances = current_core_distances[label_filter]
184+
in_tree[current_node] = 1
185+
186+
# left_arr = cdist((raw_data_view[current_node],), raw_data, metric=metric, p=p)[0]
176187

177-
left = cdist(raw_data[[current_node]], raw_data, metric=metric, p=p)[0][current_labels] # good
188+
current_node_core_distance = current_core_distances[current_node]
178189

179-
current_distances = current_distances[label_filter]
180-
current_node_core_distance = core_distances[current_node]
190+
# left = (<np.double_t *> left_arr.data)
181191

182192
new_distance = DBL_MAX
183193
new_node = 0
184194

185-
for j in range(current_labels.shape[0]):
195+
for j in range(dim):
196+
if in_tree[j]:
197+
continue
198+
186199
right_value = current_distances[j]
187-
left_value = left[j]
188-
comparison_node = current_labels[j]
189-
core_value = core_distances[comparison_node]
200+
# left_value = left[j]
201+
left_value = dist_metric.dist(&raw_data_ptr[num_features * current_node],
202+
&raw_data_ptr[num_features * j],
203+
num_features)
204+
core_value = core_distances[j]
190205
if current_node_core_distance > right_value or core_value > right_value or left_value > right_value:
191206
if right_value < new_distance:
192207
new_distance = right_value
193-
new_node = current_labels[j]
208+
new_node = j
194209
continue
195210

196211
if core_value > current_node_core_distance:
@@ -204,81 +219,18 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_cdist(
204219
current_distances[j] = left_value
205220
if left_value < new_distance:
206221
new_distance = left_value
207-
new_node = current_labels[j]
222+
new_node = j
208223
else:
209224
if right_value < new_distance:
210225
new_distance = right_value
211-
new_node = current_labels[j]
226+
new_node = j
212227

213228
result[i - 1, 0] = <double> current_node
214229
result[i - 1, 1] = <double> new_node
215230
result[i - 1, 2] = new_distance
216231
current_node = new_node
217232

218-
return result
219-
220-
221-
222-
cdef tuple get_candidate(object kdtree, long long point_index, np.ndarray[np.double_t, ndim=1] core_distances):
223-
224-
point_core_distance = core_distances[point_index]
225-
point = kdtree.data[point_index]
226-
first_pass_candidates = kdtree.query_radius(point, point_core_distance)[0]
227-
228-
candidate_core_distances = core_distances[first_pass_candidates]
229-
best_candidate_index = candidate_core_distances.argmin()
230-
second_pass_radius = candidate_core_distances[best_candidate_index]
231-
232-
if second_pass_radius <= point_core_distance:
233-
return first_pass_candidates[best_candidate_index], point_core_distance
234-
235-
second_pass_candidates, second_pass_distances = kdtree.query_radius(point, second_pass_radius, return_distance=True)
236-
237-
candidate_core_distances = core_distances[second_pass_candidates]
238-
239-
candidate_reachability_distances = np.empty(len(second_pass_candidates), dtype=np.double)
240-
241-
for i in range(len(second_pass_candidates)):
242-
if candidate_core_distances[i] > second_pass_distances:
243-
if candidate_core_distances[i] > point_core_distance:
244-
candidate_reachability_distances[i] = second_pass_distances[i]
245-
else:
246-
candidate_reachability_distances[i] = point_core_distance
247-
else:
248-
if second_pass_distances[i] > point_core_distance:
249-
candidate_reachability_distances[i] = second_pass_distances[i]
250-
else:
251-
candidate_reachability_distances[i] = point_core_distance
252-
253-
best_candidate_index = candidate_reachability_distances.argmin()
254-
255-
return (second_pass_candidates[best_candidate_index],
256-
candidate_reachability_distances[best_candidate_index])
257-
258-
# Started blocking this out, so hang onto it for now, but need considerably more work
259-
# to make it correct (deal with removing and updating node candidates rom the queue)
260-
#cpdef mst_linkage_core_kdtree(object kdtree, np.ndarray[np.double_t, ndim=1] core_distances):
261-
#
262-
# cdef long long dim
263-
# cdef long long current_node
264-
# cdef long long new_candidate
265-
#
266-
# dim = kdtree.data.shape[0]
267-
# node_labels = np.arange(dim, dtype=np.int64)
268-
# result = np.zeros((dim - 1, 3))
269-
#
270-
# current_node = 0
271-
#
272-
# for i in range(1, dim):
273-
# new_candidate, new_distance = get_candidate(kdtree, current_node, core_distances)
274-
# priority_queue.add(new_candidate, new_distance)
275-
# new_node, new_distance = priority_queue.pop()
276-
# result[i - 1, 0] = <double> current_node
277-
# result[i - 1, 1] = <double> new_node
278-
# result[i - 1, 2] = new_distance
279-
# current_node = new_node
280-
#
281-
# return result
233+
return result_arr
282234

283235
cdef class UnionFind (object):
284236

0 commit comments

Comments
 (0)