@@ -11,6 +11,8 @@ cimport numpy as np
11
11
from libc.float cimport DBL_MAX
12
12
13
13
from scipy.spatial.distance import cdist, pdist
14
+ from dist_metrics import DistanceMetric
15
+ from dist_metrics cimport DistanceMetric
14
16
15
17
cpdef np.ndarray[np.double_t, ndim= 2 ] mst_linkage_core(
16
18
np.ndarray[np.double_t, ndim= 2 ] distance_matrix):
@@ -80,7 +82,6 @@ cdef void select_distances(
80
82
81
83
result_buffer[i:n_labels] = pdist_matrix[current_labels[i:] - (row_num + 1 ) + row_start]
82
84
return
83
-
84
85
85
86
cpdef np.ndarray[np.double_t, ndim= 2 ] mst_linkage_core_pdist(
86
87
np.ndarray[np.double_t, ndim= 1 ] pdist_matrix):
@@ -127,29 +128,33 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_pdist(
127
128
128
129
return result
129
130
131
+
130
132
cpdef np.ndarray[np.double_t, ndim= 2 ] mst_linkage_core_cdist(
131
- np.ndarray raw_data,
133
+ np.ndarray[np.double_t, ndim = 2 ] raw_data,
132
134
np.ndarray[np.double_t, ndim= 1 ] core_distances,
133
- object metric,
134
- int p):
135
+ DistanceMetric dist_metric):
135
136
136
- cdef np.ndarray[np.int64_t, ndim= 1 ] node_labels
137
- cdef np.ndarray[np.int64_t, ndim= 1 ] current_labels
138
- cdef np.ndarray[np.double_t, ndim= 1 ] current_distances
139
- cdef np.ndarray[np.double_t, ndim= 1 ] current_core_distances
140
- cdef np.ndarray[np.double_t, ndim= 1 ] left
141
- # cdef np.ndarray[np.double_t, ndim=1] right
142
- cdef np.ndarray[np.double_t, ndim= 2 ] result
137
+ cdef np.ndarray[np.double_t, ndim= 1 ] current_distances_arr
138
+ # cdef np.ndarray[np.double_t, ndim=1] left_arr
139
+ cdef np.ndarray[np.int8_t, ndim= 1 ] in_tree_arr
140
+ cdef np.ndarray[np.double_t, ndim= 2 ] result_arr
141
+
142
+ cdef np.double_t * current_distances
143
+ cdef np.double_t * current_core_distances
144
+ cdef np.double_t * raw_data_ptr
145
+ # cdef np.double_t * left
146
+ cdef np.int8_t * in_tree
147
+ cdef np.double_t[:, ::1 ] raw_data_view
148
+ cdef np.double_t[:, ::1 ] result
143
149
144
150
cdef np.ndarray label_filter
145
151
146
152
cdef long long current_node
147
- cdef long long comparison_node
148
- cdef long long new_node_index
149
153
cdef long long new_node
150
154
cdef long long i
151
155
cdef long long j
152
156
cdef long long dim
157
+ cdef long long num_features
153
158
154
159
cdef double current_node_core_distance
155
160
cdef double right_value
@@ -158,39 +163,49 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_cdist(
158
163
cdef double new_distance
159
164
160
165
dim = raw_data.shape[0 ]
166
+ num_features = raw_data.shape[1 ]
161
167
162
- result = np.zeros((dim - 1 , 3 ))
163
- node_labels = np.arange(dim, dtype = np.int64)
168
+ raw_data_view = (< np.double_t [:raw_data.shape[0 ], :raw_data.shape[1 ]:1 ]> (< np.double_t * > raw_data.data))
169
+ raw_data_ptr = (< np.double_t * > & raw_data_view[0 , 0 ])
170
+
171
+ result_arr = np.zeros((dim - 1 , 3 ))
172
+ in_tree_arr = np.zeros(dim, dtype = np.int8)
164
173
current_node = 0
165
- current_distances = np.infty * np.ones(dim)
166
- current_labels = node_labels
167
- current_core_distances = core_distances
174
+ current_distances_arr = np.infty * np.ones(dim)
168
175
169
- masked = 0
176
+ result = (< np.double_t [:dim - 1 , :3 :1 ]> (< np.double_t * > result_arr.data))
177
+ in_tree = (< np.int8_t * > in_tree_arr.data)
178
+ current_distances = (< np.double_t * > current_distances_arr.data)
179
+ current_core_distances = (< np.double_t * > core_distances.data)
180
+ # raw_data_view = (<np.double_t [:raw_data.shape[0], :raw_data.shape[1]:1]> (<np.double_t *> raw_data.data))
170
181
171
182
for i in range (1 , dim):
172
183
173
- label_filter = current_labels ! = current_node
174
- current_labels = current_labels[label_filter]
175
- current_core_distances = current_core_distances[label_filter ]
184
+ in_tree[current_node] = 1
185
+
186
+ # left_arr = cdist((raw_data_view[current_node],), raw_data, metric=metric, p=p)[0 ]
176
187
177
- left = cdist(raw_data[[ current_node]], raw_data, metric = metric, p = p)[ 0 ][current_labels] # good
188
+ current_node_core_distance = current_core_distances[ current_node]
178
189
179
- current_distances = current_distances[label_filter]
180
- current_node_core_distance = core_distances[current_node]
190
+ # left = (<np.double_t *> left_arr.data)
181
191
182
192
new_distance = DBL_MAX
183
193
new_node = 0
184
194
185
- for j in range (current_labels.shape[0 ]):
195
+ for j in range (dim):
196
+ if in_tree[j]:
197
+ continue
198
+
186
199
right_value = current_distances[j]
187
- left_value = left[j]
188
- comparison_node = current_labels[j]
189
- core_value = core_distances[comparison_node]
200
+ # left_value = left[j]
201
+ left_value = dist_metric.dist(& raw_data_ptr[num_features * current_node],
202
+ & raw_data_ptr[num_features * j],
203
+ num_features)
204
+ core_value = core_distances[j]
190
205
if current_node_core_distance > right_value or core_value > right_value or left_value > right_value:
191
206
if right_value < new_distance:
192
207
new_distance = right_value
193
- new_node = current_labels[j]
208
+ new_node = j
194
209
continue
195
210
196
211
if core_value > current_node_core_distance:
@@ -204,81 +219,18 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_cdist(
204
219
current_distances[j] = left_value
205
220
if left_value < new_distance:
206
221
new_distance = left_value
207
- new_node = current_labels[j]
222
+ new_node = j
208
223
else :
209
224
if right_value < new_distance:
210
225
new_distance = right_value
211
- new_node = current_labels[j]
226
+ new_node = j
212
227
213
228
result[i - 1 , 0 ] = < double > current_node
214
229
result[i - 1 , 1 ] = < double > new_node
215
230
result[i - 1 , 2 ] = new_distance
216
231
current_node = new_node
217
232
218
- return result
219
-
220
-
221
-
222
- cdef tuple get_candidate(object kdtree, long long point_index, np.ndarray[np.double_t, ndim= 1 ] core_distances):
223
-
224
- point_core_distance = core_distances[point_index]
225
- point = kdtree.data[point_index]
226
- first_pass_candidates = kdtree.query_radius(point, point_core_distance)[0 ]
227
-
228
- candidate_core_distances = core_distances[first_pass_candidates]
229
- best_candidate_index = candidate_core_distances.argmin()
230
- second_pass_radius = candidate_core_distances[best_candidate_index]
231
-
232
- if second_pass_radius <= point_core_distance:
233
- return first_pass_candidates[best_candidate_index], point_core_distance
234
-
235
- second_pass_candidates, second_pass_distances = kdtree.query_radius(point, second_pass_radius, return_distance = True )
236
-
237
- candidate_core_distances = core_distances[second_pass_candidates]
238
-
239
- candidate_reachability_distances = np.empty(len (second_pass_candidates), dtype = np.double)
240
-
241
- for i in range (len (second_pass_candidates)):
242
- if candidate_core_distances[i] > second_pass_distances:
243
- if candidate_core_distances[i] > point_core_distance:
244
- candidate_reachability_distances[i] = second_pass_distances[i]
245
- else :
246
- candidate_reachability_distances[i] = point_core_distance
247
- else :
248
- if second_pass_distances[i] > point_core_distance:
249
- candidate_reachability_distances[i] = second_pass_distances[i]
250
- else :
251
- candidate_reachability_distances[i] = point_core_distance
252
-
253
- best_candidate_index = candidate_reachability_distances.argmin()
254
-
255
- return (second_pass_candidates[best_candidate_index],
256
- candidate_reachability_distances[best_candidate_index])
257
-
258
- # Started blocking this out, so hang onto it for now, but need considerably more work
259
- # to make it correct (deal with removing and updating node candidates rom the queue)
260
- # cpdef mst_linkage_core_kdtree(object kdtree, np.ndarray[np.double_t, ndim=1] core_distances):
261
- #
262
- # cdef long long dim
263
- # cdef long long current_node
264
- # cdef long long new_candidate
265
- #
266
- # dim = kdtree.data.shape[0]
267
- # node_labels = np.arange(dim, dtype=np.int64)
268
- # result = np.zeros((dim - 1, 3))
269
- #
270
- # current_node = 0
271
- #
272
- # for i in range(1, dim):
273
- # new_candidate, new_distance = get_candidate(kdtree, current_node, core_distances)
274
- # priority_queue.add(new_candidate, new_distance)
275
- # new_node, new_distance = priority_queue.pop()
276
- # result[i - 1, 0] = <double> current_node
277
- # result[i - 1, 1] = <double> new_node
278
- # result[i - 1, 2] = new_distance
279
- # current_node = new_node
280
- #
281
- # return result
233
+ return result_arr
282
234
283
235
cdef class UnionFind (object ):
284
236
0 commit comments