9 """Distance functions to be used in kernels and elsewhere
10 """
11
12 __docformat__ = 'restructuredtext'
13
14
15
16
17
18
19 import numpy as N
20 from mvpa.base import externals
21
22 if __debug__:
23 from mvpa.base import debug, warning


def cartesianDistance(a, b):
    """Return Cartesian distance between a and b
28 """
29 return N.linalg.norm(a-b)


def absminDistance(a, b):
    """Returns distance max(\|a-b\|)
    XXX There must be better name!
    XXX Actually, why is it absmin not absmax?

    Useful to select a whole cube of a given "radius"
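
    Illustrative example (the largest per-feature offset wins):

    >>> print absminDistance(N.array([1, 2]), N.array([3, 1]))
    2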
38 """
39 return max(abs(a-b))


def manhattenDistance(a, b):
    """Return Manhattan distance between a and b
44 """
45 return sum(abs(a-b))


def mahalanobisDistance(x, y=None, w=None):
    """Calculate Mahalanobis distance of the pairs of points.

    :Parameters:
      `x`
        first list of points. Rows are samples, columns are
        features.
      `y`
        second list of points (optional)
      `w` : N.ndarray
        optional inverse covariance matrix between the points. It is
        computed if not given

    Inverse covariance matrix can be calculated with the following

      w = N.linalg.solve(N.cov(x.T), N.identity(x.shape[1]))

    or

      w = N.linalg.inv(N.cov(x.T))
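
    Illustrative example (only the shape is checked, since the values
    depend on the random data):

    >>> x = N.random.rand(10, 3)
    >>> d = mahalanobisDistance(x)
    >>> print d.shape
    (10, 10)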
68 """
69
70 if y is None:
71
72
73 if w is None:
74 w = N.linalg.inv(N.cov(x.T))
75
76
77 mx, nx = x.shape
78
79
80
81 d = N.zeros((mx, mx), dtype=N.float32)
82 for i in range(mx-1):
83
84 xi = x[i, :]
85
86 xi = xi[N.newaxis, :].repeat(mx-i-1, axis=0)
87
88 dc = x[i+1:mx, :] - xi
89
90 d[i+1:mx, i] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
91
92 d[i, i+1:mx] = d[i+1:mx, i].T
93 else:
94
95
96 if w is None:
97
98 w = N.linalg.inv(N.cov(N.concatenate((x, y)).T))
99
100
101 mx, nx = x.shape
102 my, ny = y.shape
103
104
105 d = N.zeros((mx, my), dtype=N.float32)
106
107
108 if mx <= my:
109
110 for i in range(mx):
111
112 xi = x[i, :]
113
114 xi = xi[N.newaxis, :].repeat(my, axis=0)
115
116 dc = xi - y
117
118 d[i, :] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
119 else:
120
121 for j in range(my):
122
123 yj = y[j, :]
124
125 yj = yj[N.newaxis, :].repeat(mx, axis=0)
126
127 dc = x - yj
128
129 d[:, j] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
130
131
132 return N.sqrt(d)


def squared_euclidean_distance(data1, data2=None, weight=None):
    """Compute weighted euclidean distance matrix between two datasets.

    :Parameters:
      data1 : N.ndarray
        first dataset
      data2 : N.ndarray
        second dataset. If None, compute the euclidean distance between
        the first dataset versus itself.
        (Defaults to None)
      weight : N.ndarray
        vector of weights, each one associated to each dimension of the
        dataset (Defaults to None)
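
    Illustrative example (shape check on random floating point data):

    >>> d1 = N.random.rand(5, 8)
    >>> d2 = N.random.rand(3, 8)
    >>> print squared_euclidean_distance(d1, d2).shape
    (5, 3)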
149 """
150 if __debug__:
151
152 if not N.issubdtype(data1.dtype, 'f') \
153 or (data2 is not None and not N.issubdtype(data2.dtype, 'f')):
154 warning('Computing euclidean distance on integer data ' \
155 'is not supported.')
    if weight is None:
        data1w = data1
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2
    else:
        data1w = data1 * weight
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2 * weight
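
    # The computation below exploits the binomial expansion
    #   ||a - b||_w**2 = sum(w*a*a) - 2*sum(w*a*b) + sum(w*b*b)
    # with the weight folded into one factor of each product.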
    squared_euclidean_distance_matrix = \
        (data1w * data1).sum(1)[:, None] \
        - 2 * N.dot(data1w, data2.T) \
        + (data2 * data2w).sum(1)

    # negative values can appear due to rounding errors; report them if
    # requested and clip them to zero
    less0 = squared_euclidean_distance_matrix < 0
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        less0num = N.sum(less0)
        if less0num > 0:
            norm0 = N.linalg.norm(squared_euclidean_distance_matrix[less0])
            totalnorm = N.linalg.norm(squared_euclidean_distance_matrix)
            if totalnorm != 0 and norm0 / totalnorm > 1e-8:
                warning("Found %d elements out of %d unstable (<0) in " \
                        "computation of squared_euclidean_distance_matrix. " \
                        "Their norm is %s when total norm is %s" % \
                        (less0num, less0.size, norm0, totalnorm))
    squared_euclidean_distance_matrix[less0] = 0
    return squared_euclidean_distance_matrix


def oneMinusCorrelation(X, Y):
    """Return one minus the correlation matrix between the rows of two matrices.

    This function computes a matrix of correlations between all pairs of
    rows of two matrices. Unlike NumPy's corrcoef() it only considers
    pairs across matrices, i.e. both elements of a pair never come from
    the same matrix.

    Both arrays need to have the same number of columns.

    :Parameters:
      X: 2D-array
      Y: 2D-array

    Example:

    >>> X = N.random.rand(20,80)
    >>> Y = N.random.rand(5,80)
    >>> C = oneMinusCorrelation(X, Y)
    >>> print C.shape
    (20, 5)
    """

    if __debug__:
        if not X.shape[1] == Y.shape[1]:
            raise ValueError, 'correlation() requires two matrices with the ' \
                              'same #columns (Got: %s and %s)' \
                              % (X.shape, Y.shape)

    # standardize each row to zero mean and unit variance
    Zx = X - N.c_[X.mean(axis=1)]
    Zx /= N.c_[X.std(axis=1)]
    Zy = Y - N.c_[Y.mean(axis=1)]
    Zy /= N.c_[Y.std(axis=1)]

    # correlation of standardized rows is just their scaled dot product
    C = ((N.matrix(Zx) * N.matrix(Zy).T) / Zx.shape[1]).A

    # turn correlation into a distance: since C <= 1, abs(C - 1) == 1 - C
    C -= 1.0

    return N.abs(C)


def pnorm_w_python(data1, data2=None, weight=None, p=2,
                   heuristic='auto', use_sq_euclidean=True):
    """Weighted p-norm between two datasets (pure Python implementation)

    ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

    :Parameters:
      data1 : N.ndarray
        First dataset
      data2 : N.ndarray or None
        Optional second dataset
      weight : N.ndarray or None
        Optional weights per 2nd dimension (features)
      p
        Power
      heuristic : basestring
        Which heuristic to use:
         * 'samples' -- python sweep over 0th dim
         * 'features' -- python sweep over 1st dim
         * 'auto' decides automatically. If # of features (shape[1]) is much
           larger than # of samples (shape[0]) -- use 'samples', and use
           'features' otherwise
      use_sq_euclidean : bool
        Whether to use squared_euclidean_distance_matrix for computation
        if p==2
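
    Illustrative example (p=1 with unit weights reduces to the Manhattan
    distance between the two samples):

    >>> d = pnorm_w_python(N.array([[0., 0.], [1., 1.]]), p=1)
    >>> print d[0, 1], d[1, 0]
    2.0 2.0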
280 """
    if weight is None:
        weight = N.ones(data1.shape[1], 'd')
        pass

    if p == 2 and use_sq_euclidean:
        return N.sqrt(squared_euclidean_distance(data1=data1, data2=data2,
                                                 weight=weight**2))

    if data2 is None:
        data2 = data1
        pass

    S1, F1 = data1.shape[:2]
    S2, F2 = data2.shape[:2]

    if not (F1==F2==weight.size):
        raise ValueError, \
              "Datasets should have same #columns == #weights. Got " \
              "%d %d %d" % (F1, F2, weight.size)
    d = N.zeros((S1, S2), 'd')

    # avoid the pointless pow()/root when p == 1
    if p == 1:
        pf = lambda x:x
        af = lambda x:x
    else:
        pf = lambda x:x ** p
        af = lambda x:x ** (1.0/p)

    # heuristic: sweep over whichever dimension takes fewer iterations
    if heuristic == 'auto':
        heuristic = {False: 'samples',
                     True: 'features'}[(F1/S1) < 500]

    if heuristic == 'features':
        # python sweep over features -- efficient for many samples
        for NF in range(F1):
            d += pf(N.abs(N.subtract.outer(data1[:,NF],
                                           data2[:,NF]))*weight[NF])
        pass
    elif heuristic == 'samples':
        # python sweep over samples -- efficient for many features
        for NS in xrange(S1):
            dfw = pf(N.abs(data1[NS] - data2) * weight)
            d[NS] = N.sum(dfw, axis=1)
        pass
    else:
        raise ValueError, "Unknown heuristic '%s'. Need one of " \
              "'auto', 'samples', 'features'" % heuristic
    return af(d)


if externals.exists('weave'):
    from scipy import weave
    from scipy.weave import converters

    def pnorm_w(data1, data2=None, weight=None, p=2):
        """Weighted p-norm between two datasets (scipy.weave implementation)

        ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

        :Parameters:
          data1 : N.ndarray
            First dataset
          data2 : N.ndarray or None
            Optional second dataset
          weight : N.ndarray or None
            Optional weights per 2nd dimension (features)
          p
            Power
        """
        if weight is None:
            weight = N.ones(data1.shape[1], 'd')
            pass
        S1, F1 = data1.shape[:2]
        code = ""
        if data2 is None or data1 is data2:
            # pairwise distances within a single dataset
            if not (F1==weight.size):
                raise ValueError, \
                      "Dataset should have same #columns == #weights. Got " \
                      "%d %d" % (F1, weight.size)
            F = F1
            d = N.zeros((S1, S1), 'd')
            try:
                # specialized C expressions for the common cases p=1 and p=2
                code_peritem = \
                    {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data1(j,t))",
                     2.0 : "tmp2 = weight(t)*(data1(i,t)-data1(j,t));" \
                           " tmp = tmp + tmp2*tmp2"}[p]
            except KeyError:
                code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data1(j,t)),p)"

            code = """
            int i,j,t;
            double tmp, tmp2;
            for (i=0; i<S1-1; i++) {
                for (j=i+1; j<S1; j++) {
                    tmp = 0.0;
                    for(t=0; t<F; t++) {
                        %s;
                    }
                    d(i,j) = tmp;
                }
            }
            return_val = 0;
            """ % code_peritem

            counter = weave.inline(code,
                                   ['data1', 'S1', 'F', 'weight', 'd', 'p'],
                                   type_converters=converters.blitz,
                                   compiler = 'gcc')
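
            # the C loop filled only the strict upper triangle; mirror it
            # to obtain the full symmetric distance matrix (diagonal is 0)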
            d = d + N.triu(d).T
            return d**(1.0/p)

        # distances between two distinct datasets
        S2, F2 = data2.shape[:2]
        if not (F1==F2==weight.size):
            raise ValueError, \
                  "Datasets should have same #columns == #weights. Got " \
                  "%d %d %d" % (F1, F2, weight.size)
        F = F1
        d = N.zeros((S1, S2), 'd')
        try:
            code_peritem = \
                {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data2(j,t))",
                 2.0 : "tmp2 = weight(t)*(data1(i,t)-data2(j,t));" \
                       " tmp = tmp + tmp2*tmp2"}[p]
        except KeyError:
            code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data2(j,t)),p)"
            pass

        code = """
        int i,j,t;
        double tmp, tmp2;
        for (i=0; i<S1; i++) {
            for (j=0; j<S2; j++) {
                tmp = 0.0;
                for(t=0; t<F; t++) {
                    %s;
                }
                d(i,j) = tmp;
            }
        }
        return_val = 0;
        """ % code_peritem

        counter = weave.inline(code,
                               ['data1', 'data2', 'S1', 'S2',
                                'F', 'weight', 'd', 'p'],
                               type_converters=converters.blitz,
                               compiler = 'gcc')
        return d**(1.0/p)

else:
    # scipy.weave is not available -- fall back to the pure Python version
    pnorm_w = pnorm_w_python
    pass