9 """Distance functions to be used in kernels and elsewhere
10 """
11
12 __docformat__ = 'restructuredtext'
13
14
15
16
17
18
19 import numpy as N
20 from mvpa.base import externals
21
22 if __debug__:
23 from mvpa.base import debug, warning


def cartesianDistance(a, b):
    """Return Cartesian distance between a and b
28 """
29 return N.linalg.norm(a-b)


def absminDistance(a, b):
    """Returns distance max(\|a-b\|)
    XXX There must be better name!
    XXX Actually, why is it absmin not absmax?

    Useful to select a whole cube of a given "radius"
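
    Illustrative example (the largest per-feature offset wins):

    >>> print absminDistance(N.array([1, 2]), N.array([3, 1]))
    2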
38 """
39 return max(abs(a-b))


def manhattenDistance(a, b):
    """Return Manhattan distance between a and b
44 """
45 return sum(abs(a-b))


def mahalanobisDistance(x, y=None, w=None):
    """Calculate Mahalanobis distance of the pairs of points.

    :Parameters:
      `x`
        first list of points. Rows are samples, columns are
        features.
      `y`
        second list of points (optional)
      `w` : N.ndarray
        optional inverse covariance matrix between the points. It is
        computed if not given

    Inverse covariance matrix can be calculated with the following

      w = N.linalg.solve(N.cov(x.T), N.identity(x.shape[1]))

    or

      w = N.linalg.inv(N.cov(x.T))
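
    Illustrative example (only the shape is checked, since the values
    depend on the random data):

    >>> x = N.random.rand(10, 3)
    >>> d = mahalanobisDistance(x)
    >>> print d.shape
    (10, 10)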
68 """
69
70 if y is None:
71
72
73 if w is None:
74 w = N.linalg.inv(N.cov(x.T))
75
76
77 mx, nx = x.shape
78
79
80
81 d = N.zeros((mx, mx), dtype=N.float32)
82 for i in range(mx-1):
83
84 xi = x[i, :]
85
86 xi = xi[N.newaxis, :].repeat(mx-i-1, axis=0)
87
88 dc = x[i+1:mx, :] - xi
89
90 d[i+1:mx, i] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
91
92 d[i, i+1:mx] = d[i+1:mx, i].T
93 else:
94
95
96 if w is None:
97
98 w = N.linalg.inv(N.cov(N.concatenate((x, y)).T))
99
100
101 mx, nx = x.shape
102 my, ny = y.shape
103
104
105 d = N.zeros((mx, my), dtype=N.float32)
106
107
108 if mx <= my:
109
110 for i in range(mx):
111
112 xi = x[i, :]
113
114 xi = xi[N.newaxis, :].repeat(my, axis=0)
115
116 dc = xi - y
117
118 d[i, :] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
119 else:
120
121 for j in range(my):
122
123 yj = y[j, :]
124
125 yj = yj[N.newaxis, :].repeat(mx, axis=0)
126
127 dc = x - yj
128
129 d[:, j] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
130
131
132 return N.sqrt(d)


def squared_euclidean_distance(data1, data2=None, weight=None):
    """Compute weighted euclidean distance matrix between two datasets.

    :Parameters:
      data1 : N.ndarray
        first dataset
      data2 : N.ndarray
        second dataset. If None, compute the euclidean distance between
        the first dataset versus itself.
        (Defaults to None)
      weight : N.ndarray
        vector of weights, each one associated to each dimension of the
        dataset (Defaults to None)
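
    Illustrative example (shape check on random floating point data):

    >>> d1 = N.random.rand(5, 8)
    >>> d2 = N.random.rand(3, 8)
    >>> print squared_euclidean_distance(d1, d2).shape
    (5, 3)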
149 """
150 if __debug__:
151
152 if not N.issubdtype(data1.dtype, 'f') \
153 or (data2 is not None and not N.issubdtype(data2.dtype, 'f')):
154 warning('Computing euclidean distance on integer data ' \
155 'is not supported.')
    if weight is None:
        data1w = data1
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2
    else:
        data1w = data1 * weight
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2 * weight
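
    # The computation below exploits the binomial expansion
    #   ||a - b||_w**2 = sum(w*a*a) - 2*sum(w*a*b) + sum(w*b*b)
    # with the weight folded into one factor of each product.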
    squared_euclidean_distance_matrix = \
        (data1w * data1).sum(1)[:, None] \
        - 2 * N.dot(data1w, data2.T) \
        + (data2 * data2w).sum(1)

    # negative values can appear due to rounding errors; report them if
    # requested and clip them to zero
    less0 = squared_euclidean_distance_matrix < 0
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        less0num = N.sum(less0)
        if less0num > 0:
            norm0 = N.linalg.norm(squared_euclidean_distance_matrix[less0])
            totalnorm = N.linalg.norm(squared_euclidean_distance_matrix)
            if totalnorm != 0 and norm0 / totalnorm > 1e-8:
                warning("Found %d elements out of %d unstable (<0) in " \
                        "computation of squared_euclidean_distance_matrix. " \
                        "Their norm is %s when total norm is %s" % \
                        (less0num, less0.size, norm0, totalnorm))
    squared_euclidean_distance_matrix[less0] = 0
    return squared_euclidean_distance_matrix


def oneMinusCorrelation(X, Y):
    """Return one minus the correlation matrix between the rows of two matrices.

    This function computes a matrix of correlations between all pairs of
    rows of two matrices. Unlike NumPy's corrcoef() it only considers
    pairs across matrices, i.e. both elements of a pair never come from
    the same matrix.

    Both arrays need to have the same number of columns.

    :Parameters:
      X: 2D-array
      Y: 2D-array

    Example:

    >>> X = N.random.rand(20,80)
    >>> Y = N.random.rand(5,80)
    >>> C = oneMinusCorrelation(X, Y)
    >>> print C.shape
    (20, 5)
    """

    if __debug__:
        if not X.shape[1] == Y.shape[1]:
            raise ValueError, 'correlation() requires two matrices with the ' \
                              'same #columns (Got: %s and %s)' \
                              % (X.shape, Y.shape)

    # standardize each row to zero mean and unit variance
    Zx = X - N.c_[X.mean(axis=1)]
    Zx /= N.c_[X.std(axis=1)]
    Zy = Y - N.c_[Y.mean(axis=1)]
    Zy /= N.c_[Y.std(axis=1)]

    # correlation of standardized rows is just their scaled dot product
    C = ((N.matrix(Zx) * N.matrix(Zy).T) / Zx.shape[1]).A

    # turn correlation into a distance: since C <= 1, abs(C - 1) == 1 - C
    C -= 1.0

    return N.abs(C)


def pnorm_w_python(data1, data2=None, weight=None, p=2,
                   heuristic='auto', use_sq_euclidean=True):
    """Weighted p-norm between two datasets (pure Python implementation)

    ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

    :Parameters:
      data1 : N.ndarray
        First dataset
      data2 : N.ndarray or None
        Optional second dataset
      weight : N.ndarray or None
        Optional weights per 2nd dimension (features)
      p
        Power
      heuristic : basestring
        Which heuristic to use:
         * 'samples' -- python sweep over 0th dim
         * 'features' -- python sweep over 1st dim
         * 'auto' decides automatically. If # of features (shape[1]) is much
           larger than # of samples (shape[0]) -- use 'samples', and use
           'features' otherwise
      use_sq_euclidean : bool
        Whether to use squared_euclidean_distance_matrix for computation
        if p==2
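
    Illustrative example (p=1 with unit weights reduces to the Manhattan
    distance between the two samples):

    >>> d = pnorm_w_python(N.array([[0., 0.], [1., 1.]]), p=1)
    >>> print d[0, 1], d[1, 0]
    2.0 2.0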
280 """
    if weight is None:
        weight = N.ones(data1.shape[1], 'd')
        pass

    if p == 2 and use_sq_euclidean:
        return N.sqrt(squared_euclidean_distance(data1=data1, data2=data2,
                                                 weight=weight**2))

    if data2 is None:
        data2 = data1
        pass

    S1, F1 = data1.shape[:2]
    S2, F2 = data2.shape[:2]

    if not (F1==F2==weight.size):
        raise ValueError, \
              "Datasets should have same #columns == #weights. Got " \
              "%d %d %d" % (F1, F2, weight.size)
    d = N.zeros((S1, S2), 'd')

    # avoid the pointless pow()/root when p == 1
    if p == 1:
        pf = lambda x:x
        af = lambda x:x
    else:
        pf = lambda x:x ** p
        af = lambda x:x ** (1.0/p)

    # heuristic: sweep over whichever dimension takes fewer iterations
    if heuristic == 'auto':
        heuristic = {False: 'samples',
                     True: 'features'}[(F1/S1) < 500]

    if heuristic == 'features':
        # python sweep over features -- efficient for many samples
        for NF in range(F1):
            d += pf(N.abs(N.subtract.outer(data1[:,NF],
                                           data2[:,NF]))*weight[NF])
        pass
    elif heuristic == 'samples':
        # python sweep over samples -- efficient for many features
        for NS in xrange(S1):
            dfw = pf(N.abs(data1[NS] - data2) * weight)
            d[NS] = N.sum(dfw, axis=1)
        pass
    else:
        raise ValueError, "Unknown heuristic '%s'. Need one of " \
              "'auto', 'samples', 'features'" % heuristic
    return af(d)


if externals.exists('weave'):
    from scipy import weave
    from scipy.weave import converters

    def pnorm_w(data1, data2=None, weight=None, p=2):
        """Weighted p-norm between two datasets (scipy.weave implementation)

        ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

        :Parameters:
          data1 : N.ndarray
            First dataset
          data2 : N.ndarray or None
            Optional second dataset
          weight : N.ndarray or None
            Optional weights per 2nd dimension (features)
          p
            Power
        """
        if weight is None:
            weight = N.ones(data1.shape[1], 'd')
            pass
        S1, F1 = data1.shape[:2]
        code = ""
        if data2 is None or data1 is data2:
            # pairwise distances within a single dataset
            if not (F1==weight.size):
                raise ValueError, \
                      "Dataset should have same #columns == #weights. Got " \
                      "%d %d" % (F1, weight.size)
            F = F1
            d = N.zeros((S1, S1), 'd')
            try:
                # specialized C expressions for the common cases p=1 and p=2
                code_peritem = \
                    {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data1(j,t))",
                     2.0 : "tmp2 = weight(t)*(data1(i,t)-data1(j,t));" \
                           " tmp = tmp + tmp2*tmp2"}[p]
            except KeyError:
                code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data1(j,t)),p)"

            code = """
            int i,j,t;
            double tmp, tmp2;
            for (i=0; i<S1-1; i++) {
                for (j=i+1; j<S1; j++) {
                    tmp = 0.0;
                    for(t=0; t<F; t++) {
                        %s;
                    }
                    d(i,j) = tmp;
                }
            }
            return_val = 0;
            """ % code_peritem

            counter = weave.inline(code,
                                   ['data1', 'S1', 'F', 'weight', 'd', 'p'],
                                   type_converters=converters.blitz,
                                   compiler = 'gcc')
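
            # the C loop filled only the strict upper triangle; mirror it
            # to obtain the full symmetric distance matrix (diagonal is 0)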
            d = d + N.triu(d).T
            return d**(1.0/p)

        # distances between two distinct datasets
        S2, F2 = data2.shape[:2]
        if not (F1==F2==weight.size):
            raise ValueError, \
                  "Datasets should have same #columns == #weights. Got " \
                  "%d %d %d" % (F1, F2, weight.size)
        F = F1
        d = N.zeros((S1, S2), 'd')
        try:
            code_peritem = \
                {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data2(j,t))",
                 2.0 : "tmp2 = weight(t)*(data1(i,t)-data2(j,t));" \
                       " tmp = tmp + tmp2*tmp2"}[p]
        except KeyError:
            code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data2(j,t)),p)"
            pass

        code = """
        int i,j,t;
        double tmp, tmp2;
        for (i=0; i<S1; i++) {
            for (j=0; j<S2; j++) {
                tmp = 0.0;
                for(t=0; t<F; t++) {
                    %s;
                }
                d(i,j) = tmp;
            }
        }
        return_val = 0;
        """ % code_peritem

        counter = weave.inline(code,
                               ['data1', 'data2', 'S1', 'S2',
                                'F', 'weight', 'd', 'p'],
                               type_converters=converters.blitz,
                               compiler = 'gcc')
        return d**(1.0/p)

else:
    # scipy.weave is not available -- fall back to the pure Python version
    pnorm_w = pnorm_w_python
    pass