1
2
3
4
5
6
7
8
9
10 """FeaturewiseDatasetMeasure performing multivariate Iterative RELIEF
11 (I-RELIEF) algorithm.
12 See : Y. Sun, Iterative RELIEF for Feature Weighting: Algorithms, Theories,
13 and Applications, IEEE Trans. on Pattern Analysis and Machine Intelligence
14 (TPAMI), vol. 29, no. 6, pp. 1035-1051, June 2007."""
15
16
17 __docformat__ = 'restructuredtext'
18
19 import numpy as N
20
21 from mvpa.measures.base import FeaturewiseDatasetMeasure
22 from mvpa.clfs.kernel import KernelSquaredExponential, KernelExponential, \
23 KernelMatern_3_2, KernelMatern_5_2
24 from mvpa.clfs.distance import pnorm_w
25
26 if __debug__:
27 from mvpa.base import debug
28
29
31 """`FeaturewiseDatasetMeasure` that performs multivariate I-RELIEF
32 algorithm. Batch version allowing various kernels.
33
34     UNDER DEVELOPMENT.
35
36 Batch I-RELIEF-2 feature weighting algorithm. Works for binary or
37 multiclass class-labels. Batch version with complexity O(T*N^2*I),
38 where T is the number of iterations, N the number of instances, I
39 the number of features.
40
41 See: Y. Sun, Iterative RELIEF for Feature Weighting: Algorithms,
42 Theories, and Applications, IEEE Trans. on Pattern Analysis and
43 Machine Intelligence (TPAMI), vol. 29, no. 6, pp. 1035-1051, June
44 2007. http://plaza.ufl.edu/sunyijun/Paper/PAMI_1.pdf
45
46 Note that current implementation allows to use only
47 exponential-like kernels. Support for linear kernel will be
48 added later.
49 """
50 - def __init__(self, threshold = 1.0e-2, kernel = None, kernel_width = 1.0,
51 w_guess = None, **kwargs):
68
69
71 """Compute hit/miss dictionaries.
72
73 For each instance compute the set of indices having the same
74 class label and different class label.
75
76 Note that this computation is independent of the number of
77 features.
78 """
79
80 M = {}
81 H = {}
82 for i in range(label.size):
83 M[i] = N.where(label != label[i])[0]
84 tmp = (N.where(label == label[i])[0]).tolist()
85 tmp.remove(i)
86
87 assert(tmp != [])
88 H[i] = N.array(tmp)
89
90 return M, H
91
92
93 - def _call(self, dataset):
94 """Computes featurewise I-RELIEF weights."""
95 samples = dataset.samples
96 NS, NF = samples.shape[:2]
97 if self.w_guess == None:
98 self.w = N.ones(NF, 'd')
99
100 self.w = self.w/(self.w**2).sum()
101
102 M, H = self.compute_M_H(dataset.labels)
103
104 while True:
105 self.k = self.kernel(length_scale = self.kernel_width/self.w)
106 d_w_k = self.k.compute(samples)
107
108
109
110
111 d_w_k[N.abs(d_w_k-1.0) < 1.0e-15] = 0.0
112 ni = N.zeros(NF, 'd')
113 for n in range(NS):
114
115 gamma_n = 1.0 - N.nan_to_num(d_w_k[n, M[n]].sum() \
116 / (d_w_k[n, :].sum()-d_w_k[n, n]))
117 alpha_n = N.nan_to_num(d_w_k[n, M[n]]/(d_w_k[n, M[n]].sum()))
118 beta_n = N.nan_to_num(d_w_k[n, H[n]]/(d_w_k[n, H[n]].sum()))
119
120 m_n = (N.abs(samples[n, :] - samples[M[n], :]) \
121 * alpha_n[:, None]).sum(0)
122 h_n = (N.abs(samples[n, :] - samples[H[n], :]) \
123 * beta_n[:, None]).sum(0)
124 ni += gamma_n*(m_n-h_n)
125 ni = ni/NS
126
127 ni_plus = N.clip(ni, 0.0, N.inf)
128 w_new = N.nan_to_num(ni_plus/(N.sqrt((ni_plus**2).sum())))
129 change = N.abs(w_new-self.w).sum()
130 if __debug__ and 'IRELIEF' in debug.active:
131 debug('IRELIEF',
132 "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d"
133 % (change, w_new.max(), w_new.min(), w_new.mean(),
134 w_new.std(), N.isnan(w_new).sum()))
135
136
137 self.w = w_new
138 if change < self.threshold:
139 break
140
141 return self.w
142
143
145 """`FeaturewiseDatasetMeasure` that performs multivariate I-RELIEF
146 algorithm. Online version.
147
148 UNDER DEVELOPMENT
149
150 Online version with complexity O(T*N*I),
151 where N is the number of instances and I the number of features.
152
153 See: Y. Sun, Iterative RELIEF for Feature Weighting: Algorithms,
154 Theories, and Applications, IEEE Trans. on Pattern Analysis and
155 Machine Intelligence (TPAMI), vol. 29, no. 6, pp. 1035-1051, June
156 2007. http://plaza.ufl.edu/sunyijun/Paper/PAMI_1.pdf
157
158 Note that this implementation is not fully online, since hit and
159 miss dictionaries (H,M) are computed once at the beginning using
160 full access to all labels. This can be easily corrected to a full
161 online implementation. But this is not mandatory now since the
162 major goal of this current online implementation is reduction of
163 computational complexity.
164 """
165
166 - def __init__(self, a=5.0, permute=True, max_iter=3, **kwargs):
167 """Constructor of the IRELIEF class.
168
169 """
170
171 IterativeRelief_Devel.__init__(self, **kwargs)
172
173 self.a = a
174 self.permute = permute
175 self.max_iter = max_iter
176
177
178 - def _call(self, dataset):
179 """Computes featurewise I-RELIEF-2 weights. Online version."""
180 NS = dataset.samples.shape[0]
181 NF = dataset.samples.shape[1]
182 if self.w_guess == None:
183 self.w = N.ones(NF, 'd')
184
185 self.w = self.w/(self.w**2).sum()
186
187 M, H = self.compute_M_H(dataset.labels)
188
189 ni = N.zeros(NF, 'd')
190 pi = N.zeros(NF, 'd')
191
192 if self.permute:
193
194 random_sequence = N.random.permutation(NS)
195 else:
196 random_sequence = N.arange(NS)
197
198 change = self.threshold + 1.0
199 iteration = 0
200 counter = 0.0
201 while change > self.threshold and iteration < self.max_iter:
202 if __debug__:
203 debug('IRELIEF', "Iteration %d" % iteration)
204
205 for t in range(NS):
206 counter += 1.0
207 n = random_sequence[t]
208
209 self.k = self.kernel(length_scale = self.kernel_width/self.w)
210 d_w_k_xn_Mn = self.k.compute(dataset.samples[None, n, :],
211 dataset.samples[M[n], :]).squeeze()
212 d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum()
213 d_w_k_xn_x = self.k.compute(dataset.samples[None, n, :],
214 dataset.samples).squeeze()
215 gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum()
216 alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum
217
218 d_w_k_xn_Hn = self.k.compute(dataset.samples[None, n, :],
219 dataset.samples[H[n], :]).squeeze()
220 beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum()
221
222 m_n = (N.abs(dataset.samples[n, :] - dataset.samples[M[n], :]) \
223 * alpha_n[:, N.newaxis]).sum(0)
224 h_n = (N.abs(dataset.samples[n, :] - dataset.samples[H[n], :]) \
225 * beta_n[:, N.newaxis]).sum(0)
226 pi = gamma_n * (m_n-h_n)
227 learning_rate = 1.0 / (counter * self.a + 1.0)
228 ni_new = ni + learning_rate * (pi - ni)
229 ni = ni_new
230
231
232 ni_plus = N.clip(ni, 0.0, N.inf)
233 w_new = N.nan_to_num(ni_plus / (N.sqrt((ni_plus ** 2).sum())))
234 change = N.abs(w_new - self.w).sum()
235 if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active:
236 debug('IRELIEF',
237 "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f"
238 " #nan=%d" %
239 (t, change, w_new.max(), w_new.min(), w_new.mean(),
240 w_new.std(), N.isnan(w_new).sum()))
241
242 self.w = w_new
243
244 if change < self.threshold and iteration > 0:
245 break
246
247 iteration += 1
248
249 return self.w
250
251
252
254 """`FeaturewiseDatasetMeasure` that performs multivariate I-RELIEF
255 algorithm. Batch version.
256
257 Batch I-RELIEF-2 feature weighting algorithm. Works for binary or
258 multiclass class-labels. Batch version with complexity O(T*N^2*I),
259 where T is the number of iterations, N the number of instances, I
260 the number of features.
261
262 See: Y. Sun, Iterative RELIEF for Feature Weighting: Algorithms,
263 Theories, and Applications, IEEE Trans. on Pattern Analysis and
264 Machine Intelligence (TPAMI), vol. 29, no. 6, pp. 1035-1051, June
265 2007. http://plaza.ufl.edu/sunyijun/Paper/PAMI_1.pdf
266
267 Note that current implementation allows to use only
268 exponential-like kernels. Support for linear kernel will be
269 added later.
270 """
271 - def __init__(self, threshold=1.0e-2, kernel_width=1.0,
272 w_guess=None, **kwargs):
284
285
287 """Compute hit/miss dictionaries.
288
289 For each instance compute the set of indices having the same
290 class label and different class label.
291
292 Note that this computation is independent of the number of
293 features.
294
295 XXX should it be some generic function since it doesn't use self
296 """
297
298 M = {}
299 H = {}
300 for i in range(label.size):
301 M[i] = N.where(label != label[i])[0]
302 tmp = (N.where(label == label[i])[0]).tolist()
303 tmp.remove(i)
304
305 assert(tmp != [])
306 H[i] = N.array(tmp)
307
308 return M, H
309
310
311 - def k(self, distances):
312 """Exponential kernel."""
313 kd = N.exp(-distances/self.kernel_width)
314
315 kd[N.abs(distances) < 1.0e-15] = 0.0
316 return kd
317
318
319 - def _call(self, dataset):
320 """Computes featurewise I-RELIEF weights."""
321 samples = dataset.samples
322 NS, NF = samples.shape[:2]
323
324 if self.w_guess == None:
325 w = N.ones(NF, 'd')
326
327 w /= (w ** 2).sum()
328
329 M, H = self.compute_M_H(dataset.labels)
330
331 while True:
332 d_w_k = self.k(pnorm_w(data1=samples, weight=w, p=1))
333 ni = N.zeros(NF, 'd')
334 for n in range(NS):
335
336 gamma_n = 1.0 - N.nan_to_num(d_w_k[n, M[n]].sum() \
337 / (d_w_k[n, :].sum() - d_w_k[n, n]))
338 alpha_n = N.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum()))
339 beta_n = N.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))
340
341 m_n = (N.abs(samples[n, :] - samples[M[n], :]) \
342 * alpha_n[:, None]).sum(0)
343 h_n = (N.abs(samples[n, :] - samples[H[n], :]) \
344 * beta_n[:, None]).sum(0)
345 ni += gamma_n*(m_n - h_n)
346
347 ni = ni / NS
348
349 ni_plus = N.clip(ni, 0.0, N.inf)
350 w_new = N.nan_to_num(ni_plus / (N.sqrt((ni_plus**2).sum())))
351 change = N.abs(w_new - w).sum()
352 if __debug__ and 'IRELIEF' in debug.active:
353 debug('IRELIEF',
354 "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" \
355 % (change, w_new.max(), w_new.min(), w_new.mean(),
356 w_new.std(), N.isnan(w_new).sum()))
357
358
359 w = w_new
360 if change < self.threshold:
361 break
362
363 self.w = w
364 return w
365
366
368 """`FeaturewiseDatasetMeasure` that performs multivariate I-RELIEF
369 algorithm. Online version.
370
371 This algorithm is exactly the one in the referenced paper
372 (I-RELIEF-2 online), using weighted 1-norm and Exponential
373 Kernel.
374 """
375
376 - def __init__(self, a=10.0, permute=True, max_iter=3, **kwargs):
377 """Constructor of the IRELIEF class.
378
379 """
380
381 IterativeRelief.__init__(self, **kwargs)
382
383 self.a = a
384 self.permute = permute
385 self.max_iter = max_iter
386
387
388 - def _call(self, dataset):
389 """Computes featurewise I-RELIEF-2 weights. Online version."""
390
391 samples = dataset.samples
392 NS, NF = samples.shape[:2]
393 threshold = self.threshold
394 a = self.a
395
396 if self.w_guess == None:
397 w = N.ones(NF, 'd')
398
399
400 w /= (w ** 2).sum()
401
402 M, H = self.compute_M_H(dataset.labels)
403
404 ni = N.zeros(NF, 'd')
405 pi = N.zeros(NF, 'd')
406
407 if self.permute:
408
409 random_sequence = N.random.permutation(NS)
410 else:
411 random_sequence = N.arange(NS)
412
413 change = threshold + 1.0
414 iteration = 0
415 counter = 0.0
416 while change > threshold and iteration < self.max_iter:
417 if __debug__:
418 debug('IRELIEF', "Iteration %d" % iteration)
419 for t in range(NS):
420 counter += 1.0
421 n = random_sequence[t]
422
423 d_xn_x = N.abs(samples[n, :] - samples)
424 d_w_k_xn_x = self.k((d_xn_x * w).sum(1))
425
426 d_w_k_xn_Mn = d_w_k_xn_x[M[n]]
427 d_w_k_xn_Mn_sum = d_w_k_xn_Mn.sum()
428
429 gamma_n = 1.0 - d_w_k_xn_Mn_sum / d_w_k_xn_x.sum()
430 alpha_n = d_w_k_xn_Mn / d_w_k_xn_Mn_sum
431
432 d_w_k_xn_Hn = d_w_k_xn_x[H[n]]
433 beta_n = d_w_k_xn_Hn / d_w_k_xn_Hn.sum()
434
435 m_n = (d_xn_x[M[n], :] * alpha_n[:, None]).sum(0)
436 h_n = (d_xn_x[H[n], :] * beta_n[:, None]).sum(0)
437 pi = gamma_n * (m_n - h_n)
438 learning_rate = 1.0 / (counter * a + 1.0)
439 ni_new = ni + learning_rate * (pi - ni)
440 ni = ni_new
441
442
443 ni_plus = N.clip(ni, 0.0, N.inf)
444 w_new = N.nan_to_num(ni_plus / (N.sqrt((ni_plus ** 2).sum())))
445 change = N.abs(w_new - w).sum()
446 if t % 10 == 0 and __debug__ and 'IRELIEF' in debug.active:
447 debug('IRELIEF',
448 "t=%d change=%.4f max=%f min=%.4f mean=%.4f std=%.4f"
449 " #nan=%d" %
450 (t, change, w_new.max(), w_new.min(), w_new.mean(),
451 w_new.std(), N.isnan(w_new).sum()))
452
453 w = w_new
454
455 if change < threshold and iteration > 0:
456 break
457
458 iteration += 1
459
460 self.w = w
461 return w
462