
Source Code for Module mvpa.misc.transformers

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Simply functors that transform something.""" 
 10   
 11  _DEV_DOC = """ 
 12  Follow the convetion that functions start with lower case, and classes 
 13  with uppercase letter. 
 14  """ 
 15   
 16  __docformat__ = 'restructuredtext' 
 17   
 18   
 19  import numpy as N 
 20   
 21  from mvpa.base import externals, warning 
 22  from mvpa.misc.state import StateVariable, ClassWithCollections 
 23   
 24  if __debug__: 
 25      from mvpa.base import debug 
 26   
 27   
def Absolute(x):
    """
    Returns the elementwise absolute of any argument.

    :Parameter:
      x: scalar | sequence

    """
    return N.absolute(x)


def OneMinus(x):
    """Returns elementwise '1 - x' of any argument."""
    return 1 - x


def Identity(x):
    """Return whatever it was called with."""
    return x

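# Editorial usage sketch (not part of the original module): the elementwise
# transformers accept scalars and sequences alike; values are illustrative.
#
#   >>> Absolute(N.array([-1.5, 2.0]))
#   array([ 1.5,  2. ])
#   >>> OneMinus(N.array([0.25, 1.0]))
#   array([ 0.75,  0.  ])
#   >>> Identity('anything')
#   'anything'
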
def FirstAxisMean(x):
    """Mean computed along the first axis."""
    return N.mean(x, axis=0)


def FirstAxisSumNotZero(x):
    """Count (sum along the first axis) of the values that are not
    equal to zero."""
    return (N.asarray(x) != 0).sum(axis=0)

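# Editorial usage sketch (not part of the original module): on a 2D array
# the first axis runs over rows, so both reductions yield one value per
# column.
#
#   >>> data = N.array([[0, 1], [2, 3], [0, 5]])
#   >>> FirstAxisMean(data)
#   array([ 0.66666667,  3.        ])
#   >>> FirstAxisSumNotZero(data)
#   array([1, 3])
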
def SecondAxisMean(x):
    """Mean across the 2nd axis

    Use cases:
     - to combine multiple sensitivities to get a sense of the
       mean sensitivity across splits
    """
    return N.mean(x, axis=1)


def SecondAxisSumOfAbs(x):
    """Sum of absolute values along the 2nd axis

    Use cases:
     - to combine multiple sensitivities to get a sense of which
       features are most influential
    """
    return N.abs(x).sum(axis=1)


def SecondAxisMaxOfAbs(x):
    """Max of absolute values along the 2nd axis
    """
    return N.abs(x).max(axis=1)

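# Editorial usage sketch (not part of the original module): with one row
# per feature and one column per split, each reduction yields one value
# per feature.
#
#   >>> sens = N.array([[-1.0, 2.0], [0.5, -0.5]])
#   >>> SecondAxisMean(sens)
#   array([ 0.5,  0. ])
#   >>> SecondAxisSumOfAbs(sens)
#   array([ 3.,  1.])
#   >>> SecondAxisMaxOfAbs(sens)
#   array([ 2. ,  0.5])
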
def GrandMean(x):
    """Just what the name suggests."""
    return N.mean(x)

def L2Normed(x, norm=1.0, reverse=False):
    """Norm the values so that the regular (Euclidean) vector norm becomes `norm`

    More verbose: scale the vector so that the square root of the sum of
    the squared elements of the returned vector becomes `norm`.
    """
    # note: `reverse` is currently not used
    xnorm = N.linalg.norm(x)
    return x * (norm/xnorm)


def L1Normed(x, norm=1.0, reverse=False):
    """Norm the values so that the L_1 norm (sum of |x|) becomes `norm`"""
    # note: `reverse` is currently not used
    xnorm = N.sum(N.abs(x))
    return x * (norm/xnorm)

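# Editorial usage sketch (not part of the original module): for [3, 4] the
# L2 norm is 5 and the L1 norm is 7.
#
#   >>> L2Normed(N.array([3.0, 4.0]))
#   array([ 0.6,  0.8])
#   >>> L1Normed(N.array([3.0, 4.0]))
#   array([ 0.42857143,  0.57142857])
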
def RankOrder(x, reverse=False):
    """Rank-order by value. Highest gets 0"""

    # XXX could surely be simplified, e.g. via N.argsort
    nelements = len(x)
    indexes = N.arange(nelements)
    t_indexes = indexes
    if not reverse:
        t_indexes = indexes[::-1]
    tosort = zip(x, indexes)
    tosort.sort()
    ztosort = zip(tosort, t_indexes)
    rankorder = N.empty(nelements, dtype=int)
    rankorder[ [x[0][1] for x in ztosort] ] = \
               [x[1] for x in ztosort]
    return rankorder


def ReverseRankOrder(x):
    """Convenience functor"""
    return RankOrder(x, reverse=True)

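# Editorial usage sketch (not part of the original module): the highest
# value gets rank 0; ReverseRankOrder starts from the lowest instead.
#
#   >>> x = N.array([0.1, 0.5, 0.3])
#   >>> RankOrder(x)
#   array([2, 0, 1])
#   >>> ReverseRankOrder(x)
#   array([0, 2, 1])
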
class OverAxis(object):
    """Helper to apply a transformer over a specific axis
    """

    def __init__(self, transformer, axis=None):
        """Initialize transformer wrapper with an axis.

        :Parameters:
          transformer
            A callable to be used
          axis : None or int
            If None -- apply the transformer across all the data. If some
            int -- over that axis
        """
        self.transformer = transformer
        # sanity check
        if not (axis is None or isinstance(axis, int)):
            raise ValueError, "axis must be specified by an integer"
        self.axis = axis


    def __call__(self, x, *args, **kwargs):
        transformer = self.transformer
        axis = self.axis
        if axis is None:
            return transformer(x, *args, **kwargs)

        x = N.asanyarray(x)
        shape = x.shape
        if axis >= len(shape):
            raise ValueError, "Axis given in constructor %d is higher " \
                  "than dimensionality of the data of shape %s" % (axis, shape)

        # WRONG! ;-)
        #for ind in xrange(shape[axis]):
        #    results.append(transformer(x.take([ind], axis=axis),
        #                               *args, **kwargs))

        # TODO: more elegant/speedy solution?
        shape_sweep = shape[:axis] + shape[axis+1:]
        shrinker = None
        """Whether the transformer reduces the dimensionality of the data;
        decided upon the first result."""
        #results = N.empty(shape_out, dtype=x.dtype)
        for index_sweep in N.ndindex(shape_sweep):
            if axis > 0:
                index = index_sweep[:axis]
            else:
                index = ()
            index = index + (slice(None),) + index_sweep[axis:]
            x_sel = x[index]
            x_t = transformer(x_sel, *args, **kwargs)
            if shrinker is None:
                if N.isscalar(x_t) or x_t.shape == shape_sweep:
                    results = N.empty(shape_sweep, dtype=x.dtype)
                    shrinker = True
                elif x_t.shape == x_sel.shape:
                    results = N.empty(x.shape, dtype=x.dtype)
                    shrinker = False
                else:
                    raise RuntimeError, \
                          'Not handled by OverAxis kind of transformer'

            if shrinker:
                results[index_sweep] = x_t
            else:
                results[index] = x_t

        return results

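# Editorial usage sketch (not part of the original module): wrapping a
# transformer so it is applied along a chosen axis of a 2D array; values
# are illustrative only.
#
#   >>> data = N.array([[3.0, 4.0, 0.0], [6.0, 8.0, 0.0]])
#   >>> OverAxis(L2Normed, axis=1)(data)    # L2-norm each row
#   array([[ 0.6,  0.8,  0. ],
#          [ 0.6,  0.8,  0. ]])
#   >>> OverAxis(GrandMean, axis=0)(data)   # mean of each column
#   array([ 4.5,  6. ,  0. ])
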
class DistPValue(ClassWithCollections):
    """Converts values into p-values under vague and non-scientific assumptions
    """

    nulldist_number = StateVariable(enabled=True,
        doc="Number of features within the estimated null-distribution")

    positives_recovered = StateVariable(enabled=True,
        doc="Number of features considered to be positives and which were recovered")


    def __init__(self, sd=0, distribution='rdist', fpp=None, nbins=400, **kwargs):
        """L2-norm the values, convert them to p-values of a given distribution.

        :Parameters:
          sd : int
            Samples dimension (if len(x.shape)>1) on which to operate
          distribution : string
            Which distribution to use. Known is 'rdist' (normal should be
            added later as well)
          fpp : float
            If not None, the p-value (both tails) at which to control for
            false positives. The tails (tentative real positives) are pruned
            iteratively until the empirical p-value becomes less than or
            equal to the numerical one.
          nbins : int
            Number of bins for the iterative pruning of positives

        WARNING: Highly experimental/slow/etc: no theoretical grounds have been
        presented in any paper, nor proven
        """
        externals.exists('scipy', raiseException=True)
        ClassWithCollections.__init__(self, **kwargs)

        self.sd = sd
        if not (distribution in ['rdist']):
            raise ValueError, "Actually only rdist is supported at the" \
                  " moment, got %s" % distribution
        self.distribution = distribution
        self.fpp = fpp
        self.nbins = nbins

    def __call__(self, x):
        from mvpa.support.stats import scipy
        import scipy.stats as stats

        # some local bindings
        distribution = self.distribution
        sd = self.sd
        fpp = self.fpp
        nbins = self.nbins

        x = N.asanyarray(x)
        shape_orig = x.shape
        ndims = len(shape_orig)

        # (very) old numpy had a different format of returned bins --
        # those were not edges but center points. We care here about
        # center points, so we will transform edge points into center
        # points for newer versions of numpy
        numpy_center_points = externals.versions['numpy'] < (1, 1)

        # XXX Maybe just utilize the OverAxis transformer?
        if ndims > 2:
            raise NotImplementedError, \
                  "TODO: add support for more than 2 dimensions"
        elif ndims == 1:
            x, sd = x[:, N.newaxis], 0

        # lets transpose for convenience
        if sd == 0: x = x.T

        # Output p-values of x in the null-distribution
        pvalues = N.zeros(x.shape)
        nulldist_number, positives_recovered = [], []

        # finally go through all data
        nd = x.shape[1]
        if __debug__:
            if nd < x.shape[0]:
                warning("Number of features in DistPValue is lower than the"
                        " number of items -- an incorrect sd=%d may have been"
                        " provided" % sd)
        for i, xx in enumerate(x):
            dist = stats.rdist(nd-1, 0, 1)
            xx /= N.linalg.norm(xx)

            if fpp is not None:
                if __debug__:
                    debug('TRAN_', "starting adaptive adjustment i=%d" % i)

                # Adaptive adjustment for false negatives:
                Nxx, xxx, pN_emp_prev = len(xx), xx, 1.0
                Nxxx = Nxx
                indexes = N.arange(Nxx)
                """What features belong to the Null-distribution"""
                while True:
                    hist, bins = N.histogram(xxx, bins=nbins, normed=False)
                    pdf = hist.astype(float)/Nxxx
                    if not numpy_center_points:
                        # if we obtain edge points for bins -- take centers
                        bins = 0.5 * (bins[0:-1] + bins[1:])
                    bins_halfstep = (bins[1] - bins[0])/2.0

                    # theoretical CDF
                    # was really unstable -- now got better ;)
                    dist_cdf = dist.cdf(bins)

                    # otherwise just recompute manually
                    # dist_pdf = dist.pdf(bins)
                    # dist_pdf /= N.sum(dist_pdf)

                    # XXX can't recall the function... silly
                    # probably could use N.integrate
                    cdf = N.zeros(nbins, dtype=float)
                    #dist_cdf = cdf.copy()
                    dist_prevv = cdf_prevv = 0.0
                    for j in range(nbins):
                        cdf_prevv = cdf[j] = cdf_prevv + pdf[j]
                        #dist_prevv = dist_cdf[j] = dist_prevv + dist_pdf[j]

                    # which bins fall into the theoretical 'positives' in both tails
                    p = (0.5 - N.abs(dist_cdf - 0.5) < fpp/2.0)

                    # amount in the empirical tails -- if we match the
                    # theoretical, the total should be >= p

                    pN_emp = N.sum(pdf[p]) # / (1.0 * nbins)

                    if __debug__:
                        debug('TRAN_', "empirical p=%.3f for theoretical p=%.3f"
                              % (pN_emp, fpp))

                    if pN_emp <= fpp:
                        # we are done
                        break

                    if pN_emp > pN_emp_prev:
                        if __debug__:
                            debug('TRAN_', "Diverging -- thus keeping last result "
                                  "with p=%.3f" % pN_emp_prev)
                        # we had better restore the previous result
                        indexes, xxx, dist = indexes_prev, xxx_prev, dist_prev
                        break

                    pN_emp_prev = pN_emp
                    # very silly way for now -- just proceed by 1 bin
                    keep = N.logical_and(xxx > bins[0],  # + bins_halfstep,
                                         xxx < bins[-1]) # - bins_halfstep)
                    if __debug__:
                        debug('TRAN_', "Keeping %d out of %d elements" %
                              (N.sum(keep), Nxxx))

                    # Preserve them in case we need to "roll back"
                    indexes_prev, xxx_prev, dist_prev = indexes, xxx, dist

                    # we should remove those which we assume to be positives and
                    # which should not belong to the Null-dist
                    xxx, indexes = xxx[keep], indexes[keep]
                    # L2-renorm it
                    xxx = xxx / N.linalg.norm(xxx)
                    Nxxx = len(xxx)
                    dist = stats.rdist(Nxxx-1, 0, 1)

                Nindexes = len(indexes)
                Nrecovered = Nxx - Nindexes

                nulldist_number += [Nindexes]
                positives_recovered += [Nrecovered]

                if __debug__:
                    if distribution == 'rdist':
                        assert(dist.args[0] == Nindexes-1)
                    debug('TRAN', "Positives recovery finished with %d out of %d "
                          "entries in the Null-distribution, thus %d positives "
                          "were recovered" % (Nindexes, Nxx, Nrecovered))

            # And now we need to perform our duty -- assign p-values
            #dist = stats.rdist(Nindexes-1, 0, 1)
            pvalues[i, :] = dist.cdf(xx)

        # XXX we might add an option to transform it to z-scores?
        result = pvalues

        # charge state variables
        # XXX might want to populate them for non-adaptive handling as well
        self.nulldist_number = nulldist_number
        self.positives_recovered = positives_recovered

        # transpose if needed
        if sd == 0:
            result = result.T

        return result
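
# Editorial usage sketch (not part of the original module): converting a
# column of scores into p-values; requires scipy, and the data values are
# illustrative only.
#
#   >>> dpv = DistPValue(sd=0)
#   >>> scores = N.random.normal(size=(100, 1))
#   >>> pvals = dpv(scores)
#   >>> pvals.shape
#   (100, 1)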