
Source Code for Module mvpa.clfs.stats

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Estimator for classifier error distributions."""

__docformat__ = 'restructuredtext'

import numpy as N

from mvpa.base import externals, warning
from mvpa.misc.state import ClassWithCollections, StateVariable

if __debug__:
    from mvpa.base import debug


class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples, correction='clip'):
        """
        :Parameters:
          dist_samples : ndarray
            Samples to be used to assess the distribution.
          correction : {'clip'} or None, optional
            Determines the behavior when .cdf is queried.  If None -- no
            correction is made.  If 'clip' -- values are clipped to lie
            in the range [1/(N+2), (N+1)/(N+2)] (simply because
            non-parametric assessment lacks the power to resolve with
            higher precision in the tails, so 'imaginary' samples are
            placed in each of the two tails).
        """
        self._dist_samples = N.ravel(dist_samples)
        self._correction = correction

    def __repr__(self):
        return '%s(%r%s)' % (
            self.__class__.__name__,
            self._dist_samples,
            ('', ', correction=%r' % self._correction)
            [int(self._correction != 'clip')])

    @staticmethod
    def fit(dist_samples):
        return [dist_samples]

    def cdf(self, x):
        """Returns the cdf value at `x`.
        """
        dist_samples = self._dist_samples
        res = N.vectorize(lambda v: (dist_samples <= v).mean())(x)
        if self._correction == 'clip':
            nsamples = len(dist_samples)
            N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
        elif self._correction is None:
            pass
        else:
            raise ValueError, \
                  '%r is incorrect value for correction parameter of %s' \
                  % (self._correction, self.__class__.__name__)
        return res


def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails
        contains 1s if the value came from the right tail, and 0s if it
        came from the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, cdf_min, cdf_max))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # for a two-tailed test we need to halve the significance,
            # i.e. double the p-value
            cdf *= 2

    # assure that NaNs do not get a significant value
    cdf[N.isnan(x)] = 1.0
    if is_scalar: res = cdf[0]
    else:         res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res
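

# Example (illustrative sketch, not part of the original module): deriving
# empirical p-values with Nonparametric and _pvalue.  The data here is
# made up.
#
#if True:
#    samples = N.random.normal(size=1000)
#    nonparam = Nonparametric(samples)
#    # fraction of the stored samples <= 0.0; with the default
#    # correction='clip' the result stays within [1/(N+2), (N+1)/(N+2)],
#    # i.e. never exactly 0 or 1
#    print nonparam.cdf(0.0)
#    # right-tail p-value of an extreme score under this empirical null
#    print _pvalue(3.0, nonparam.cdf, 'right')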


class NullDist(ClassWithCollections):
    """Base class for null-hypothesis testing.

    """

    # Although the base class does not benefit from states, derived
    # classes do (MCNullDist).  To avoid multiple inheritance and the
    # associated headache, let them all be ClassWithCollections; the
    # performance hit should be negligible in most scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameters:
          tail : str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._setTail(tail)

    def __repr__(self, prefixes=[]):
        return super(NullDist, self).__repr__(
            prefixes=["tail=%s" % `self.__tail`] + prefixes)

    def _setTail(self, tail):
        # sanity check
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError, 'Unknown value "%s" to `tail` argument.' \
                  % tail
        self.__tail = tail

    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError

    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError

    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined from the left, right, or any tail
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda x:x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since the resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class : class
            This can be any class which provides parameter estimates via
            a `fit()` method to initialize the instance, and provides a
            `cdf(x)` method for estimating the value of x in the CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations : int
            This many permutations of labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        self.__permutations = permutations
        """Number of permutations used to estimate the null distribution."""

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata : `Dataset`
            Dataset which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset`
            Dataset used for validation.  If provided, measure is assumed
            to be a `TransferError` and the working and validation
            datasets are passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # Needs to be imported here upon demand due to circular imports
        # TODO: place MC into a separate module
        from mvpa.clfs.base import DegenerateInputError, FailedToTrainError

        # decide on the arguments to measure
        if not vdata is None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        skipped = 0                     # number of skipped permutations
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutations, p+1), cr=True)

            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            try:
                dist_samples.append(measure(*measure_args))
            except (DegenerateInputError, FailedToTrainError), e:
                if __debug__:
                    debug('STATMC', " skipped", cr=True)
                warning("Failed to estimate %s on %s, due to %s. "
                        "Permutation %d skipped." %
                        (measure, measure_args, e, p))
                skipped += 1
                continue

        if __debug__:
            debug('STATMC', ' Skipped: %d permutations' % skipped)

        # restore original labels
        wdata.permuteLabels(False, perchunk=False)

        # store samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit distribution per each element

        # to decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if not self._dist:
            # XXX We might not want to discriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
        return N.array(cdfs).reshape(xshape)

    def clean(self):
        """Clean stored distributions.

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. clean() binds
        the distributions to an empty list to let the gc reclaim the memory.
        """
        self._dist = []
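

# Example (illustrative sketch, not part of the original module): a typical
# Monte-Carlo workflow.  `train` and `test` are hypothetical labeled
# `Dataset`s; any classifier wrapped into a `TransferError` would do.
#
#if True:
#    from mvpa.clfs.smlr import SMLR
#    from mvpa.clfs.transerror import TransferError
#    terr = TransferError(SMLR())
#    null = MCNullDist(permutations=100, tail='left')
#    # permute the training labels 100 times, computing the transfer
#    # error each time, to build the empirical null distribution
#    null.fit(terr, train, test)
#    # p-value of the actually observed error under that null
#    print null.p(terr(test, train))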


class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    0.5
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameters:
          dist : distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.
        """
        NullDist.__init__(self, **kwargs)

        self._dist = dist

    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)

    def __repr__(self, prefixes=[]):
        prefixes_ = ["dist=%s" % `self._dist`]
        return super(FixedNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """
    def fit(self, measure, wdata, vdata=None):
        """Cares about the dimensionality of the feature space in measure
        """

        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:              # XXX
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'): # frozen already
            dist_gen = dist_gen.dist     # rv_frozen at least has it ;)

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has a stability issue, just run
    # python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care of that
    # manually here
    def cdf(self, x):
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_) > 1)
        # XXX there might be a better implementation (faster/more elegant)
        # using N.clip, the only problem is that instability results
        # might flip the sign arbitrarily
        if len(bad_values[0]):
            # in this distribution we have the mean at 0, so we can take
            # that easily into account
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad < 0] = 0.0
            cdf_bad[x_bad >= 0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0/N.sqrt(nfeatures)), {}
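

# Example (illustrative sketch, not part of the original module): what the
# adaptation amounts to.  For a measure over nfeatures features,
# AdaptiveNormal ends up with a frozen N(0, 1/sqrt(nfeatures)) null --
# presumably matching the distribution of a mean of nfeatures
# unit-variance values.
#
#if True:
#    from scipy import stats
#    nfeatures = 100
#    adapted = stats.norm(0, 1.0/N.sqrt(nfeatures))
#    # right-tail p-value of an observed mean of 0.25 over 100 features
#    print 1 - adapted.cdf(0.25)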


if externals.exists('scipy'):
    from mvpa.support.stats import scipy
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking of the detection power for the values in
    the tail of a tentative distribution.

    """

    # We need a way to fixate estimation of some parameters
    # (e.g. mean), so let's create a simple proxy, or may be a class
    # generator later on, which would take care of penalizing deviation
    # from the 'right' arguments

    import scipy
    import scipy.optimize           # fmin is used by rv_semifrozen.fit

    class rv_semifrozen(object):
        """Helper proxy-class to fit a distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under the BSD license
        http://www.scipy.org/License_Compatibility
        """

        def __init__(self, dist, loc=None, scale=None, args=None):

            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            args_i = [i for i, v in enumerate(args) if v is None]
            self._fargs = (list(args+theta), args_i)
            """Arguments which should get some fixed value"""

        def __call__(self, *args, **kwargs):
            """Upon call mimic a call to get the actual rv_frozen distribution
            """
            return self._dist(*args, **kwargs)

        def nnlf(self, theta, x):
            # - sum (log pdf(x, theta), axis=0)
            # where theta are the parameters (including loc and scale)
            #
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args for those which were fixed
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if (N.any(cond0)):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,)*(self.numargs-Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None: x0 = x0 + (loc0,)
            if fargs[-1] is None: x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0,
                                        args=(N.ravel(data),), disp=0)

            # reconstruct back
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x

        def __setattr__(self, a, v):
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)

        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)
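
    # Example (illustrative sketch, not part of the original module):
    # fitting a normal distribution while keeping its location fixed at
    # 0, so only the scale gets optimized.
    #
    #if True:
    #    data = N.random.normal(loc=0.0, scale=3.0, size=500)
    #    semi = rv_semifrozen(scipy.stats.norm, loc=0.0)
    #    # returns (loc, scale); loc stays at the fixed 0.0
    #    print semi.fit(data)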

    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by minimal reported distance after estimating
               parameters of the distribution. Parameter `p=0.05` sets the
               threshold to reject the null-hypothesis that the distribution
               is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
               an incorrect kstest implementation and do not function
               properly
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. Entry 'scipy' adds all known
            in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
            data,
            distributions=['rdist',
                           ('rdist', {'name':'rdist_fixed',
                                      'loc': 0.0,
                                      'args': (10,)})],
            nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if not test in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # data sampled
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distribution parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # lets see if the 'scipy' entry was in there
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            sp_dists = scipy.stats.distributions.__all__
            sp_version = externals.versions['scipy']
            if sp_version >= '0.9.0':
                for d_ in ['ncf']:
                    if d_ in sp_dists:
                        warning("Not considering %s distribution because of "
                                "known issues in scipy %s" % (d_, sp_version))
                        _ = sp_dists.pop(sp_dists.index(d_))
            distributions += sp_dists
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError, \
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform actions which might puke for some distributions
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)
                # specify distribution 'optimizer'. If loc or scale was
                # provided, use the home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_,
                                             loc=loc_, scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # We need to compare detection under the given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0 \
                           / true_positives, 1
                    if __debug__: res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError,
                    NotImplementedError), e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [ (D, dist_gen, dist_name, dist_params) ]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s'
                          % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort in ascending order, so smaller is better
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' \
                      % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults-1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # return all the results
        return results


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot the best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to the plots of the parametrized
            distributions
          legend : int
            Whether to provide a legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds the distance measure.
            3 -- adds tp/fp/fn in the case that p is provided
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            in red. False positives and false negatives are marked as
            triangle-up and triangle-down symbols respectively
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's
        x = hist[1]
        dx = x[expand_tails] - x[0]     # how much to expand tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- "
                    "nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1: label += ' (D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under the given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn not by the centers of the bins but
            #       by the values in data in the ranges covered by those
            #       bins. Then it would correspond to the values mentioned
            #       in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitly -- marked by the color
                # of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)

#if True:
#    data = N.random.normal(size=(1000,1));
#    matches = matchDistribution(
#        data,
#        distributions=['scipy',
#                       ('norm', {'name':'norm_known',
#                                 'scale': 1.0,
#                                 'loc': 0.0})],
#        nsamples=30, test='p-roc', p=0.05)
#    P.figure(); plotDistributionMatches(data, matches, nbins=101,
#                                        p=0.05, legend=4, nbest=5)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` into an appropriate
    NullDist if needed.

    tail and other arguments are assumed to be the defaults as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)
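

# Example (illustrative sketch, not part of the original module): the
# dispatch logic of autoNullDist.  A frozen scipy distribution offers no
# fit() and gets wrapped into a FixedNullDist; a distribution generator
# (which has fit()) becomes the dist_class of an MCNullDist.
#
#if True:
#    from scipy import stats
#    print autoNullDist(stats.norm(0, 1))   # -> FixedNullDist
#    print autoNullDist(stats.norm)         # -> MCNullDist
#    print autoNullDist(None)               # -> None, passed through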


# if no scipy, we need nanmean
def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis


def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean."""
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis)*1.0/Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis)/factor
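

# Example (illustrative sketch, not part of the original module): nanmean
# averages only over the non-NaN entries along the chosen axis.
#
#if True:
#    a = N.array([1.0, 2.0, N.nan, 3.0])
#    print nanmean(a)       # -> 2.0, the NaN is ignored
#    print N.mean(a)        # -> nan, for comparison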