1
2
3
4
5
6
7
8
9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import ClassWithCollections, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
26 """
27
28 - def __init__(self, dist_samples, correction='clip'):
29 """
30 :Parameters:
31 dist_samples : ndarray
32 Samples to be used to assess the distribution.
33 correction : {'clip'} or None, optional
34 Determines the behavior when .cdf is queried. If None -- no
35 correction is made. If 'clip' -- values are clipped to lie
36 in the range [1/(N+2), (N+1)/(N+2)] (simply because
37 non-parametric assessment lacks the power to resolve with
38 higher precision in the tails, so 'imagery' samples are
39 placed in each of the two tails).
40 """
41 self._dist_samples = N.ravel(dist_samples)
42 self._correction = correction
43
45 return '%s(%r%s)' % (
46 self.__class__.__name__,
47 self._dist_samples,
48 ('', ', correction=%r' % self._correction)
49 [int(self._correction != 'clip')])
50
51 @staticmethod
52 - def fit(dist_samples):
54
55
57 """Returns the cdf value at `x`.
58 """
59 dist_samples = self._dist_samples
60 res = N.vectorize(lambda v:(dist_samples <= v).mean())(x)
61 if self._correction == 'clip':
62 nsamples = len(dist_samples)
63 N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
64 elif self._correction is None:
65 pass
66 else:
67 raise ValueError, \
68 '%r is incorrect value for correction parameter of %s' \
69 % (self._correction, self.__class__.__name__)
70 return res
71
72
73 -def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
74 """Helper function to return p-value(x) given cdf and tail
75
76 :Parameters:
77 cdf_func : callable
78 Function to be used to derive cdf values for x
79 tail : str ('left', 'right', 'any', 'both')
80 Which tail of the distribution to report. For 'any' and 'both'
81 it chooses the tail it belongs to based on the comparison to
82 p=0.5. In the case of 'any' significance is taken like in a
83 one-tailed test.
84 return_tails : bool
85 If True, a tuple return (pvalues, tails), where tails contain
86 1s if value was from the right tail, and 0 if the value was
87 from the left tail.
88 """
89 is_scalar = N.isscalar(x)
90 if is_scalar:
91 x = [x]
92
93 cdf = cdf_func(x)
94
95 if __debug__ and 'CHECK_STABILITY' in debug.active:
96 cdf_min, cdf_max = N.min(cdf), N.max(cdf)
97 if cdf_min < 0 or cdf_max > 1.0:
98 s = ('', ' for %s' % name)[int(name is not None)]
99 warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
100 (cdf_func, s, cdf_min, cdf_max))
101
102
103
104 cdf = N.clip(cdf, 0, 1.0)
105
106 if tail == 'left':
107 if return_tails:
108 right_tail = N.zeros(cdf.shape, dtype=bool)
109 elif tail == 'right':
110 cdf = 1 - cdf
111 if return_tails:
112 right_tail = N.ones(cdf.shape, dtype=bool)
113 elif tail in ('any', 'both'):
114 right_tail = (cdf >= 0.5)
115 cdf[right_tail] = 1.0 - cdf[right_tail]
116 if tail == 'both':
117
118 cdf *= 2
119
120
121 cdf[N.isnan(x)] = 1.0
122 if is_scalar: res = cdf[0]
123 else: res = cdf
124
125 if return_tails:
126 return (res, right_tail)
127 else:
128 return res
129
130
131 -class NullDist(ClassWithCollections):
132 """Base class for null-hypothesis testing.
133
134 """
135
136
137
138
139
140 _ATTRIBUTE_COLLECTIONS = ['states']
141
142 - def __init__(self, tail='both', **kwargs):
143 """Cheap initialization.
144
145 :Parameter:
146 tail: str ('left', 'right', 'any', 'both')
147 Which tail of the distribution to report. For 'any' and 'both'
148 it chooses the tail it belongs to based on the comparison to
149 p=0.5. In the case of 'any' significance is taken like in a
150 one-tailed test.
151 """
152 ClassWithCollections.__init__(self, **kwargs)
153
154 self._setTail(tail)
155
157 return super(NullDist, self).__repr__(
158 prefixes=["tail=%s" % `self.__tail`] + prefixes)
159
160
162
163 if tail not in ('left', 'right', 'any', 'both'):
164 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
165 % tail
166 self.__tail = tail
167
168
169 - def fit(self, measure, wdata, vdata=None):
170 """Implement to fit the distribution to the data."""
171 raise NotImplementedError
172
173
175 """Implementations return the value of the cumulative distribution
176 function (left or right tail dpending on the setting).
177 """
178 raise NotImplementedError
179
180
181 - def p(self, x, **kwargs):
182 """Returns the p-value for values of `x`.
183 Returned values are determined left, right, or from any tail
184 depending on the constructor setting.
185
186 In case a `FeaturewiseDatasetMeasure` was used to estimate the
187 distribution the method returns an array. In that case `x` can be
188 a scalar value or an array of a matching shape.
189 """
190 return _pvalue(x, self.cdf, self.__tail, **kwargs)
191
192
193 tail = property(fget=lambda x:x.__tail, fset=_setTail)
194
197 """Null-hypothesis distribution is estimated from randomly permuted data labels.
198
199 The distribution is estimated by calling fit() with an appropriate
200 `DatasetMeasure` or `TransferError` instance and a training and a
201 validation dataset (in case of a `TransferError`). For a customizable
202 amount of cycles the training data labels are permuted and the
203 corresponding measure computed. In case of a `TransferError` this is the
204 error when predicting the *correct* labels of the validation dataset.
205
206 The distribution can be queried using the `cdf()` method, which can be
207 configured to report probabilities/frequencies from `left` or `right` tail,
208 i.e. fraction of the distribution that is lower or larger than some
209 critical value.
210
211 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
212 returns an array of featurewise probabilities/frequencies.
213 """
214
215 _DEV_DOC = """
216 TODO automagically decide on the number of samples/permutations needed
217 Caution should be paid though since resultant distributions might be
218 quite far from some conventional ones (e.g. Normal) -- it is expected to
219 them to be bimodal (or actually multimodal) in many scenarios.
220 """
221
222 dist_samples = StateVariable(enabled=False,
223 doc='Samples obtained for each permutation')
224
226 """Initialize Monte-Carlo Permutation Null-hypothesis testing
227
228 :Parameters:
229 dist_class: class
230 This can be any class which provides parameters estimate
231 using `fit()` method to initialize the instance, and
232 provides `cdf(x)` method for estimating value of x in CDF.
233 All distributions from SciPy's 'stats' module can be used.
234 permutations: int
235 This many permutations of label will be performed to
236 determine the distribution under the null hypothesis.
237 """
238 NullDist.__init__(self, **kwargs)
239
240 self._dist_class = dist_class
241 self._dist = []
242
243 self.__permutations = permutations
244 """Number of permutations to compute the estimate the null
245 distribution."""
246
248 prefixes_ = ["permutations=%s" % self.__permutations]
249 if self._dist_class != Nonparametric:
250 prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
251 return super(MCNullDist, self).__repr__(
252 prefixes=prefixes_ + prefixes)
253
254
255 - def fit(self, measure, wdata, vdata=None):
256 """Fit the distribution by performing multiple cycles which repeatedly
257 permuted labels in the training dataset.
258
259 :Parameters:
260 measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
261 TransferError instance used to compute all errors.
262 wdata: `Dataset` which gets permuted and used to compute the
263 measure/transfer error multiple times.
264 vdata: `Dataset` used for validation.
265 If provided measure is assumed to be a `TransferError` and
266 working and validation dataset are passed onto it.
267 """
268 dist_samples = []
269 """Holds the values for randomized labels."""
270
271
272
273 from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
274
275
276 if not vdata is None:
277 measure_args = [vdata, wdata]
278 else:
279 measure_args = [wdata]
280
281
282 skipped = 0
283 for p in xrange(self.__permutations):
284
285
286
287 if __debug__:
288 debug('STATMC', "Doing %i permutations: %i" \
289 % (self.__permutations, p+1), cr=True)
290
291
292
293
294
295
296 wdata.permuteLabels(True, perchunk=False)
297
298
299
300 try:
301 dist_samples.append(measure(*measure_args))
302 except (DegenerateInputError, FailedToTrainError), e:
303 if __debug__:
304 debug('STATMC', " skipped", cr=True)
305 warning("Failed to estimate %s on %s, due to %s. "
306 "Permutation %d skipped." %
307 (measure, measure_args, e, p))
308 skipped += 1
309 continue
310
311 if __debug__:
312 debug('STATMC', ' Skipped: %d permutations' % skipped)
313
314
315 wdata.permuteLabels(False, perchunk=False)
316
317
318 self.dist_samples = dist_samples = N.asarray(dist_samples)
319
320
321
322
323 shape = dist_samples.shape
324 nshape = len(shape)
325
326
327 if nshape == 1:
328 dist_samples = dist_samples[:, N.newaxis]
329
330
331
332 dist_samples_rs = dist_samples.reshape((shape[0], -1))
333 dist = []
334 for samples in dist_samples_rs.T:
335 params = self._dist_class.fit(samples)
336 if __debug__ and 'STAT' in debug.active:
337 debug('STAT', 'Estimated parameters for the %s are %s'
338 % (self._dist_class, str(params)))
339 dist.append(self._dist_class(*params))
340 self._dist = dist
341
342
344 """Return value of the cumulative distribution function at `x`.
345 """
346 if self._dist is None:
347
348
349
350 raise RuntimeError, "Distribution has to be fit first"
351
352 is_scalar = N.isscalar(x)
353 if is_scalar:
354 x = [x]
355
356 x = N.asanyarray(x)
357 xshape = x.shape
358
359 x = x.reshape((-1,))
360
361 if len(self._dist) != len(x):
362 raise ValueError, 'Distribution was fit for structure with %d' \
363 ' elements, whenever now queried with %d elements' \
364 % (len(self._dist), len(x))
365
366
367 cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
368 return N.array(cdfs).reshape(xshape)
369
370
372 """Clean stored distributions
373
374 Storing all of the distributions might be too expensive
375 (e.g. in case of Nonparametric), and the scope of the object
376 might be too broad to wait for it to be destroyed. Clean would
377 bind dist_samples to empty list to let gc revoke the memory.
378 """
379 self._dist = []
380
384 """Proxy/Adaptor class for SciPy distributions.
385
386 All distributions from SciPy's 'stats' module can be used with this class.
387
388 >>> import numpy as N
389 >>> from scipy import stats
390 >>> from mvpa.clfs.stats import FixedNullDist
391 >>>
392 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
393 >>> dist.p(2)
394 0.5
395 >>>
396 >>> dist.cdf(N.arange(5))
397 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
398 >>>
399 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
400 >>> dist.p(N.arange(5))
401 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
402 """
404 """
405 :Parameter:
406 dist: distribution object
407 This can be any object the has a `cdf()` method to report the
408 cumulative distribition function values.
409 """
410 NullDist.__init__(self, **kwargs)
411
412 self._dist = dist
413
414
415 - def fit(self, measure, wdata, vdata=None):
416 """Does nothing since the distribution is already fixed."""
417 pass
418
419
421 """Return value of the cumulative distribution function at `x`.
422 """
423 return self._dist.cdf(x)
424
425
427 prefixes_ = ["dist=%s" % `self._dist`]
428 return super(FixedNullDist, self).__repr__(
429 prefixes=prefixes_ + prefixes)
430
433 """Adaptive distribution which adjusts parameters according to the data
434
435 WiP: internal implementation might change
436 """
437 - def fit(self, measure, wdata, vdata=None):
438 """Cares about dimensionality of the feature space in measure
439 """
440
441 try:
442 nfeatures = len(measure.feature_ids)
443 except ValueError:
444 nfeatures = N.prod(wdata.shape[1:])
445
446 dist_gen = self._dist
447 if not hasattr(dist_gen, 'fit'):
448 dist_gen = dist_gen.dist
449
450 args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
451 if __debug__:
452 debug('STAT', 'Adapted parameters for %s to be %s, %s'
453 % (dist_gen, args, kwargs))
454 self._dist = dist_gen(*args, **kwargs)
455
456
457 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
458 raise NotImplementedError
459
462 """Adaptive rdist: params are (nfeatures-1, 0, 1)
463 """
464
465 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
467
468
469
470
471
473 cdf_ = self._dist.cdf(x)
474 bad_values = N.where(N.abs(cdf_)>1)
475
476
477
478 if len(bad_values[0]):
479
480
481 cdf_bad = cdf_[bad_values]
482 x_bad = x[bad_values]
483 cdf_bad[x_bad<0] = 0.0
484 cdf_bad[x_bad>=0] = 1.0
485 cdf_[bad_values] = cdf_bad
486 return cdf_
487
490 """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
491 """
492
493 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
495
496
497 if externals.exists('scipy'):
498 from mvpa.support.stats import scipy
499 from scipy.stats import kstest
500 """
501 Thoughts:
502
503 So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
504 check/reject H0 that samples come from a given distribution. But
505 since it is based on a full range of data, we might better of with
506 some ad-hoc checking by the detection power of the values in the
507 tail of a tentative distribution.
508
509 """
510
511
512
513
514
515
516 import scipy
519 """Helper proxy-class to fit distribution when some parameters are known
520
521 It is an ugly hack with snippets of code taken from scipy, which is
522 Copyright (c) 2001, 2002 Enthought, Inc.
523 and is distributed under BSD license
524 http://www.scipy.org/License_Compatibility
525 """
526
527 - def __init__(self, dist, loc=None, scale=None, args=None):
528
529 self._dist = dist
530
531 theta = (loc, scale)
532
533 Narg_ = dist.numargs
534 if args is not None:
535 Narg = len(args)
536 if Narg > Narg_:
537 raise ValueError, \
538 'Distribution %s has only %d arguments. Got %d' \
539 % (dist, Narg_, Narg)
540 args += (None,) * (Narg_ - Narg)
541 else:
542 args = (None,) * Narg_
543
544 args_i = [i for i,v in enumerate(args) if v is None]
545 self._fargs = (list(args+theta), args_i)
546 """Arguments which should get some fixed value"""
547
548
550 """Upon call mimic call to get actual rv_frozen distribution
551 """
552 return self._dist(*args, **kwargs)
553
554
555 - def nnlf(self, theta, x):
556
557
558
559 fargs, fargs_i = self._fargs
560 try:
561 i=-1
562 if fargs[-1] is not None:
563 scale = fargs[-1]
564 else:
565 scale = theta[i]
566 i -= 1
567
568 if fargs[-2] is not None:
569 loc = fargs[-2]
570 else:
571 loc = theta[i]
572 i -= 1
573
574 args = theta[:i+1]
575
576 for i,a in zip(fargs_i, args):
577 fargs[i] = a
578 args = fargs[:-2]
579
580 except IndexError:
581 raise ValueError, "Not enough input arguments."
582 if not self._argcheck(*args) or scale <= 0:
583 return N.inf
584 x = N.asarray((x-loc) / scale)
585 cond0 = (x <= self.a) | (x >= self.b)
586 if (N.any(cond0)):
587 return N.inf
588 else:
589 return self._nnlf(x, *args) + len(x)*N.log(scale)
590
591 - def fit(self, data, *args, **kwds):
592 loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
593 fargs, fargs_i = self._fargs
594 Narg = len(args)
595 Narg_ = self.numargs
596 if Narg != Narg_:
597 if Narg > Narg_:
598 raise ValueError, "Too many input arguments."
599 else:
600 args += (1.0,)*(self.numargs-Narg)
601
602
603
604 if len(fargs_i) != Narg_:
605 x0 = tuple([args[i] for i in fargs_i])
606 else:
607 x0 = args
608 if fargs[-2] is None: x0 = x0 + (loc0,)
609 if fargs[-1] is None: x0 = x0 + (scale0,)
610
611 opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)
612
613
614 i = 0
615 loc, scale = fargs[-2:]
616 if fargs[-1] is None:
617 i -= 1
618 scale = opt_x[i]
619 if fargs[-2] is None:
620 i -= 1
621 loc = opt_x[i]
622
623
624 for i in fargs_i:
625 fargs[i] = opt_x[i]
626
627
628 opt_x = N.hstack((fargs[:-2], (loc, scale)))
629 return opt_x
630
631
633 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
634 self._dist.__setattr__(a, v)
635 else:
636 object.__setattr__(self, a, v)
637
638
640 """We need to redirect all queries correspondingly
641 """
642 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
643 return getattr(self._dist, a)
644 else:
645 return object.__getattribute__(self, a)
646
647
648
649 - def matchDistribution(data, nsamples=None, loc=None, scale=None,
650 args=None, test='kstest', distributions=None,
651 **kwargs):
652 """Determine best matching distribution.
653
654 Can be used for 'smelling' the data, as well to choose a
655 parametric distribution for data obtained from non-parametric
656 testing (e.g. `MCNullDist`).
657
658 WiP: use with caution, API might change
659
660 :Parameters:
661 data : N.ndarray
662 Array of the data for which to deduce the distribution. It has
663 to be sufficiently large to make a reliable conclusion
664 nsamples : int or None
665 If None -- use all samples in data to estimate parametric
666 distribution. Otherwise use only specified number randomly selected
667 from data.
668 loc : float or None
669 Loc for the distribution (if known)
670 scale : float or None
671 Scale for the distribution (if known)
672 test : basestring
673 What kind of testing to do. Choices:
674 'p-roc' : detection power for a given ROC. Needs two
675 parameters: `p=0.05` and `tail='both'`
676 'kstest' : 'full-body' distribution comparison. The best
677 choice is made by minimal reported distance after estimating
678 parameters of the distribution. Parameter `p=0.05` sets
679 threshold to reject null-hypothesis that distribution is the
680 same.
681 WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
682 incorrect kstest implementation and do not function
683 properly
684 distributions : None or list of basestring or tuple(basestring, dict)
685 Distributions to check. If None, all known in scipy.stats
686 are tested. If distribution is specified as a tuple, then
687 it must contain name and additional parameters (name, loc,
688 scale, args) in the dictionary. Entry 'scipy' adds all known
689 in scipy.stats.
690 **kwargs
691 Additional arguments which are needed for each particular test
692 (see above)
693
694 :Example:
695 data = N.random.normal(size=(1000,1));
696 matches = matchDistribution(
697 data,
698 distributions=['rdist',
699 ('rdist', {'name':'rdist_fixed',
700 'loc': 0.0,
701 'args': (10,)})],
702 nsamples=30, test='p-roc', p=0.05)
703 """
704
705
706 _KNOWN_TESTS = ['p-roc', 'kstest']
707 if not test in _KNOWN_TESTS:
708 raise ValueError, 'Unknown kind of test %s. Known are %s' \
709 % (test, _KNOWN_TESTS)
710
711 data = N.ravel(data)
712
713 if nsamples is not None:
714 if __debug__:
715 debug('STAT', 'Sampling %d samples from data for the ' \
716 'estimation of the distributions parameters' % nsamples)
717 indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
718 data_selected = data[indexes_selected]
719 else:
720 indexes_selected = N.arange(len(data))
721 data_selected = data
722
723 p_thr = kwargs.get('p', 0.05)
724 if test == 'p-roc':
725 tail = kwargs.get('tail', 'both')
726 data_p = _pvalue(data, Nonparametric(data).cdf, tail)
727 data_p_thr = N.abs(data_p) <= p_thr
728 true_positives = N.sum(data_p_thr)
729 if true_positives == 0:
730 raise ValueError, "Provided data has no elements in non-" \
731 "parametric distribution under p<=%.3f. Please " \
732 "increase the size of data or value of p" % p
733 if __debug__:
734 debug('STAT_', 'Number of positives in non-parametric '
735 'distribution is %d' % true_positives)
736
737 if distributions is None:
738 distributions = ['scipy']
739
740
741 try:
742 scipy_ind = distributions.index('scipy')
743 distributions.pop(scipy_ind)
744 sp_dists = scipy.stats.distributions.__all__
745 sp_version = externals.versions['scipy']
746 if sp_version >= '0.9.0':
747 for d_ in ['ncf']:
748 if d_ in sp_dists:
749 warning("Not considering %s distribution because of "
750 "known issues in scipy %s" % (d_, sp_version))
751 _ = sp_dists.pop(sp_dists.index(d_))
752 distributions += sp_dists
753 except ValueError:
754 pass
755
756 results = []
757 for d in distributions:
758 dist_gen, loc_, scale_, args_ = None, loc, scale, args
759 if isinstance(d, basestring):
760 dist_gen = d
761 dist_name = d
762 elif isinstance(d, tuple):
763 if not (len(d)==2 and isinstance(d[1], dict)):
764 raise ValueError,\
765 "Tuple specification of distribution must be " \
766 "(d, {params}). Got %s" % (d,)
767 dist_gen = d[0]
768 loc_ = d[1].get('loc', loc)
769 scale_ = d[1].get('scale', scale)
770 args_ = d[1].get('args', args)
771 dist_name = d[1].get('name', str(dist_gen))
772 else:
773 dist_gen = d
774 dist_name = str(d)
775
776
777 try:
778 dist_gen_ = getattr(scipy.stats, dist_gen)
779
780
781 if args_ is not None or loc_ is not None or scale_ is not None:
782 dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
783 else:
784 dist_opt = dist_gen_
785 dist_params = dist_opt.fit(data_selected)
786 if __debug__:
787 debug('STAT__',
788 'Got distribution parameters %s for %s'
789 % (dist_params, dist_name))
790 if test == 'p-roc':
791 cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
792
793 cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
794 cdf_p_thr = cdf_p <= p_thr
795 D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
796 if __debug__: res_sum = 'D=%.2f' % D
797 elif test == 'kstest':
798 D, p = kstest(data, d, args=dist_params)
799 if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
800 except (TypeError, ValueError, AttributeError,
801 NotImplementedError), e:
802 if __debug__:
803 debug('STAT__',
804 'Testing for %s distribution failed due to %s'
805 % (d, str(e)))
806 continue
807
808 if p > p_thr and not N.isnan(D):
809 results += [ (D, dist_gen, dist_name, dist_params) ]
810 if __debug__:
811 debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
812 else:
813 if __debug__:
814 debug('STAT__', 'Cannot consider %s dist. with %s'
815 % (d, res_sum))
816 continue
817
818
819 results.sort()
820
821 if __debug__ and 'STAT' in debug.active:
822
823 nresults = len(results)
824 sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
825 if nresults>0:
826 nnextbest = min(2, nresults-1)
827 snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
828 debug('STAT', 'Best distribution %s. Next best: %s'
829 % (sresult(results[0]), snextbest))
830 else:
831 debug('STAT', 'Could not find suitable distribution')
832
833
834 return results
835
836
837 if externals.exists('pylab'):
838 import pylab as P
839
840 - def plotDistributionMatches(data, matches, nbins=31, nbest=5,
841 expand_tails=8, legend=2, plot_cdf=True,
842 p=None, tail='both'):
843 """Plot best matching distributions
844
845 :Parameters:
846 data : N.ndarray
847 Data which was used to obtain the matches
848 matches : list of tuples
849 Sorted matches as provided by matchDistribution
850 nbins : int
851 Number of bins in the histogram
852 nbest : int
853 Number of top matches to plot
854 expand_tails : int
855 How many bins away to add to parametrized distributions
856 plots
857 legend : int
858 Either to provide legend and statistics in the legend.
859 1 -- just lists distributions.
860 2 -- adds distance measure
861 3 -- tp/fp/fn in the case if p is provided
862 plot_cdf : bool
863 Either to plot cdf for data using non-parametric distribution
864 p : float or None
865 If not None, visualize null-hypothesis testing (given p).
866 Bars in the histogram which fall under given p are colored
867 in red. False positives and false negatives are marked as
868 triangle up and down symbols correspondingly
869 tail : ('left', 'right', 'any', 'both')
870 If p is not None, the choise of tail for null-hypothesis
871 testing
872
873 :Returns: tuple(histogram, list of lines)
874 """
875
876 hist = P.hist(data, nbins, normed=1, align='center')
877 data_range = [N.min(data), N.max(data)]
878
879
880 x = hist[1]
881 dx = x[expand_tails] - x[0]
882 x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))
883
884 nonparam = Nonparametric(data)
885
886 if plot_cdf:
887 P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)
888
889 p_thr = p
890
891 data_p = _pvalue(data, nonparam.cdf, tail)
892 data_p_thr = (data_p <= p_thr).ravel()
893
894 x_p = _pvalue(x, Nonparametric(data).cdf, tail)
895 x_p_thr = N.abs(x_p) <= p_thr
896
897 for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
898 bar.set_facecolor(('w','r')[int(thr)])
899
900 if not len(matches):
901
902 warning("No matching distributions were provided -- nothing to plot")
903 return (hist, )
904
905 lines = []
906 labels = []
907 for i in xrange(min(nbest, len(matches))):
908 D, dist_gen, dist_name, params = matches[i]
909 dist = getattr(scipy.stats, dist_gen)(*params)
910
911 label = '%s' % (dist_name)
912 if legend > 1: label += '(D=%.2f)' % (D)
913
914 xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
915 xcdf_p_thr = (xcdf_p <= p_thr).ravel()
916
917 if p is not None and legend > 2:
918
919 data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
920 data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()
921
922
923 tp = N.logical_and(data_cdf_p_thr, data_p_thr)
924
925 fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
926
927 fn = N.logical_and(~data_cdf_p_thr, data_p_thr)
928
929 label += ' tp/fp/fn=%d/%d/%d)' % \
930 tuple(map(N.sum, [tp,fp,fn]))
931
932 pdf = dist.pdf(x)
933 line = P.plot(x, pdf, '-', linewidth=2, label=label)
934 color = line[0].get_color()
935
936 if plot_cdf:
937 cdf = dist.cdf(x)
938 P.plot(x, cdf, ':', linewidth=1, color=color, label=label)
939
940
941
942
943
944 if p is not None:
945
946 xtp = N.logical_and(xcdf_p_thr, x_p_thr)
947
948 xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
949
950 xfn = N.logical_and(~xcdf_p_thr, x_p_thr)
951
952
953
954 P.plot(x[xfp], pdf[xfp], '^', color=color)
955 P.plot(x[xfn], pdf[xfn], 'v', color=color)
956
957 lines.append(line)
958 labels.append(label)
959
960 if legend:
961 P.legend(lines, labels)
962
963 return (hist, lines)
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978 -def autoNullDist(dist):
979 """Cheater for human beings -- wraps `dist` if needed with some
980 NullDist
981
982 tail and other arguments are assumed to be default as in
983 NullDist/MCNullDist
984 """
985 if dist is None or isinstance(dist, NullDist):
986 return dist
987 elif hasattr(dist, 'fit'):
988 if __debug__:
989 debug('STAT', 'Wrapping %s into MCNullDist' % dist)
990 return MCNullDist(dist)
991 else:
992 if __debug__:
993 debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
994 return FixedNullDist(dist)
995
999 if axis is None:
1000 a = N.ravel(a)
1001 outaxis = 0
1002 else:
1003 a = N.asarray(a)
1004 outaxis = axis
1005 return a, outaxis
1006
1008 """Compute the mean over the given axis ignoring nans.
1009
1010 :Parameters:
1011 x : ndarray
1012 input array
1013 axis : int
1014 axis along which the mean is computed.
1015
1016 :Results:
1017 m : float
1018 the mean."""
1019 x, axis = _chk_asarray(x,axis)
1020 x = x.copy()
1021 Norig = x.shape[axis]
1022 factor = 1.0-N.sum(N.isnan(x),axis)*1.0/Norig
1023
1024 x[N.isnan(x)] = 0
1025 return N.mean(x,axis)/factor
1026