  9  """Base class for data measures: algorithms that quantify properties of 
 10  datasets. 
 11   
 12  Besides the `DatasetMeasure` base class this module also provides the 
 13  (abstract) `FeaturewiseDatasetMeasure` class. The difference between a general 
 14  measure and the output of the `FeaturewiseDatasetMeasure` is that the latter 
 15  returns a 1d map (one value per feature in the dataset). In contrast there are 
 16  no restrictions on the returned value of `DatasetMeasure` except for that it 
 17  has to be in some iterable container. 
 18   
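A minimal usage sketch (`MyMeasure` and `ds` are hypothetical stand-ins for
a concrete measure subclass and a dataset; numpy is imported as `N` below)::

  measure = MyMeasure(transformer=N.abs)
  result = measure(ds)
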
 19  """ 
 20   
 21  __docformat__ = 'restructuredtext' 
 22   
 23  import numpy as N 
 24  import mvpa.support.copy as copy 
 25   
 26  from mvpa.misc.state import StateVariable, ClassWithCollections 
 27  from mvpa.misc.args import group_kwargs 
 28  from mvpa.misc.transformers import FirstAxisMean, SecondAxisSumOfAbs 
 29  from mvpa.base.dochelpers import enhancedDocString 
 30  from mvpa.base import externals, warning 
 31  from mvpa.clfs.stats import autoNullDist 
 32   
 33  if __debug__: 
 34      from mvpa.base import debug 
 38      """A measure computed from a `Dataset` 
 39   
 40      All dataset measures support arbitrary transformation of the measure 
 41      after it has been computed. Transformation are done by processing the 
 42      measure with a functor that is specified via the `transformer` keyword 
 43      argument of the constructor. Upon request, the raw measure (before 
 44      transformations are applied) is stored in the `raw_results` state variable. 
 45   
 46      Additionally all dataset measures support the estimation of the 
 47      probabilit(y,ies) of a measure under some distribution. Typically this will 
 48      be the NULL distribution (no signal), that can be estimated with 
 49      permutation tests. If a distribution estimator instance is passed to the 
 50      `null_dist` keyword argument of the constructor the respective 
 51      probabilities are automatically computed and stored in the `null_prob` 
 52      state variable. 
 53   
 54      .. note:: 
 55        For developers: All subclasses shall get all necessary parameters via 
 56        their constructor, so it is possible to get the same type of measure for 
 57        multiple datasets by passing them to the __call__() method successively. 
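
    Example (a sketch; `MyMeasure` and `ds` stand in for a concrete subclass
    and a dataset, and `MCNullDist` is the Monte-Carlo estimator from
    `mvpa.clfs.stats`)::

      from mvpa.clfs.stats import MCNullDist
      measure = MyMeasure(transformer=N.abs,
                          null_dist=MCNullDist(permutations=100))
      result = measure(ds)
      p = measure.states.null_prob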
 58      """ 
 59   
 60      raw_results = StateVariable(enabled=False, 
 61          doc="Computed results before applying any " + 
 62              "transformation algorithm") 
 63      null_prob = StateVariable(enabled=True) 
 64      """Stores the probability of a measure under the NULL hypothesis""" 
 65      null_t = StateVariable(enabled=False) 
 66      """Stores the t-score corresponding to null_prob under assumption 
 67      of Normal distribution""" 

    def __init__(self, transformer=None, null_dist=None, **kwargs):
        """Does nothing special.

        :Parameters:
          transformer: Functor
            This functor is called in `__call__()` to perform a final
            processing step on the dataset measure before it is returned.
            If None, nothing is called.
          null_dist: instance of distribution estimator
            The estimated distribution is used to assign a probability for a
            certain value of the computed measure.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self.__transformer = transformer
        """Functor to be called in return statement of all subclass __call__()
        methods."""
        null_dist_ = autoNullDist(null_dist)
        if __debug__:
            debug('SA', 'Assigning null_dist %s whenever original given was %s'
                  % (null_dist_, null_dist))
        self.__null_dist = null_dist_


    __doc__ = enhancedDocString('DatasetMeasure', locals(), ClassWithCollections)


    def __call__(self, dataset):
        """Compute measure on a given `Dataset`.

        Each implementation has to handle a single argument: the source
        dataset.

        Returns the computed measure in some iterable (list-like)
        container, applying the transformer if one is defined.
        """
        result = self._call(dataset)
        result = self._postcall(dataset, result)
        return result


    def _call(self, dataset):
        """Actually compute measure on a given `Dataset`.

        Each implementation has to handle a single argument: the source
        dataset.

        Returns the computed measure in some iterable (list-like) container.
        """
        raise NotImplementedError


    def _postcall(self, dataset, result):
        """Some postprocessing on the result
        """
        self.states.raw_results = result
        if not self.__transformer is None:
            if __debug__:
                debug("SA_", "Applying transformer %s" % self.__transformer)
            result = self.__transformer(result)

        if not self.__null_dist is None:
            if __debug__:
                debug("SA_", "Estimating NULL distribution using %s"
                      % self.__null_dist)

            # we need a matching measure instance, but we have to disable
            # the estimation of the null distribution in that child to
            # prevent infinite looping
            measure = copy.copy(self)
            measure.__null_dist = None
            self.__null_dist.fit(measure, dataset)

            if self.states.isEnabled('null_t'):
                # get the probability under the NULL hypothesis, and for
                # each value whether it falls into the right tail
                null_prob, null_right_tail = \
                           self.__null_dist.p(result, return_tails=True)
                self.states.null_prob = null_prob

                externals.exists('scipy', raiseException=True)
                from scipy.stats import norm

                # TODO: the tail handling below should migrate into
                #       the NullDist classes
                tail = self.null_dist.tail
                if tail == 'left':
                    acdf = N.abs(null_prob)
                elif tail == 'right':
                    acdf = 1.0 - N.abs(null_prob)
                elif tail in ['any', 'both']:
                    acdf = 1.0 - N.clip(N.abs(null_prob), 0, 0.5)
                else:
                    raise RuntimeError, 'Unhandled tail %s' % tail
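                # Worked example (illustrative): with tail='right' and
                # null_prob=0.03 we get acdf=0.97, and norm.ppf(0.97)
                # yields a null_t of about +1.88; the sign flip below
                # handles results falling into the left tail.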
                # clip to avoid infinities from norm.ppf at exactly 0 or 1
                clip = 1e-16
                null_t = norm.ppf(N.clip(acdf, clip, 1.0 - clip))
                # assure that we deal with an array
                null_t = N.array(null_t, ndmin=1, copy=False)
                null_t[~null_right_tail] *= -1.0  # revert sign for left tail
                self.states.null_t = null_t
            else:
                # no t-score requested -- just store the p-value of the
                # result under the estimated NULL distribution
                self.states.null_prob = self.__null_dist.p(result)

        return result


    def __repr__(self, prefixes=[]):
        """String representation of DatasetMeasure

        Includes only arguments which differ from default ones
        """
        prefixes = prefixes[:]
        if self.__transformer is not None:
            prefixes.append("transformer=%s" % self.__transformer)
        if self.__null_dist is not None:
            prefixes.append("null_dist=%s" % self.__null_dist)
        return super(DatasetMeasure, self).__repr__(prefixes=prefixes)


    def untrain(self):
        """'Untraining' Measure

        Some derived classes might use classifiers, so we need to
        untrain those
        """
        pass


    @property
    def null_dist(self):
        """Return Null Distribution estimator"""
        return self.__null_dist

    @property
    def transformer(self):
        """Return the transformer"""
        return self.__transformer

217      """A per-feature-measure computed from a `Dataset` (base class). 
218   
219      Should behave like a DatasetMeasure. 
220      """ 
221   
222      base_sensitivities = StateVariable(enabled=False, 
223          doc="Stores basic sensitivities if the sensitivity " + 
224              "relies on combining multiple ones") 

    def __init__(self, combiner=SecondAxisSumOfAbs, **kwargs):
        """Initialize

        :Parameters:
          combiner : Functor
            The combiner is only applied if the computed featurewise dataset
            measure is more than one-dimensional. This is different from a
            `transformer`, which is always applied. By default, the sum of
            absolute values along the second axis is computed.
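
        For illustration, the default combiner reduces a 2d result like
        this (a sketch of the equivalent numpy operation)::

          result = N.array([[1, -2], [3, 4]])
          combined = N.abs(result).sum(axis=1)   # -> array([3, 7])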
        """
        DatasetMeasure.__init__(self, **kwargs)

        self.__combiner = combiner


    def _call(self, dataset):
        """Computes a per-feature-measure on a given `Dataset`.

        Behaves like a `DatasetMeasure`, but computes and returns a 1d ndarray
        with one value per feature.
        """
        raise NotImplementedError


    def _postcall(self, dataset, result):
        """Adjusts per-feature-measure for computed `result`

        TODO: overlaps in what it does heavily with
         CombinedSensitivityAnalyzer, thus this one might make use of
         CombinedSensitivityAnalyzer yoh thinks, and here
         base_sensitivities doesn't sound appropriate.
         MH: There is indeed some overlap, but also significant differences.
             This one operates on a single sensana and combines over second
             axis, CombinedFeaturewiseDatasetMeasure uses first axis.
             Additionally, 'Sensitivity' base class is
             FeaturewiseDatasetMeasures which would have to be changed to
             CombinedFeaturewiseDatasetMeasure to deal with stuff like
             SMLRWeights that return multiple sensitivity values by default.
             Not sure if unification of both (and/or removal of functionality
             here) does not lead to an overall more complicated situation,
             without any real gain -- after all this one works ;-)
        """
        # assure an ndarray and figure out whether multiple sets of
        # sensitivities were computed
        result = N.atleast_1d(result)
        result_sq = result.squeeze()
        # make sure a single value wasn't squeezed into a 0d array
        result_sq = N.atleast_1d(result_sq)

        if len(result_sq.shape) > 1:
            n_base = result.shape[1]
            """Number of base sensitivities"""
            if self.states.isEnabled('base_sensitivities'):
                b_sensitivities = []
                if not self.states.isKnown('biases'):
                    biases = None
                else:
                    biases = self.states.biases
                    if len(self.states.biases) != n_base:
                        warning("Number of biases %d differs from number "
                                "of base sensitivities %d which could happen "
                                "when measure is collapsed across labels."
                                % (len(self.states.biases), n_base))
                for i in xrange(n_base):
                    if not biases is None:
                        if n_base > 1 and len(biases) == 1:
                            # the same bias is shared by all bases
                            bias = biases[0]
                        else:
                            bias = biases[i]
                    else:
                        bias = None
                    b_sensitivities.append(StaticDatasetMeasure(
                        measure=result[:, i],
                        bias=bias))
                self.states.base_sensitivities = b_sensitivities

            # after storing the base sensitivities, combine them into a
            # single 1d map (if a combiner was provided)
            if self.__combiner is not None:
                result = self.__combiner(result)
        else:
            # result is already a 1d map -- use the squeezed version to
            # remove bogus dimensions
            result = result_sq

        # call base class postcall to apply transformer and NULL
        # distribution estimation
        result = DatasetMeasure._postcall(self, dataset, result)

        return result

    @property
    def combiner(self):
        """Return combiner"""
        return self.__combiner

348      """A static (assigned) sensitivity measure. 
349   
350      Since implementation is generic it might be per feature or 
351      per whole dataset 
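
    A sketch (`ds` may be any dataset -- it is ignored, since the measure
    is precomputed)::

      sens = StaticDatasetMeasure(measure=N.array([0.5, 1.5]))
      sens(ds)    # -> array([ 0.5,  1.5])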
    """

    def __init__(self, measure=None, bias=None, *args, **kwargs):
        """Initialize.

        :Parameters:
          measure
             actual sensitivity to be returned
          bias
             optionally available bias
        """
        DatasetMeasure.__init__(self, *args, **kwargs)
        if measure is None:
            raise ValueError, "Sensitivity measure has to be provided"
        self.__measure = measure
        self.__bias = bias

    def _call(self, dataset):
        """Returns assigned sensitivity
        """
        return self.__measure

    bias = property(fget=lambda self: self.__bias)


class Sensitivity(FeaturewiseDatasetMeasure):

    _LEGAL_CLFS = []
    """If Sensitivity is classifier specific, the classes of legal
    classifiers should be listed here
    """

    def __init__(self, clf, force_training=True, **kwargs):
        """Initialize the analyzer with the classifier it shall use.

        :Parameters:
          clf : :class:`Classifier`
            classifier to use.
          force_training : Bool
            whether to train the classifier on the dataset even if it was
            trained already
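
        A typical usage sketch (`clf` being some classifier and `ds` a
        dataset; concrete analyzers are usually obtained from the classifier
        itself via `getSensitivityAnalyzer`)::

          sana = clf.getSensitivityAnalyzer(force_training=False)
          sens_map = sana(ds)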
        """
        FeaturewiseDatasetMeasure.__init__(self, **kwargs)

        _LEGAL_CLFS = self._LEGAL_CLFS
        if len(_LEGAL_CLFS) > 0:
            found = False
            for clf_class in _LEGAL_CLFS:
                if isinstance(clf, clf_class):
                    found = True
                    break
            if not found:
                raise ValueError, \
                  "Classifier %s has to be of allowed class (%s), but is %s" \
                              % (clf, _LEGAL_CLFS, `type(clf)`)

        self.__clf = clf
        """Classifier used to compute sensitivity"""

        self._force_training = force_training
        """Whether to force training of the classifier"""

    def __repr__(self, prefixes=None):
        if prefixes is None:
            prefixes = []
        prefixes.append("clf=%s" % repr(self.clf))
        if not self._force_training:
            prefixes.append("force_training=%s" % self._force_training)
        return super(Sensitivity, self).__repr__(prefixes=prefixes)


    def __call__(self, dataset=None):
        """Train classifier on `dataset` and then compute actual sensitivity.

        If the classifier is already trained it is possible to extract the
        sensitivities without passing a dataset.
        """
        # local bindings
        clf = self.__clf
        if not clf.trained or self._force_training:
            if dataset is None:
                raise ValueError, \
                      "Training classifier to compute sensitivities requires " \
                      "a dataset."
            if __debug__:
                debug("SA", "Training classifier %s %s" %
                      (`clf`,
                       {False: "since it wasn't yet trained",
                        True:  "although it was trained previously"}
                       [clf.trained]))
            clf.train(dataset)

        return FeaturewiseDatasetMeasure.__call__(self, dataset)


    def _setClassifier(self, clf):
        self.__clf = clf


    def untrain(self):
        """Untrain corresponding classifier for Sensitivity
        """
        if self.__clf is not None:
            self.__clf.untrain()

    @property
    def feature_ids(self):
        """Return feature_ids used by the underlying classifier
        """
        return self.__clf._getFeatureIds()


    clf = property(fget=lambda self: self.__clf,
                   fset=_setClassifier)

476      """Set sensitivity analyzers to be merged into a single output""" 
477   
478      sensitivities = StateVariable(enabled=False, 
479          doc="Sensitivities produced by each analyzer") 
480   
481       
482       
483       
    def __init__(self, analyzers=None,
                 combiner=None,
                 **kwargs):
        """Initialize CombinedFeaturewiseDatasetMeasure

        :Parameters:
          analyzers : list or None
            List of analyzers to be used. There is no logic to populate
            such a list in __call__, so it must be either provided to
            the constructor or assigned to .analyzers prior to calling
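
        A sketch (`m1` and `m2` standing in for two featurewise measures and
        `ds` for a dataset; `FirstAxisMean` is imported at the top of this
        module)::

          cm = CombinedFeaturewiseDatasetMeasure(analyzers=[m1, m2],
                                                 combiner=FirstAxisMean)
          combined_map = cm(ds)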
        """
        if analyzers is None:
            analyzers = []

        FeaturewiseDatasetMeasure.__init__(self, **kwargs)
        self.__analyzers = analyzers
        """List of analyzers to use"""

        self.__combiner = combiner
        """Which functor to use to combine all sensitivities"""


    def _call(self, dataset):
        """Run all analyzers on `dataset` and combine their sensitivities."""
        sensitivities = []
        for ind, analyzer in enumerate(self.__analyzers):
            if __debug__:
                debug("SA", "Computing sensitivity %d: %s" % (ind, analyzer))
            sensitivity = analyzer(dataset)
            sensitivities.append(sensitivity)

        self.states.sensitivities = sensitivities
        if self.__combiner is not None:
            sensitivities = self.__combiner(sensitivities)
        return sensitivities

530          """Untrain CombinedFDM 
531          """ 
532          if self.__analyzers is not None: 
533              for anal in self.__analyzers: 
534                  anal.untrain() 
 535   
537          """Set the analyzers 
538          """ 
539          self.__analyzers = analyzers 
540          """Analyzers to use""" 
 541   
542      analyzers = property(fget=lambda x:x.__analyzers, 
543                           fset=_setAnalyzers, 
544                           doc="Used analyzers") 
 545   
552      """Compute measures across splits for a specific analyzer""" 
553   
554       
555       
556   
557      sensitivities = StateVariable(enabled=False, 
558          doc="Sensitivities produced for each split") 
559   
560      splits = StateVariable(enabled=False, doc= 
561         """Store the actual splits of the data. Can be memory expensive""") 
562   
    def __init__(self, splitter, analyzer,
                 insplit_index=0, combiner=None, **kwargs):
        """Initialize SplitFeaturewiseDatasetMeasure

        :Parameters:
          splitter : Splitter
            Splitter to use to split the dataset
          analyzer : DatasetMeasure
            Measure to be used. Could be analyzer as well (XXX)
          insplit_index : int
            splitter generates tuples of dataset on each iteration
            (usually 0th for training, 1st for testing).
            On what split index in that tuple to operate.
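
        A sketch (`splitter` being e.g. an NFoldSplitter instance,
        `fwmeasure` some featurewise measure and `ds` a dataset; all three
        are assumed here)::

          sm = SplitFeaturewiseDatasetMeasure(splitter, fwmeasure)
          sens_per_split = sm(ds)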
        """

        # base combiner is explicitly disabled: each per-split sensitivity
        # is combined in _call using the separately provided combiner
        FeaturewiseDatasetMeasure.__init__(self, combiner=None, **kwargs)

        self.__analyzer = analyzer
        """Analyzer to use per split"""

        self.__combiner = combiner
        """Which functor to use to combine all sensitivities"""

        self.__splitter = splitter
        """Splitter to be used on the dataset"""

        self.__insplit_index = insplit_index


    def untrain(self):
        """Untrain SplitFeaturewiseDatasetMeasure
        """
        if self.__analyzer is not None:
            self.__analyzer.untrain()


    def _call(self, dataset):
        """Compute sensitivity per each split of `dataset` and combine."""
        # local bindings
        analyzer = self.__analyzer
        insplit_index = self.__insplit_index

        sensitivities = []
        self.states.splits = splits = []
        store_splits = self.states.isEnabled("splits")

        for ind, split in enumerate(self.__splitter(dataset)):
            ds = split[insplit_index]
            if __debug__:
                debug("SA", "Computing sensitivity for split %d using %s"
                      % (ind, analyzer))
            sensitivity = analyzer(ds)
            sensitivities.append(sensitivity)
            if store_splits:
                splits.append(split)

        self.states.sensitivities = sensitivities
        if self.__combiner is not None:
            sensitivities = self.__combiner(sensitivities)
        return sensitivities


640      """Set sensitivity analyzers to be merged into a single output""" 
641   
642   
643       
    @group_kwargs(prefixes=['slave_'], assign=True)
    def __init__(self,
                 clf,
                 analyzer=None,
                 combined_analyzer=None,
                 slave_kwargs={},
                 **kwargs):
        """Initialize Sensitivity Analyzer for `BoostedClassifier`

        :Parameters:
          clf : `BoostedClassifier`
            Classifier to be used
          analyzer : analyzer
            Is used to populate combined_analyzer
          slave_*
            Arguments to pass to created analyzer if analyzer is None
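
        A sketch (`boosted_clf` standing in for some `BoostedClassifier`
        instance, e.g. a SplitClassifier, and `ds` for a dataset)::

          sana = BoostedClassifierSensitivityAnalyzer(boosted_clf)
          sens = sana(ds)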
        """
        Sensitivity.__init__(self, clf, **kwargs)
        if combined_analyzer is None:
            # sanitize kwargs -- the combined analyzer does not expect
            # force_training
            kwargs.pop('force_training', None)
            combined_analyzer = CombinedFeaturewiseDatasetMeasure(**kwargs)
        self.__combined_analyzer = combined_analyzer
        """Combined analyzer to use"""

        if analyzer is not None and len(self._slave_kwargs):
            raise ValueError, \
                  "Provide either analyzer or slave_* arguments, not both"
        self.__analyzer = analyzer
        """Analyzer to use for basic classifiers within boosted classifier"""


    def untrain(self):
        """Untrain BoostedClassifierSensitivityAnalyzer
        """
        if self.__analyzer is not None:
            self.__analyzer.untrain()
        self.__combined_analyzer.untrain()


    def _call(self, dataset):
        """Delegate to analyzers of all basic classifiers and combine."""
        analyzers = []
        # create an analyzer per basic classifier
        for clf in self.clf.clfs:
            if self.__analyzer is None:
                analyzer = clf.getSensitivityAnalyzer(**(self._slave_kwargs))
                if analyzer is None:
                    raise ValueError, \
                          "Wasn't able to figure basic analyzer for clf %s" % \
                          `clf`
                if __debug__:
                    debug("SA", "Selected analyzer %s for clf %s" %
                          (`analyzer`, `clf`))
            else:
                # a shallow copy is enough to reuse the same analyzer
                # with different classifiers
                analyzer = copy.copy(self.__analyzer)

            # assign corresponding classifier
            analyzer.clf = clf
            # if clf was trained already -- don't train again
            if clf.trained:
                analyzer._force_training = False
            analyzers.append(analyzer)

        self.__combined_analyzer.analyzers = analyzers

        return self.__combined_analyzer(dataset)

    combined_analyzer = property(fget=lambda x: x.__combined_analyzer)

720      """Set sensitivity analyzer output just to pass through""" 
721   
722      clf_sensitivities = StateVariable(enabled=False, 
723          doc="Stores sensitivities of the proxied classifier") 
724   
725   
    @group_kwargs(prefixes=['slave_'], assign=True)
    def __init__(self,
                 clf,
                 analyzer=None,
                 **kwargs):
        """Initialize Sensitivity Analyzer for `ProxyClassifier`
        """
        Sensitivity.__init__(self, clf, **kwargs)

        if analyzer is not None and len(self._slave_kwargs):
            raise ValueError, \
                  "Provide either analyzer or slave_* arguments, not both"

        self.__analyzer = analyzer
        """Analyzer to use for the proxied classifier"""


    def untrain(self):
        """Untrain ProxyClassifierSensitivityAnalyzer
        """
        if self.__analyzer is not None:
            self.__analyzer.untrain()

    def _call(self, dataset):
        # local binding
        analyzer = self.__analyzer

        if analyzer is None:
            analyzer = self.clf.clf.getSensitivityAnalyzer(
                **(self._slave_kwargs))
            if analyzer is None:
                raise ValueError, \
                      "Wasn't able to figure basic analyzer for clf %s" % \
                      `self.clf.clf`
            if __debug__:
                debug("SA", "Selected analyzer %s for clf %s" %
                      (analyzer, self.clf.clf))
            # bind the chosen analyzer to this instance for reuse
            self.__analyzer = analyzer

        # assign the proxied classifier and avoid retraining it
        analyzer.clf = self.clf.clf
        if self.clf.clf.trained:
            analyzer._force_training = False

        # use _call directly, so that transformers/combiners are not
        # applied twice (they run in this analyzer's own _postcall)
        result = analyzer._call(dataset)
        self.states.clf_sensitivities = result

        return result

    analyzer = property(fget=lambda x: x.__analyzer)

784      """Set sensitivity analyzer output be reverse mapped using mapper of the 
785      slave classifier""" 
786   
    def _call(self, dataset):
        sens = super(MappedClassifierSensitivityAnalyzer,
                     self)._call(dataset)
        # reverse-map the sensitivity into the original feature space
        # using the mapper of the proxied (mapped) classifier
        return self.clf.mapper.reverse(sens)

798      """Set sensitivity analyzer output be reverse mapped using mapper of the 
799      slave classifier""" 
800   
    def _call(self, dataset):
        sens = super(FeatureSelectionClassifierSensitivityAnalyzer,
                     self)._call(dataset)
        # expand the sensitivity back into the full feature space of the
        # dataset, using the mapper of the feature-selected classifier
        return self.clf.maskclf.mapper.reverse(sens)