1
2
3
4
5
6
7
8
9 """Classes for meta classifiers -- classifiers which use other classifiers
10
11 Meta Classifiers can be grouped according to their function as
12
13 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
14 SplitClassifier
15 :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier
16 FeatureSelectionClassifier
17 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
18 MaximalVote MeanPrediction
19
20 """
21
22 __docformat__ = 'restructuredtext'
23
24 import operator
25 import numpy as N
26
27 from mvpa.misc.args import group_kwargs
28 from mvpa.mappers.mask import MaskMapper
29 from mvpa.datasets.splitters import NFoldSplitter
30 from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable
31
32 from mvpa.clfs.base import Classifier
33 from mvpa.misc.transformers import FirstAxisMean
34
35 from mvpa.measures.base import \
36 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
37 MappedClassifierSensitivityAnalyzer, \
38 FeatureSelectionClassifierSensitivityAnalyzer
39
40 from mvpa.base import warning
41
42 if __debug__:
43 from mvpa.base import debug
47 """Classifier containing the farm of other classifiers.
48
49 Should rarely be used directly. Use one of its childs instead
50 """
51
52
53
54 raw_predictions = StateVariable(enabled=False,
55 doc="Predictions obtained from each classifier")
56
57 raw_values = StateVariable(enabled=False,
58 doc="Values obtained from each classifier")
59
60
61 - def __init__(self, clfs=None, propagate_states=True,
62 harvest_attribs=None, copy_attribs='copy',
63 **kwargs):
64 """Initialize the instance.
65
66 :Parameters:
67 clfs : list
68 list of classifier instances to use (slave classifiers)
69 propagate_states : bool
70 either to propagate enabled states into slave classifiers.
71 It is in effect only when slaves get assigned - so if state
72 is enabled not during construction, it would not necessarily
73 propagate into slaves
74 kwargs : dict
75 dict of keyworded arguments which might get used
76 by State or Classifier
77 """
78 if clfs == None:
79 clfs = []
80
81 Classifier.__init__(self, **kwargs)
82 Harvestable.__init__(self, harvest_attribs, copy_attribs)
83
84 self.__clfs = None
85 """Pylint friendly definition of __clfs"""
86
87 self.__propagate_states = propagate_states
88 """Enable current enabled states in slave classifiers"""
89
90 self._setClassifiers(clfs)
91 """Store the list of classifiers"""
92
93
95 if self.__clfs is None or len(self.__clfs)==0:
96
97 prefix_ = []
98 else:
99 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
100 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
101
102
104 """Train `BoostedClassifier`
105 """
106 for clf in self.__clfs:
107 clf.train(dataset)
108
109
    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to do so.

        :Parameters:
          dataset : Dataset
            dataset the classifier was just trained on
        """
        Classifier._posttrain(self, dataset)
        # _harvest() inspects the caller's local namespace, so the loop
        # variable name 'clf' matters -- it is what gets harvested per
        # slave classifier. Do not rename it.
        if self.states.isEnabled('harvested'):
            for clf in self.__clfs:
                self._harvest(locals())
        # reset the retrainable bookkeeping flag so a subsequent train
        # starts from a clean slate
        if self.params.retrainable:
            self.__changedData_isset = False
122
131
132
150
151
153 """Set the classifiers used by the boosted classifier
154
155 We have to allow to set list of classifiers after the object
156 was actually created. It will be used by
157 MulticlassClassifier
158 """
159 self.__clfs = clfs
160 """Classifiers to use"""
161
162 if len(clfs):
163 for flag in ['regression']:
164 values = N.array([clf.params[flag].value for clf in clfs])
165 value = values.any()
166 if __debug__:
167 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
168 "%(clfs)s with %(values)s",
169 msgargs={'flag' : flag, 'value' : value,
170 'clfs' : clfs,
171 'values' : values})
172
173 self.params[flag].value = value
174
175
176 if self.__propagate_states:
177 for clf in self.__clfs:
178 clf.states.enable(self.states.enabled, missingok=True)
179
180
181
182
183 self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
184 if len(clfs)>0:
185 self._clf_internals += self.__clfs[0]._clf_internals
186
197
203
204
205 clfs = property(fget=lambda x:x.__clfs,
206 fset=_setClassifiers,
207 doc="Used classifiers")
208
212 """Classifier which decorates another classifier
213
214 Possible uses:
215
216 - modify data somehow prior training/testing:
217 * normalization
218 * feature selection
219 * modification
220
221 - optimized classifier?
222
223 """
224
243
244
248
250 s = super(ProxyClassifier, self).summary()
251 if self.trained:
252 s += "\n Slave classifier summary:" + \
253 '\n + %s' % \
254 (self.__clf.summary().replace('\n', '\n |'))
255 return s
256
257
258
260 """Train `ProxyClassifier`
261 """
262
263
264 self.__clf.train(dataset)
265
266
267
268
269
270
271
272
273
274
276 """Predict using `ProxyClassifier`
277 """
278 clf = self.__clf
279 if self.states.isEnabled('values'):
280 clf.states.enable(['values'])
281
282 result = clf.predict(data)
283
284 self.states._copy_states_(self.__clf, ['values'], deep=False)
285 return result
286
287
294
295
296 @group_kwargs(prefixes=['slave_'], passthrough=True)
303
304
305 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
306
314 """Base class for combining decisions of multiple classifiers"""
315
316 - def train(self, clfs, dataset):
317 """PredictionsCombiner might need to be trained
318
319 :Parameters:
320 clfs : list of Classifier
321 List of classifiers to combine. Has to be classifiers (not
322 pure predictions), since combiner might use some other
323 state variables (value's) instead of pure prediction's
324 dataset : Dataset
325 training data in this case
326 """
327 pass
328
329
331 """Call function
332
333 :Parameters:
334 clfs : list of Classifier
335 List of classifiers to combine. Has to be classifiers (not
336 pure predictions), since combiner might use some other
337 state variables (value's) instead of pure prediction's
338 """
339 raise NotImplementedError
340
344 """Provides a decision using maximal vote rule"""
345
346 predictions = StateVariable(enabled=True,
347 doc="Voted predictions")
348 all_label_counts = StateVariable(enabled=False,
349 doc="Counts across classifiers for each label/sample")
350
352 """XXX Might get a parameter to use raw decision values if
353 voting is not unambigous (ie two classes have equal number of
354 votes
355 """
356 PredictionsCombiner.__init__(self)
357
358
360 """Actuall callable - perform voting
361
362 Extended functionality which might not be needed actually:
363 Since `BinaryClassifier` might return a list of possible
364 predictions (not just a single one), we should consider all of those
365
366 MaximalVote doesn't care about dataset itself
367 """
368 if len(clfs)==0:
369 return []
370
371 all_label_counts = None
372 for clf in clfs:
373
374 if not clf.states.isEnabled("predictions"):
375 raise ValueError, "MaximalVote needs classifiers (such as " + \
376 "%s) with state 'predictions' enabled" % clf
377 predictions = clf.predictions
378 if all_label_counts is None:
379 all_label_counts = [ {} for i in xrange(len(predictions)) ]
380
381
382 for i in xrange(len(predictions)):
383 prediction = predictions[i]
384 if not operator.isSequenceType(prediction):
385 prediction = (prediction,)
386 for label in prediction:
387
388
389 if not all_label_counts[i].has_key(label):
390 all_label_counts[i][label] = 0
391 all_label_counts[i][label] += 1
392
393 predictions = []
394
395 for i in xrange(len(all_label_counts)):
396 label_counts = all_label_counts[i]
397
398
399 maxk = []
400 maxv = -1
401 for k, v in label_counts.iteritems():
402 if v > maxv:
403 maxk = [k]
404 maxv = v
405 elif v == maxv:
406 maxk.append(k)
407
408 assert len(maxk) >= 1, \
409 "We should have obtained at least a single key of max label"
410
411 if len(maxk) > 1:
412 warning("We got multiple labels %s which have the " % maxk +
413 "same maximal vote %d. XXX disambiguate" % maxv)
414 predictions.append(maxk[0])
415
416 self.all_label_counts = all_label_counts
417 self.predictions = predictions
418 return predictions
419
423 """Provides a decision by taking mean of the results
424 """
425
426 predictions = StateVariable(enabled=True,
427 doc="Mean predictions")
428
430 """Actuall callable - perform meaning
431
432 """
433 if len(clfs)==0:
434 return []
435
436 all_predictions = []
437 for clf in clfs:
438
439 if not clf.states.isEnabled("predictions"):
440 raise ValueError, "MeanPrediction needs classifiers (such " \
441 " as %s) with state 'predictions' enabled" % clf
442 all_predictions.append(clf.predictions)
443
444
445 predictions = N.mean(N.asarray(all_predictions), axis=0)
446 self.predictions = predictions
447 return predictions
448
451 """Provides a decision using training a classifier on predictions/values
452
453 TODO: implement
454 """
455
456 predictions = StateVariable(enabled=True,
457 doc="Trained predictions")
458
459
460 - def __init__(self, clf, variables=None):
461 """Initialize `ClassifierCombiner`
462
463 :Parameters:
464 clf : Classifier
465 Classifier to train on the predictions
466 variables : list of basestring
467 List of state variables stored in 'combined' classifiers, which
468 to use as features for training this classifier
469 """
470 PredictionsCombiner.__init__(self)
471
472 self.__clf = clf
473 """Classifier to train on `variables` states of provided classifiers"""
474
475 if variables == None:
476 variables = ['predictions']
477 self.__variables = variables
478 """What state variables of the classifiers to use"""
479
480
482 """It might be needed to untrain used classifier"""
483 if self.__clf:
484 self.__clf.untrain()
485
487 """
488 """
489 if len(clfs)==0:
490 return []
491
492 raise NotImplementedError
493
497 """`BoostedClassifier` which combines predictions using some
498 `PredictionsCombiner` functor.
499 """
500
501 - def __init__(self, clfs=None, combiner=None, **kwargs):
502 """Initialize the instance.
503
504 :Parameters:
505 clfs : list of Classifier
506 list of classifier instances to use
507 combiner : PredictionsCombiner
508 callable which takes care about combining multiple
509 results into a single one (e.g. maximal vote for
510 classification, MeanPrediction for regression))
511 kwargs : dict
512 dict of keyworded arguments which might get used
513 by State or Classifier
514
515 NB: `combiner` might need to operate not on 'predictions' descrete
516 labels but rather on raw 'class' values classifiers
517 estimate (which is pretty much what is stored under
518 `values`
519 """
520 if clfs == None:
521 clfs = []
522
523 BoostedClassifier.__init__(self, clfs, **kwargs)
524
525
526 if combiner is None:
527 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
528 self.__combiner = combiner
529 """Functor destined to combine results of multiple classifiers"""
530
531
533 """Literal representation of `CombinedClassifier`.
534 """
535 return super(CombinedClassifier, self).__repr__(
536 ["combiner=%s" % repr(self.__combiner)] + prefixes)
537
538
540 """Provide summary for the `CombinedClassifier`.
541 """
542 s = super(CombinedClassifier, self).summary()
543 if self.trained:
544 s += "\n Slave classifiers summaries:"
545 for i, clf in enumerate(self.clfs):
546 s += '\n + %d clf: %s' % \
547 (i, clf.summary().replace('\n', '\n |'))
548 return s
549
550
559
566
567
588
589
590 combiner = property(fget=lambda x:x.__combiner,
591 doc="Used combiner to derive a single result")
592
596 """`TreeClassifier` which allows to create hierarchy of classifiers
597
598 Functions by grouping some labels into a single "meta-label" and training
599 classifier first to separate between meta-labels. Then
600 each group further proceeds with classification within each group.
601
602 Possible scenarios::
603
604 TreeClassifier(SVM(),
605 {'animate': ((1,2,3,4),
606 TreeClassifier(SVM(),
607 {'human': (('male', 'female'), SVM()),
608 'animals': (('monkey', 'dog'), SMLR())})),
609 'inanimate': ((5,6,7,8), SMLR())})
610
611 would create classifier which would first do binary classification
612 to separate animate from inanimate, then for animate result it
613 would separate to classify human vs animal and so on::
614
615 SVM
616 / \
617 animate inanimate
618 / \
619 SVM SMLR
620 / \ / | \ \
621 human animal 5 6 7 8
622 | |
623 SVM SVM
624 / \ / \
625 male female monkey dog
626 1 2 3 4
627
628 If it is desired to have a trailing node with a single label and
629 thus without any classification, such as in
630
631 SVM
632 / \
633 g1 g2
634 / \
635 1 SVM
636 / \
637 2 3
638
639 then just specify None as the classifier to use::
640
641 TreeClassifier(SVM(),
642 {'g1': ((1,), None),
643 'g2': ((1,2,3,4), SVM())})
644
645 """
646
647 _DEV__doc = """
648 Questions:
649 * how to collect confusion matrices at a particular layer if such
650 classifier is given to SplitClassifier or CVTE
651
652 * What additional states to add, something like
653 clf_labels -- store remapped labels for the dataset
654 clf_values ...
655
656 * What do we store into values ? just values from the clfs[]
657 for corresponding samples, or top level clf values as well?
658
659 * what should be SensitivityAnalyzer? by default it would just
660 use top slave classifier (i.e. animate/inanimate)
661
662 Problems?
663 * .clf is not actually "proxied" per se, so not sure what things
664 should be taken care of yet...
665
666 TODO:
667 * Allow a group to be just a single category, so no further
668 classifier is needed, it just should stay separate from the
669 other groups
670
671 Possible TODO:
672 * Add ability to provide results of clf.values as features into
673 input of clfs[]. This way we could provide additional 'similarity'
674 information to the "other" branch
675
676 """
677
678 - def __init__(self, clf, groups, **kwargs):
679 """Initialize TreeClassifier
680
681 :Parameters:
682 clf : Classifier
683 Classifier to separate between the groups
684 groups : dict of meta-label: tuple of (tuple of labels, classifier)
685 Defines the groups of labels and their classifiers.
686 See :class:`~mvpa.clfs.meta.TreeClassifier` for example
687 """
688
689
690 ProxyClassifier.__init__(self, clf, **kwargs)
691 self._regressionIsBogus()
692
693
694
695
696
697 self._groups = groups
698 self._index2group = groups.keys()
699
700
701
702
703
704
705
706 self.clfs = dict([(gk, c) for gk, (ls, c) in groups.iteritems()])
707 """Dictionary of classifiers used by the groups"""
708
709
711 """String representation of TreeClassifier
712 """
713 prefix = "groups=%s" % repr(self._groups)
714 return super(TreeClassifier, self).__repr__([prefix] + prefixes)
715
716
718 """Provide summary for the `TreeClassifier`.
719 """
720 s = super(TreeClassifier, self).summary()
721 if self.trained:
722 s += "\n Node classifiers summaries:"
723 for i, (clfname, clf) in enumerate(self.clfs.iteritems()):
724 s += '\n + %d %s clf: %s' % \
725 (i, clfname, clf.summary().replace('\n', '\n |'))
726 return s
727
728
730 """Train TreeClassifier
731
732 First train .clf on groupped samples, then train each of .clfs
733 on a corresponding subset of samples.
734 """
735
736 clf, clfs, index2group = self.clf, self.clfs, self._index2group
737
738
739 groups = self._groups
740 labels_map = dataset.labels_map
741
742 if labels_map is None: labels_map = {}
743 groups_labels = {}
744 label2index = {}
745 known = set()
746 for gi, gk in enumerate(index2group):
747 ls = groups[gk][0]
748
749 ls_ = [labels_map.get(l, l) for l in ls]
750 known_already = known.intersection(ls_)
751 if len(known_already):
752 raise ValueError, "Grouping of labels is not appropriate. " \
753 "Got labels %s already among known in %s. " \
754 "Used labelsmap %s" % (known_already, known, labels_map)
755 groups_labels[gk] = ls_
756 for l in ls_:
757 label2index[l] = gi
758 known = known.union(ls_)
759
760
761
762
763
764 dsul = set(dataset.uniquelabels)
765 if known.intersection(dsul) != dsul:
766 raise ValueError, \
767 "Dataset %s had some labels not defined in groups: %s. " \
768 "Known are %s" % \
769 (dataset, dsul.difference(known), known)
770
771
772
773
774
775
776
777
778 ds_group = dataset.copy(deep=False)
779
780 ds_group.labels = [label2index[l] for l in dataset.labels]
781
782
783 if __debug__:
784 debug('CLFTREE', "Training primary %(clf)s on %(ds)s",
785 msgargs=dict(clf=clf, ds=ds_group))
786 clf.train(ds_group)
787
788
789
790
791
792
793
794
795
796
797
798 for gk in groups.iterkeys():
799 clf = clfs[gk]
800 group_labels = groups_labels[gk]
801 if clf is None:
802 if len(group_labels) != 1:
803 raise ValueError(
804 "Trailing nodes with no classifier assigned must have "
805 "only a single label associated. Got %s defined in "
806 "group %r of %s"
807 % (group_labels, gk, self))
808 else:
809
810 ids = dataset.idsbylabels(group_labels)
811 ds_group = dataset.selectSamples(ids)
812 if __debug__:
813 debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s",
814 msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group))
815
816 clf.train(ds_group)
817
818
826
827
829 """
830 """
831
832 clfs, index2group, groups = self.clfs, self._index2group, self._groups
833 clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data))
834
835 clf_predictions = clf_predictions.astype(int)
836
837
838 predictions = N.array([N.nan]*len(data))
839 for pred_group in set(clf_predictions):
840 gk = index2group[pred_group]
841 clf_ = clfs[gk]
842 group_indexes = (clf_predictions == pred_group)
843 if __debug__:
844 debug('CLFTREE', 'Predicting for group %s using %s on %d samples' %
845 (gk, clf_, N.sum(group_indexes)))
846 if clf_ is None:
847 predictions[group_indexes] = groups[gk][0]
848 else:
849 predictions[group_indexes] = clf_.predict(data[group_indexes])
850 return predictions
851
854 """`ProxyClassifier` which maps set of two labels into +1 and -1
855 """
856
857 - def __init__(self, clf, poslabels, neglabels, **kwargs):
858 """
859 :Parameters:
860 clf : Classifier
861 classifier to use
862 poslabels : list
863 list of labels which are treated as +1 category
864 neglabels : list
865 list of labels which are treated as -1 category
866 """
867
868 ProxyClassifier.__init__(self, clf, **kwargs)
869
870 self._regressionIsBogus()
871
872
873 sposlabels = set(poslabels)
874 sneglabels = set(neglabels)
875
876
877 overlap = sposlabels.intersection(sneglabels)
878 if len(overlap)>0:
879 raise ValueError("Sets of positive and negative labels for " +
880 "BinaryClassifier must not overlap. Got overlap " %
881 overlap)
882
883 self.__poslabels = list(sposlabels)
884 self.__neglabels = list(sneglabels)
885
886
887
888
889
890
891
892
893 if len(self.__poslabels) > 1:
894 self.__predictpos = self.__poslabels
895 else:
896 self.__predictpos = self.__poslabels[0]
897
898 if len(self.__neglabels) > 1:
899 self.__predictneg = self.__neglabels
900 else:
901 self.__predictneg = self.__neglabels[0]
902
903
905 prefix = "poslabels=%s, neglabels=%s" % (
906 repr(self.__poslabels), repr(self.__neglabels))
907 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
908
909
911 """Train `BinaryClassifier`
912 """
913 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
914 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
915
916
917 idlabels.sort()
918
919 orig_labels = None
920
921
922
923
924 if len(idlabels) == dataset.nsamples \
925 and [x[0] for x in idlabels] == range(dataset.nsamples):
926
927
928 datasetselected = dataset
929 orig_labels = dataset.labels
930 if __debug__:
931 debug('CLFBIN',
932 "Assigned all %d samples for binary " %
933 (dataset.nsamples) +
934 " classification among labels %s/+1 and %s/-1" %
935 (self.__poslabels, self.__neglabels))
936 else:
937 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
938 if __debug__:
939 debug('CLFBIN',
940 "Selected %d samples out of %d samples for binary " %
941 (len(idlabels), dataset.nsamples) +
942 " classification among labels %s/+1 and %s/-1" %
943 (self.__poslabels, self.__neglabels) +
944 ". Selected %s" % datasetselected)
945
946
947 datasetselected.labels = [ x[1] for x in idlabels ]
948
949
950 if __debug__:
951 assert((datasetselected.uniquelabels == [-1, 1]).all())
952
953 self.clf.train(datasetselected)
954
955 if not orig_labels is None:
956 dataset.labels = orig_labels
957
959 """Predict the labels for a given `data`
960
961 Predicts using binary classifier and spits out list (for each sample)
962 where with either poslabels or neglabels as the "label" for the sample.
963 If there was just a single label within pos or neg labels then it would
964 return not a list but just that single label.
965 """
966 binary_predictions = ProxyClassifier._predict(self, data)
967 self.values = binary_predictions
968 predictions = [ {-1: self.__predictneg,
969 +1: self.__predictpos}[x] for x in binary_predictions]
970 self.predictions = predictions
971 return predictions
972
976 """`CombinedClassifier` to perform multiclass using a list of
977 `BinaryClassifier`.
978
979 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which
980 is yet to think about)
981 """
982
983 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
984 """Initialize the instance
985
986 :Parameters:
987 clf : Classifier
988 classifier based on which multiple classifiers are created
989 for multiclass
990 bclf_type
991 "1-vs-1" or "1-vs-all", determines the way to generate binary
992 classifiers
993 """
994 CombinedClassifier.__init__(self, **kwargs)
995 self._regressionIsBogus()
996 if not clf is None:
997 clf._regressionIsBogus()
998
999 self.__clf = clf
1000 """Store sample instance of basic classifier"""
1001
1002
1003 if bclf_type == "1-vs-1":
1004 pass
1005 elif bclf_type == "1-vs-all":
1006 raise NotImplementedError
1007 else:
1008 raise ValueError, \
1009 "Unknown type of classifier %s for " % bclf_type + \
1010 "BoostedMulticlassClassifier"
1011 self.__bclf_type = bclf_type
1012
1013
1014
1016 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
1017 repr(self.__clf))
1018 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
1019
1020
1022 """Train classifier
1023 """
1024
1025 ulabels = dataset.uniquelabels
1026 if self.__bclf_type == "1-vs-1":
1027
1028 biclfs = []
1029 for i in xrange(len(ulabels)):
1030 for j in xrange(i+1, len(ulabels)):
1031 clf = self.__clf.clone()
1032 biclfs.append(
1033 BinaryClassifier(
1034 clf,
1035 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
1036 if __debug__:
1037 debug("CLFMC", "Created %d binary classifiers for %d labels" %
1038 (len(biclfs), len(ulabels)))
1039
1040 self.clfs = biclfs
1041
1042 elif self.__bclf_type == "1-vs-all":
1043 raise NotImplementedError
1044
1045
1046 CombinedClassifier._train(self, dataset)
1047
1051 """`BoostedClassifier` to work on splits of the data
1052
1053 """
1054
1055 """
1056 TODO: SplitClassifier and MulticlassClassifier have too much in
1057 common -- need to refactor: just need a splitter which would
1058 split dataset in pairs of class labels. MulticlassClassifier
1059 does just a tiny bit more which might be not necessary at
1060 all: map sets of labels into 2 categories...
1061 """
1062
1063
1064
1065 confusion = StateVariable(enabled=False,
1066 doc="Resultant confusion whenever classifier trained " +
1067 "on 1 part and tested on 2nd part of each split")
1068
1069 splits = StateVariable(enabled=False, doc=
1070 """Store the actual splits of the data. Can be memory expensive""")
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1085 """Initialize the instance
1086
1087 :Parameters:
1088 clf : Classifier
1089 classifier based on which multiple classifiers are created
1090 for multiclass
1091 splitter : Splitter
1092 `Splitter` to use to split the dataset prior training
1093 """
1094
1095 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
1096 self.__clf = clf
1097 """Store sample instance of basic classifier"""
1098
1099 if isinstance(splitter, type):
1100 raise ValueError, \
1101 "Please provide an instance of a splitter, not a type." \
1102 " Got %s" % splitter
1103
1104 self.__splitter = splitter
1105
1106
1108 """Train `SplitClassifier`
1109 """
1110
1111 bclfs = []
1112
1113
1114 states = self.states
1115
1116 clf_template = self.__clf
1117 if states.isEnabled('confusion'):
1118 states.confusion = clf_template._summaryClass()
1119 if states.isEnabled('training_confusion'):
1120 clf_template.states.enable(['training_confusion'])
1121 states.training_confusion = clf_template._summaryClass()
1122
1123 clf_hastestdataset = hasattr(clf_template, 'testdataset')
1124
1125
1126
1127 for split in self.__splitter.splitcfg(dataset):
1128 if __debug__:
1129 debug("CLFSPL_",
1130 "Deepcopying %(clf)s for %(sclf)s",
1131 msgargs={'clf':clf_template,
1132 'sclf':self})
1133 clf = clf_template.clone()
1134 bclfs.append(clf)
1135 self.clfs = bclfs
1136
1137 self.splits = []
1138
1139 for i, split in enumerate(self.__splitter(dataset)):
1140 if __debug__:
1141 debug("CLFSPL", "Training classifier for split %d" % (i))
1142
1143 if states.isEnabled("splits"):
1144 self.splits.append(split)
1145
1146 clf = self.clfs[i]
1147
1148
1149 if clf_hastestdataset:
1150 clf.testdataset = split[1]
1151
1152 clf.train(split[0])
1153
1154
1155 if clf_hastestdataset:
1156 clf.testdataset = None
1157
1158 if states.isEnabled("confusion"):
1159 predictions = clf.predict(split[1].samples)
1160 self.confusion.add(split[1].labels, predictions,
1161 clf.states.get('values', None))
1162 if __debug__:
1163 dact = debug.active
1164 if 'CLFSPL_' in dact:
1165 debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion))
1166 elif 'CLFSPL' in dact:
1167 debug('CLFSPL', 'Split %d error %.2f%%'
1168 % (i, self.confusion.summaries[-1].error))
1169
1170 if states.isEnabled("training_confusion"):
1171 states.training_confusion += \
1172 clf.states.training_confusion
1173
1174 try:
1175 if states.isEnabled("confusion"):
1176 states.confusion.labels_map = dataset.labels_map
1177 if states.isEnabled("training_confusion"):
1178 states.training_confusion.labels_map = dataset.labels_map
1179 except:
1180 pass
1181
1182
1183 @group_kwargs(prefixes=['slave_'], passthrough=True)
1196
1197 splitter = property(fget=lambda x:x.__splitter,
1198 doc="Splitter user by SplitClassifier")
1199
1202 """`ProxyClassifier` which uses some mapper prior training/testing.
1203
1204 `MaskMapper` can be used just a subset of features to
1205 train/classify.
1206 Having such classifier we can easily create a set of classifiers
1207 for BoostedClassifier, where each classifier operates on some set
1208 of features, e.g. set of best spheres from SearchLight, set of
1209 ROIs selected elsewhere. It would be different from simply
1210 applying whole mask over the dataset, since here initial decision
1211 is made by each classifier and then later on they vote for the
1212 final decision across the set of classifiers.
1213 """
1214
1215 - def __init__(self, clf, mapper, **kwargs):
1216 """Initialize the instance
1217
1218 :Parameters:
1219 clf : Classifier
1220 classifier based on which mask classifiers is created
1221 mapper
1222 whatever `Mapper` comes handy
1223 """
1224 ProxyClassifier.__init__(self, clf, **kwargs)
1225
1226 self.__mapper = mapper
1227 """mapper to help us our with prepping data to
1228 training/classification"""
1229
1230
1232 """Train `MappedClassifier`
1233 """
1234
1235
1236
1237 self.__mapper.train(dataset)
1238
1239
1240 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
1241 ProxyClassifier._train(self, wdataset)
1242
1243
1248
1249
1250 @group_kwargs(prefixes=['slave_'], passthrough=True)
1257
1258
1259 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1260
1264 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
1265
1266 `FeatureSelection` is used first to select features for the classifier to
1267 use for prediction. Internally it would rely on MappedClassifier which
1268 would use created MaskMapper.
1269
1270 TODO: think about removing overhead of retraining the same classifier if
1271 feature selection was carried out with the same classifier already. It
1272 has been addressed by adding .trained property to classifier, but now
1273 we should expclitely use isTrained here if we want... need to think more
1274 """
1275
1276 _clf_internals = [ 'does_feature_selection', 'meta' ]
1277
1278 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1279 """Initialize the instance
1280
1281 :Parameters:
1282 clf : Classifier
1283 classifier based on which mask classifiers is created
1284 feature_selection : FeatureSelection
1285 whatever `FeatureSelection` comes handy
1286 testdataset : Dataset
1287 optional dataset which would be given on call to feature_selection
1288 """
1289 ProxyClassifier.__init__(self, clf, **kwargs)
1290
1291 self.__maskclf = None
1292 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""
1293
1294 self.__feature_selection = feature_selection
1295 """`FeatureSelection` to select the features prior training"""
1296
1297 self.__testdataset = testdataset
1298 """`FeatureSelection` might like to use testdataset"""
1299
1300
1302 """Untrain `FeatureSelectionClassifier`
1303
1304 Has to untrain any known classifier
1305 """
1306 if self.__feature_selection is not None:
1307 self.__feature_selection.untrain()
1308 if not self.trained:
1309 return
1310 if not self.__maskclf is None:
1311 self.__maskclf.untrain()
1312 super(FeatureSelectionClassifier, self).untrain()
1313
1314
1316 """Train `FeatureSelectionClassifier`
1317 """
1318
1319 self.__feature_selection.states._changeTemporarily(
1320 enable_states=["selected_ids"])
1321
1322 if __debug__:
1323 debug("CLFFS", "Performing feature selection using %s" %
1324 self.__feature_selection + " on %s" % dataset)
1325
1326 (wdataset, tdataset) = self.__feature_selection(dataset,
1327 self.__testdataset)
1328 if __debug__:
1329 add_ = ""
1330 if "CLFFS_" in debug.active:
1331 add_ = " Selected features: %s" % \
1332 self.__feature_selection.selected_ids
1333 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
1334 "%(dsnfeat)d features.%(app)s",
1335 msgargs={'fs':self.__feature_selection,
1336 'nfeat':wdataset.nfeatures,
1337 'dsnfeat':dataset.nfeatures,
1338 'app':add_})
1339
1340
1341
1342 mappermask = N.zeros(dataset.nfeatures)
1343 mappermask[self.__feature_selection.selected_ids] = 1
1344 mapper = MaskMapper(mappermask)
1345
1346 self.__feature_selection.states._resetEnabledTemporarily()
1347
1348
1349 self.__maskclf = MappedClassifier(self.clf, mapper)
1350
1351
1352 self.__maskclf.clf.train(wdataset)
1353
1354
1355
1356
1357
1359 """Return used feature ids for `FeatureSelectionClassifier`
1360
1361 """
1362 return self.__feature_selection.selected_ids
1363
1365 """Predict using `FeatureSelectionClassifier`
1366 """
1367 clf = self.__maskclf
1368 if self.states.isEnabled('values'):
1369 clf.states.enable(['values'])
1370
1371 result = clf._predict(data)
1372
1373 self.states._copy_states_(clf, ['values'], deep=False)
1374 return result
1375
1377 """Set testing dataset to be used for feature selection
1378 """
1379 self.__testdataset = testdataset
1380
1381 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1382 feature_selection = property(lambda x:x.__feature_selection,
1383 doc="Used `FeatureSelection`")
1384
1385 @group_kwargs(prefixes=['slave_'], passthrough=True)
1395
1396
1397
1398 testdataset = property(fget=lambda x:x.__testdataset,
1399 fset=setTestDataset)
1400