1
2
3
4
5
6
7
8
9 """Classes for meta classifiers -- classifiers which use other classifiers
10
11 Meta Classifiers can be grouped according to their function as
12
13 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
14 SplitClassifier
15 :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier
16 FeatureSelectionClassifier
17 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
18 MaximalVote MeanPrediction
19
20 """
21
22 __docformat__ = 'restructuredtext'
23
24 import operator
25 import numpy as N
26
27 from mvpa.misc.args import group_kwargs
28 from mvpa.mappers.mask import MaskMapper
29 from mvpa.datasets.splitters import NFoldSplitter
30 from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable
31
32 from mvpa.clfs.base import Classifier
33 from mvpa.misc.transformers import FirstAxisMean
34
35 from mvpa.measures.base import \
36 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
37 MappedClassifierSensitivityAnalyzer, \
38 FeatureSelectionClassifierSensitivityAnalyzer
39
40 from mvpa.base import warning
41
42 if __debug__:
43 from mvpa.base import debug
47 """Classifier containing the farm of other classifiers.
48
49 Should rarely be used directly. Use one of its childs instead
50 """
51
52
53
54 raw_predictions = StateVariable(enabled=False,
55 doc="Predictions obtained from each classifier")
56
57 raw_values = StateVariable(enabled=False,
58 doc="Values obtained from each classifier")
59
60
61 - def __init__(self, clfs=None, propagate_states=True,
62 harvest_attribs=None, copy_attribs='copy',
63 **kwargs):
64 """Initialize the instance.
65
66 :Parameters:
67 clfs : list
68 list of classifier instances to use (slave classifiers)
69 propagate_states : bool
70 either to propagate enabled states into slave classifiers.
71 It is in effect only when slaves get assigned - so if state
72 is enabled not during construction, it would not necessarily
73 propagate into slaves
74 kwargs : dict
75 dict of keyworded arguments which might get used
76 by State or Classifier
77 """
78 if clfs == None:
79 clfs = []
80
81 Classifier.__init__(self, **kwargs)
82 Harvestable.__init__(self, harvest_attribs, copy_attribs)
83
84 self.__clfs = None
85 """Pylint friendly definition of __clfs"""
86
87 self.__propagate_states = propagate_states
88 """Enable current enabled states in slave classifiers"""
89
90 self._setClassifiers(clfs)
91 """Store the list of classifiers"""
92
93
95 if self.__clfs is None or len(self.__clfs)==0:
96
97 prefix_ = []
98 else:
99 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
100 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
101
102
104 """Train `BoostedClassifier`
105 """
106 for clf in self.__clfs:
107 clf.train(dataset)
108
109
    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to do so.

        :Parameters:
          dataset : Dataset
            dataset the classifier was just trained on
        """
        Classifier._posttrain(self, dataset)
        # _harvest() inspects the caller's local namespace, so the loop
        # variable name 'clf' matters -- it is what gets harvested per
        # slave classifier. Do not rename it.
        if self.states.isEnabled('harvested'):
            for clf in self.__clfs:
                self._harvest(locals())
        # reset the retrainable bookkeeping flag so a subsequent train
        # starts from a clean slate
        if self.params.retrainable:
            self.__changedData_isset = False
122
131
132
150
151
153 """Set the classifiers used by the boosted classifier
154
155 We have to allow to set list of classifiers after the object
156 was actually created. It will be used by
157 MulticlassClassifier
158 """
159 self.__clfs = clfs
160 """Classifiers to use"""
161
162 if len(clfs):
163 for flag in ['regression']:
164 values = N.array([clf.params[flag].value for clf in clfs])
165 value = values.any()
166 if __debug__:
167 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
168 "%(clfs)s with %(values)s",
169 msgargs={'flag' : flag, 'value' : value,
170 'clfs' : clfs,
171 'values' : values})
172
173 self.params[flag].value = value
174
175
176 if self.__propagate_states:
177 for clf in self.__clfs:
178 clf.states.enable(self.states.enabled, missingok=True)
179
180
181
182
183 self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
184 if len(clfs)>0:
185 self._clf_internals += self.__clfs[0]._clf_internals
186
197
203
204
205 clfs = property(fget=lambda x:x.__clfs,
206 fset=_setClassifiers,
207 doc="Used classifiers")
208
212 """Classifier which decorates another classifier
213
214 Possible uses:
215
216 - modify data somehow prior training/testing:
217 * normalization
218 * feature selection
219 * modification
220
221 - optimized classifier?
222
223 """
224
243
244
248
250 s = super(ProxyClassifier, self).summary()
251 if self.trained:
252 s += "\n Slave classifier summary:" + \
253 '\n + %s' % \
254 (self.__clf.summary().replace('\n', '\n |'))
255 return s
256
257
258
260 """Train `ProxyClassifier`
261 """
262
263
264 self.__clf.train(dataset)
265
266
267
268
269
270
271
272
273
274
276 """Predict using `ProxyClassifier`
277 """
278 clf = self.__clf
279 if self.states.isEnabled('values'):
280 clf.states.enable(['values'])
281
282 result = clf.predict(data)
283
284 self.states._copy_states_(self.__clf, ['values'], deep=False)
285 return result
286
287
294
295
296 @group_kwargs(prefixes=['slave_'], passthrough=True)
303
304
305 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
306
314 """Base class for combining decisions of multiple classifiers"""
315
316 - def train(self, clfs, dataset):
317 """PredictionsCombiner might need to be trained
318
319 :Parameters:
320 clfs : list of Classifier
321 List of classifiers to combine. Has to be classifiers (not
322 pure predictions), since combiner might use some other
323 state variables (value's) instead of pure prediction's
324 dataset : Dataset
325 training data in this case
326 """
327 pass
328
329
331 """Call function
332
333 :Parameters:
334 clfs : list of Classifier
335 List of classifiers to combine. Has to be classifiers (not
336 pure predictions), since combiner might use some other
337 state variables (value's) instead of pure prediction's
338 """
339 raise NotImplementedError
340
344 """Provides a decision using maximal vote rule"""
345
346 predictions = StateVariable(enabled=True,
347 doc="Voted predictions")
348 all_label_counts = StateVariable(enabled=False,
349 doc="Counts across classifiers for each label/sample")
350
352 """XXX Might get a parameter to use raw decision values if
353 voting is not unambigous (ie two classes have equal number of
354 votes
355 """
356 PredictionsCombiner.__init__(self)
357
358
360 """Actuall callable - perform voting
361
362 Extended functionality which might not be needed actually:
363 Since `BinaryClassifier` might return a list of possible
364 predictions (not just a single one), we should consider all of those
365
366 MaximalVote doesn't care about dataset itself
367 """
368 if len(clfs)==0:
369 return []
370
371 all_label_counts = None
372 for clf in clfs:
373
374 if not clf.states.isEnabled("predictions"):
375 raise ValueError, "MaximalVote needs classifiers (such as " + \
376 "%s) with state 'predictions' enabled" % clf
377 predictions = clf.predictions
378 if all_label_counts is None:
379 all_label_counts = [ {} for i in xrange(len(predictions)) ]
380
381
382 for i in xrange(len(predictions)):
383 prediction = predictions[i]
384 if not operator.isSequenceType(prediction):
385 prediction = (prediction,)
386 for label in prediction:
387
388
389 if not all_label_counts[i].has_key(label):
390 all_label_counts[i][label] = 0
391 all_label_counts[i][label] += 1
392
393 predictions = []
394
395 for i in xrange(len(all_label_counts)):
396 label_counts = all_label_counts[i]
397
398
399 maxk = []
400 maxv = -1
401 for k, v in label_counts.iteritems():
402 if v > maxv:
403 maxk = [k]
404 maxv = v
405 elif v == maxv:
406 maxk.append(k)
407
408 assert len(maxk) >= 1, \
409 "We should have obtained at least a single key of max label"
410
411 if len(maxk) > 1:
412 warning("We got multiple labels %s which have the " % maxk +
413 "same maximal vote %d. XXX disambiguate" % maxv)
414 predictions.append(maxk[0])
415
416 self.all_label_counts = all_label_counts
417 self.predictions = predictions
418 return predictions
419
423 """Provides a decision by taking mean of the results
424 """
425
426 predictions = StateVariable(enabled=True,
427 doc="Mean predictions")
428
430 """Actuall callable - perform meaning
431
432 """
433 if len(clfs)==0:
434 return []
435
436 all_predictions = []
437 for clf in clfs:
438
439 if not clf.states.isEnabled("predictions"):
440 raise ValueError, "MeanPrediction needs classifiers (such " \
441 " as %s) with state 'predictions' enabled" % clf
442 all_predictions.append(clf.predictions)
443
444
445 predictions = N.mean(N.asarray(all_predictions), axis=0)
446 self.predictions = predictions
447 return predictions
448
451 """Provides a decision using training a classifier on predictions/values
452
453 TODO: implement
454 """
455
456 predictions = StateVariable(enabled=True,
457 doc="Trained predictions")
458
459
460 - def __init__(self, clf, variables=None):
461 """Initialize `ClassifierCombiner`
462
463 :Parameters:
464 clf : Classifier
465 Classifier to train on the predictions
466 variables : list of basestring
467 List of state variables stored in 'combined' classifiers, which
468 to use as features for training this classifier
469 """
470 PredictionsCombiner.__init__(self)
471
472 self.__clf = clf
473 """Classifier to train on `variables` states of provided classifiers"""
474
475 if variables == None:
476 variables = ['predictions']
477 self.__variables = variables
478 """What state variables of the classifiers to use"""
479
480
482 """It might be needed to untrain used classifier"""
483 if self.__clf:
484 self.__clf.untrain()
485
487 """
488 """
489 if len(clfs)==0:
490 return []
491
492 raise NotImplementedError
493
497 """`BoostedClassifier` which combines predictions using some
498 `PredictionsCombiner` functor.
499 """
500
501 - def __init__(self, clfs=None, combiner=None, **kwargs):
502 """Initialize the instance.
503
504 :Parameters:
505 clfs : list of Classifier
506 list of classifier instances to use
507 combiner : PredictionsCombiner
508 callable which takes care about combining multiple
509 results into a single one (e.g. maximal vote for
510 classification, MeanPrediction for regression))
511 kwargs : dict
512 dict of keyworded arguments which might get used
513 by State or Classifier
514
515 NB: `combiner` might need to operate not on 'predictions' descrete
516 labels but rather on raw 'class' values classifiers
517 estimate (which is pretty much what is stored under
518 `values`
519 """
520 if clfs == None:
521 clfs = []
522
523 BoostedClassifier.__init__(self, clfs, **kwargs)
524
525
526 if combiner is None:
527 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
528 self.__combiner = combiner
529 """Functor destined to combine results of multiple classifiers"""
530
531
533 """Literal representation of `CombinedClassifier`.
534 """
535 return super(CombinedClassifier, self).__repr__(
536 ["combiner=%s" % repr(self.__combiner)] + prefixes)
537
538
540 """Provide summary for the `CombinedClassifier`.
541 """
542 s = super(CombinedClassifier, self).summary()
543 if self.trained:
544 s += "\n Slave classifiers summaries:"
545 for i, clf in enumerate(self.clfs):
546 s += '\n + %d clf: %s' % \
547 (i, clf.summary().replace('\n', '\n |'))
548 return s
549
550
559
566
567
588
589
590 combiner = property(fget=lambda x:x.__combiner,
591 doc="Used combiner to derive a single result")
592
596 """`TreeClassifier` which allows to create hierarchy of classifiers
597
598 Functions by grouping some labels into a single "meta-label" and training
599 classifier first to separate between meta-labels. Then
600 each group further proceeds with classification within each group.
601
602 Possible scenarios::
603
604 TreeClassifier(SVM(),
605 {'animate': ((1,2,3,4),
606 TreeClassifier(SVM(),
607 {'human': (('male', 'female'), SVM()),
608 'animals': (('monkey', 'dog'), SMLR())})),
609 'inanimate': ((5,6,7,8), SMLR())})
610
611 would create classifier which would first do binary classification
612 to separate animate from inanimate, then for animate result it
613 would separate to classify human vs animal and so on::
614
615 SVM
616 / \
617 animate inanimate
618 / \
619 SVM SMLR
620 / \ / | \ \
621 human animal 5 6 7 8
622 | |
623 SVM SVM
624 / \ / \
625 male female monkey dog
626 1 2 3 4
627
628 If it is desired to have a trailing node with a single label and
629 thus without any classification, such as in
630
631 SVM
632 / \
633 g1 g2
634 / \
635 1 SVM
636 / \
637 2 3
638
639 then just specify None as the classifier to use::
640
641 TreeClassifier(SVM(),
642 {'g1': ((1,), None),
643 'g2': ((1,2,3,4), SVM())})
644
645 """
646
647 _DEV__doc = """
648 Questions:
649 * how to collect confusion matrices at a particular layer if such
650 classifier is given to SplitClassifier or CVTE
651
652 * What additional states to add, something like
653 clf_labels -- store remapped labels for the dataset
654 clf_values ...
655
656 * What do we store into values ? just values from the clfs[]
657 for corresponding samples, or top level clf values as well?
658
659 * what should be SensitivityAnalyzer? by default it would just
660 use top slave classifier (i.e. animate/inanimate)
661
662 Problems?
663 * .clf is not actually "proxied" per se, so not sure what things
664 should be taken care of yet...
665
666 TODO:
667 * Allow a group to be just a single category, so no further
668 classifier is needed, it just should stay separate from the
669 other groups
670
671 Possible TODO:
672 * Add ability to provide results of clf.values as features into
673 input of clfs[]. This way we could provide additional 'similarity'
674 information to the "other" branch
675
676 """
677
678 - def __init__(self, clf, groups, **kwargs):
679 """Initialize TreeClassifier
680
681 :Parameters:
682 clf : Classifier
683 Classifier to separate between the groups
684 groups : dict of meta-label: tuple of (tuple of labels, classifier)
685 Defines the groups of labels and their classifiers.
686 See :class:`~mvpa.clfs.meta.TreeClassifier` for example
687 """
688
689
690 ProxyClassifier.__init__(self, clf, **kwargs)
691 self._regressionIsBogus()
692
693
694
695
696
697 self._groups = groups
698 self._index2group = groups.keys()
699
700
701
702
703
704
705
706 self.clfs = dict([(gk, c) for gk, (ls, c) in groups.iteritems()])
707 """Dictionary of classifiers used by the groups"""
708
709
711 """String representation of TreeClassifier
712 """
713 prefix = "groups=%s" % repr(self._groups)
714 return super(TreeClassifier, self).__repr__([prefix] + prefixes)
715
716
718 """Provide summary for the `TreeClassifier`.
719 """
720 s = super(TreeClassifier, self).summary()
721 if self.trained:
722 s += "\n Node classifiers summaries:"
723 for i, (clfname, clf) in enumerate(self.clfs.iteritems()):
724 s += '\n + %d %s clf: %s' % \
725 (i, clfname, clf.summary().replace('\n', '\n |'))
726 return s
727
728
730 """Train TreeClassifier
731
732 First train .clf on groupped samples, then train each of .clfs
733 on a corresponding subset of samples.
734 """
735
736 clf, clfs, index2group = self.clf, self.clfs, self._index2group
737
738
739 groups = self._groups
740 labels_map = dataset.labels_map
741
742 if labels_map is None: labels_map = {}
743 groups_labels = {}
744 label2index = {}
745 known = set()
746 for gi, gk in enumerate(index2group):
747 ls = groups[gk][0]
748
749 ls_ = [labels_map.get(l, l) for l in ls]
750 known_already = known.intersection(ls_)
751 if len(known_already):
752 raise ValueError, "Grouping of labels is not appropriate. " \
753 "Got labels %s already among known in %s. " \
754 "Used labelsmap %s" % (known_already, known, labels_map)
755 groups_labels[gk] = ls_
756 for l in ls_:
757 label2index[l] = gi
758 known = known.union(ls_)
759
760
761
762
763
764 dsul = set(dataset.uniquelabels)
765 if known.intersection(dsul) != dsul:
766 raise ValueError, \
767 "Dataset %s had some labels not defined in groups: %s. " \
768 "Known are %s" % \
769 (dataset, dsul.difference(known), known)
770
771
772
773
774
775
776
777
778 ds_group = dataset.copy(deep=False)
779
780 ds_group.labels = [label2index[l] for l in dataset.labels]
781
782
783 if __debug__:
784 debug('CLFTREE', "Training primary %(clf)s on %(ds)s",
785 msgargs=dict(clf=clf, ds=ds_group))
786 clf.train(ds_group)
787
788
789
790
791
792
793
794
795
796
797
798 for gk in groups.iterkeys():
799 clf = clfs[gk]
800 group_labels = groups_labels[gk]
801 if clf is None:
802 if len(group_labels) != 1:
803 raise ValueError(
804 "Trailing nodes with no classifier assigned must have "
805 "only a single label associated. Got %s defined in "
806 "group %r of %s"
807 % (group_labels, gk, self))
808 else:
809
810 ids = dataset.idsbylabels(group_labels)
811 ds_group = dataset.selectSamples(ids)
812 if __debug__:
813 debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s",
814 msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group))
815
816 clf.train(ds_group)
817
818
826
827
829 """
830 """
831
832 clfs, index2group, groups = self.clfs, self._index2group, self._groups
833 clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data))
834
835 clf_predictions = clf_predictions.astype(int)
836
837
838 predictions = N.array([N.nan]*len(data))
839 for pred_group in set(clf_predictions):
840 gk = index2group[pred_group]
841 clf_ = clfs[gk]
842 group_indexes = (clf_predictions == pred_group)
843 if __debug__:
844 debug('CLFTREE', 'Predicting for group %s using %s on %d samples' %
845 (gk, clf_, N.sum(group_indexes)))
846 if clf_ is None:
847 predictions[group_indexes] = groups[gk][0]
848 else:
849 predictions[group_indexes] = clf_.predict(data[group_indexes])
850 return predictions
851
854 """`ProxyClassifier` which maps set of two labels into +1 and -1
855 """
856
857 - def __init__(self, clf, poslabels, neglabels, **kwargs):
858 """
859 :Parameters:
860 clf : Classifier
861 classifier to use
862 poslabels : list
863 list of labels which are treated as +1 category
864 neglabels : list
865 list of labels which are treated as -1 category
866 """
867
868 ProxyClassifier.__init__(self, clf, **kwargs)
869
870 self._regressionIsBogus()
871
872
873 sposlabels = set(poslabels)
874 sneglabels = set(neglabels)
875
876
877 overlap = sposlabels.intersection(sneglabels)
878 if len(overlap)>0:
879 raise ValueError("Sets of positive and negative labels for " +
880 "BinaryClassifier must not overlap. Got overlap " %
881 overlap)
882
883 self.__poslabels = list(sposlabels)
884 self.__neglabels = list(sneglabels)
885
886
887
888
889
890
891
892
893 if len(self.__poslabels) > 1:
894 self.__predictpos = self.__poslabels
895 else:
896 self.__predictpos = self.__poslabels[0]
897
898 if len(self.__neglabels) > 1:
899 self.__predictneg = self.__neglabels
900 else:
901 self.__predictneg = self.__neglabels[0]
902
903
905 prefix = "poslabels=%s, neglabels=%s" % (
906 repr(self.__poslabels), repr(self.__neglabels))
907 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
908
909
911 """Train `BinaryClassifier`
912 """
913 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
914 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
915
916
917 idlabels.sort()
918
919 orig_labels = None
920
921
922
923
924 if len(idlabels) == dataset.nsamples \
925 and [x[0] for x in idlabels] == range(dataset.nsamples):
926
927
928 datasetselected = dataset
929 orig_labels = dataset.labels
930 if __debug__:
931 debug('CLFBIN',
932 "Assigned all %d samples for binary " %
933 (dataset.nsamples) +
934 " classification among labels %s/+1 and %s/-1" %
935 (self.__poslabels, self.__neglabels))
936 else:
937 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
938 if __debug__:
939 debug('CLFBIN',
940 "Selected %d samples out of %d samples for binary " %
941 (len(idlabels), dataset.nsamples) +
942 " classification among labels %s/+1 and %s/-1" %
943 (self.__poslabels, self.__neglabels) +
944 ". Selected %s" % datasetselected)
945
946
947 datasetselected.labels = [ x[1] for x in idlabels ]
948
949
950 if __debug__:
951 assert((datasetselected.uniquelabels == [-1, 1]).all())
952
953 self.clf.train(datasetselected)
954
955 if not orig_labels is None:
956 dataset.labels = orig_labels
957
959 """Predict the labels for a given `data`
960
961 Predicts using binary classifier and spits out list (for each sample)
962 where with either poslabels or neglabels as the "label" for the sample.
963 If there was just a single label within pos or neg labels then it would
964 return not a list but just that single label.
965 """
966 binary_predictions = ProxyClassifier._predict(self, data)
967 self.values = binary_predictions
968 predictions = [ {-1: self.__predictneg,
969 +1: self.__predictpos}[x] for x in binary_predictions]
970 self.predictions = predictions
971 return predictions
972
976 """`CombinedClassifier` to perform multiclass using a list of
977 `BinaryClassifier`.
978
979 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which
980 is yet to think about)
981 """
982
983 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
984 """Initialize the instance
985
986 :Parameters:
987 clf : Classifier
988 classifier based on which multiple classifiers are created
989 for multiclass
990 bclf_type
991 "1-vs-1" or "1-vs-all", determines the way to generate binary
992 classifiers
993 """
994 CombinedClassifier.__init__(self, **kwargs)
995 self._regressionIsBogus()
996 if not clf is None:
997 clf._regressionIsBogus()
998
999 self.__clf = clf
1000 """Store sample instance of basic classifier"""
1001
1002
1003 if bclf_type == "1-vs-1":
1004 pass
1005 elif bclf_type == "1-vs-all":
1006 raise NotImplementedError
1007 else:
1008 raise ValueError, \
1009 "Unknown type of classifier %s for " % bclf_type + \
1010 "BoostedMulticlassClassifier"
1011 self.__bclf_type = bclf_type
1012
1013
1014
1016 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
1017 repr(self.__clf))
1018 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
1019
1020
1022 """Train classifier
1023 """
1024
1025 ulabels = dataset.uniquelabels
1026 if self.__bclf_type == "1-vs-1":
1027
1028 biclfs = []
1029 for i in xrange(len(ulabels)):
1030 for j in xrange(i+1, len(ulabels)):
1031 clf = self.__clf.clone()
1032 biclfs.append(
1033 BinaryClassifier(
1034 clf,
1035 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
1036 if __debug__:
1037 debug("CLFMC", "Created %d binary classifiers for %d labels" %
1038 (len(biclfs), len(ulabels)))
1039
1040 self.clfs = biclfs
1041
1042 elif self.__bclf_type == "1-vs-all":
1043 raise NotImplementedError
1044
1045
1046 CombinedClassifier._train(self, dataset)
1047
1051 """`BoostedClassifier` to work on splits of the data
1052
1053 """
1054
1055 """
1056 TODO: SplitClassifier and MulticlassClassifier have too much in
1057 common -- need to refactor: just need a splitter which would
1058 split dataset in pairs of class labels. MulticlassClassifier
1059 does just a tiny bit more which might be not necessary at
1060 all: map sets of labels into 2 categories...
1061 """
1062
1063
1064
1065 confusion = StateVariable(enabled=False,
1066 doc="Resultant confusion whenever classifier trained " +
1067 "on 1 part and tested on 2nd part of each split")
1068
1069 splits = StateVariable(enabled=False, doc=
1070 """Store the actual splits of the data. Can be memory expensive""")
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1085 """Initialize the instance
1086
1087 :Parameters:
1088 clf : Classifier
1089 classifier based on which multiple classifiers are created
1090 for multiclass
1091 splitter : Splitter
1092 `Splitter` to use to split the dataset prior training
1093 """
1094
1095 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
1096 self.__clf = clf
1097 """Store sample instance of basic classifier"""
1098
1099 if isinstance(splitter, type):
1100 raise ValueError, \
1101 "Please provide an instance of a splitter, not a type." \
1102 " Got %s" % splitter
1103
1104 self.__splitter = splitter
1105
1106
1108 """Train `SplitClassifier`
1109 """
1110
1111 bclfs = []
1112
1113
1114 states = self.states
1115
1116 clf_template = self.__clf
1117 if states.isEnabled('confusion'):
1118 states.confusion = clf_template._summaryClass()
1119 if states.isEnabled('training_confusion'):
1120 clf_template.states.enable(['training_confusion'])
1121 states.training_confusion = clf_template._summaryClass()
1122
1123 clf_hastestdataset = hasattr(clf_template, 'testdataset')
1124
1125
1126
1127 for split in self.__splitter.splitcfg(dataset):
1128 if __debug__:
1129 debug("CLFSPL_",
1130 "Deepcopying %(clf)s for %(sclf)s",
1131 msgargs={'clf':clf_template,
1132 'sclf':self})
1133 clf = clf_template.clone()
1134 bclfs.append(clf)
1135 self.clfs = bclfs
1136
1137 self.splits = []
1138
1139 for i, split in enumerate(self.__splitter(dataset)):
1140 if __debug__:
1141 debug("CLFSPL", "Training classifier for split %d" % (i))
1142
1143 if states.isEnabled("splits"):
1144 self.splits.append(split)
1145
1146 clf = self.clfs[i]
1147
1148
1149 if clf_hastestdataset:
1150 clf.testdataset = split[1]
1151
1152 clf.train(split[0])
1153
1154
1155 if clf_hastestdataset:
1156 clf.testdataset = None
1157
1158 if states.isEnabled("confusion"):
1159 predictions = clf.predict(split[1].samples)
1160 self.confusion.add(split[1].labels, predictions,
1161 clf.states.get('values', None))
1162 if __debug__:
1163 dact = debug.active
1164 if 'CLFSPL_' in dact:
1165 debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion))
1166 elif 'CLFSPL' in dact:
1167 debug('CLFSPL', 'Split %d error %.2f%%'
1168 % (i, self.confusion.summaries[-1].error))
1169
1170 if states.isEnabled("training_confusion"):
1171 states.training_confusion += \
1172 clf.states.training_confusion
1173
1174 try:
1175 if states.isEnabled("confusion"):
1176 states.confusion.labels_map = dataset.labels_map
1177 if states.isEnabled("training_confusion"):
1178 states.training_confusion.labels_map = dataset.labels_map
1179 except:
1180 pass
1181
1182
1183 @group_kwargs(prefixes=['slave_'], passthrough=True)
1196
1197 splitter = property(fget=lambda x:x.__splitter,
1198 doc="Splitter user by SplitClassifier")
1199
1202 """`ProxyClassifier` which uses some mapper prior training/testing.
1203
1204 `MaskMapper` can be used just a subset of features to
1205 train/classify.
1206 Having such classifier we can easily create a set of classifiers
1207 for BoostedClassifier, where each classifier operates on some set
1208 of features, e.g. set of best spheres from SearchLight, set of
1209 ROIs selected elsewhere. It would be different from simply
1210 applying whole mask over the dataset, since here initial decision
1211 is made by each classifier and then later on they vote for the
1212 final decision across the set of classifiers.
1213 """
1214
1215 - def __init__(self, clf, mapper, **kwargs):
1216 """Initialize the instance
1217
1218 :Parameters:
1219 clf : Classifier
1220 classifier based on which mask classifiers is created
1221 mapper
1222 whatever `Mapper` comes handy
1223 """
1224 ProxyClassifier.__init__(self, clf, **kwargs)
1225
1226 self.__mapper = mapper
1227 """mapper to help us our with prepping data to
1228 training/classification"""
1229
1230
1232 """Train `MappedClassifier`
1233 """
1234
1235
1236
1237 self.__mapper.train(dataset)
1238
1239
1240 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
1241 ProxyClassifier._train(self, wdataset)
1242
1243
1248
1249
1250 @group_kwargs(prefixes=['slave_'], passthrough=True)
1257
1258
1259 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1260
1264 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
1265
1266 `FeatureSelection` is used first to select features for the classifier to
1267 use for prediction. Internally it would rely on MappedClassifier which
1268 would use created MaskMapper.
1269
1270 TODO: think about removing overhead of retraining the same classifier if
1271 feature selection was carried out with the same classifier already. It
1272 has been addressed by adding .trained property to classifier, but now
1273 we should expclitely use isTrained here if we want... need to think more
1274 """
1275
1276 _clf_internals = [ 'does_feature_selection', 'meta' ]
1277
1278 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1279 """Initialize the instance
1280
1281 :Parameters:
1282 clf : Classifier
1283 classifier based on which mask classifiers is created
1284 feature_selection : FeatureSelection
1285 whatever `FeatureSelection` comes handy
1286 testdataset : Dataset
1287 optional dataset which would be given on call to feature_selection
1288 """
1289 ProxyClassifier.__init__(self, clf, **kwargs)
1290
1291 self.__maskclf = None
1292 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""
1293
1294 self.__feature_selection = feature_selection
1295 """`FeatureSelection` to select the features prior training"""
1296
1297 self.__testdataset = testdataset
1298 """`FeatureSelection` might like to use testdataset"""
1299
1300
1302 """Untrain `FeatureSelectionClassifier`
1303
1304 Has to untrain any known classifier
1305 """
1306 if self.__feature_selection is not None:
1307 self.__feature_selection.untrain()
1308 if not self.trained:
1309 return
1310 if not self.__maskclf is None:
1311 self.__maskclf.untrain()
1312 super(FeatureSelectionClassifier, self).untrain()
1313
1314
1316 """Train `FeatureSelectionClassifier`
1317 """
1318
1319 self.__feature_selection.states._changeTemporarily(
1320 enable_states=["selected_ids"])
1321
1322 if __debug__:
1323 debug("CLFFS", "Performing feature selection using %s" %
1324 self.__feature_selection + " on %s" % dataset)
1325
1326 (wdataset, tdataset) = self.__feature_selection(dataset,
1327 self.__testdataset)
1328 if __debug__:
1329 add_ = ""
1330 if "CLFFS_" in debug.active:
1331 add_ = " Selected features: %s" % \
1332 self.__feature_selection.selected_ids
1333 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
1334 "%(dsnfeat)d features.%(app)s",
1335 msgargs={'fs':self.__feature_selection,
1336 'nfeat':wdataset.nfeatures,
1337 'dsnfeat':dataset.nfeatures,
1338 'app':add_})
1339
1340
1341
1342 mappermask = N.zeros(dataset.nfeatures)
1343 mappermask[self.__feature_selection.selected_ids] = 1
1344 mapper = MaskMapper(mappermask)
1345
1346 self.__feature_selection.states._resetEnabledTemporarily()
1347
1348
1349 self.__maskclf = MappedClassifier(self.clf, mapper)
1350
1351
1352 self.__maskclf.clf.train(wdataset)
1353
1354
1355
1356
1357
1359 """Return used feature ids for `FeatureSelectionClassifier`
1360
1361 """
1362 return self.__feature_selection.selected_ids
1363
1365 """Predict using `FeatureSelectionClassifier`
1366 """
1367 clf = self.__maskclf
1368 if self.states.isEnabled('values'):
1369 clf.states.enable(['values'])
1370
1371 result = clf._predict(data)
1372
1373 self.states._copy_states_(clf, ['values'], deep=False)
1374 return result
1375
1377 """Set testing dataset to be used for feature selection
1378 """
1379 self.__testdataset = testdataset
1380
1381 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1382 feature_selection = property(lambda x:x.__feature_selection,
1383 doc="Used `FeatureSelection`")
1384
1385 @group_kwargs(prefixes=['slave_'], passthrough=True)
1395
1396
1397
1398 testdataset = property(fget=lambda x:x.__testdataset,
1399 fset=setTestDataset)
1400