1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA basic Classifiers"""
10
11 from mvpa.support.copy import deepcopy
12 from mvpa.base import externals
13
14 from mvpa.datasets import Dataset
15 from mvpa.mappers.mask import MaskMapper
16 from mvpa.datasets.splitters import NFoldSplitter, OddEvenSplitter
17
18 from mvpa.misc.exceptions import UnknownStateError
19
20 from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
21 from mvpa.clfs.meta import CombinedClassifier, \
22 BinaryClassifier, MulticlassClassifier, \
23 SplitClassifier, MappedClassifier, FeatureSelectionClassifier, \
24 TreeClassifier
25 from mvpa.clfs.transerror import TransferError
26 from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
27
28 from tests_warehouse import *
29 from tests_warehouse_clfs import *
30
31
32
33
# Exceptions a classifier is allowed to raise when handed degenerate input
# (e.g. a single class or constant features) instead of training normally.
_degenerate_allowed_exceptions = [DegenerateInputError, FailedToTrainError]
if externals.exists('rpy'):
    import rpy
    # R-backed classifiers may surface such failures as RPy errors instead
    _degenerate_allowed_exceptions += [rpy.RPyRException]
41
# setUp fragment (the `def` header is not visible in this chunk; indentation
# reconstructed): dummy classifiers and a tiny binary dataset shared below.
self.clf_sign = SameSignClassifier()
self.clf_less1 = Less1Classifier()

# 5 samples x 2 features; presumably label +1 marks pairs of features with
# the same sign and -1 otherwise -- TODO confirm against SameSignClassifier.
# chunks enable leave-one-chunk-out splitting in the split-classifier tests.
self.data_bin_1 = Dataset(
    samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
    labels=[1, 1, 1, -1, -1],
    chunks=[0, 1, 2, 2, 3])
79
80
82
83
# Boosted-classifier fragment (def header not in this chunk): two clones of
# the same dummy classifier combined must behave exactly like a single one.
bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
                                self.clf_sign.clone()])

self.failUnlessEqual(list(bclf.predict(self.data_bin_1.samples)),
                     list(self.data_bin_1.labels),
                     msg="Boosted classifier should work")
self.failUnlessEqual(bclf.predict(self.data_bin_1.samples),
                     self.clf_sign.predict(self.data_bin_1.samples),
                     msg="Boosted classifier should have the same as regular")
93
94
114
115
116
# BinaryClassifier fragment (def header not in this chunk): wrap a
# four-label problem into a positive/negative dichotomy.
ds = Dataset(samples=[ [0,0], [0,1], [1,100], [-1,0], [-1,-3], [ 0,-10] ],
             labels=[ 'sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
testdata = [ [0,0], [10,10], [-10, -1], [0.1, -0.1], [-0.2, 0.2] ]

clf = SameSignClassifier()

# 'sp'/'sn' form the positive class, 'dp'/'dn' the negative one
bclf1 = BinaryClassifier(clf=clf,
                         poslabels=['sp', 'sn'],
                         neglabels=['dp', 'dn'])

# copy labels so we can verify training leaves them untouched
orig_labels = ds.labels[:]
bclf1.train(ds)

# each prediction is the full label list of the winning side
self.failUnless(bclf1.predict(testdata) ==
    [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
     ['dn', 'dp'], ['dn', 'dp']])

self.failUnless((ds.labels == orig_labels).all(),
    msg="BinaryClassifier should not alter labels")
139
140
# NOTE(review): dangling decorator -- its decorated method is not part of
# this chunk of the file.
@sweepargs(clf=clfswh['binary'])

@sweepargs(clf=clfswh[:] + regrswh[:])
# testSummary fragment (def header not shown):
"""Basic testing of the clf summary
"""
# an untrained classifier must say so in its summary
summary1 = clf.summary()
self.failUnless('not yet trained' in summary1)
clf.train(datasets['uni2small'])
summary = clf.summary()

# a trained summary carries more information and drops the marker
self.failUnless(len(summary) > len(summary1))
self.failUnless(not 'not yet trained' in summary)
163
164
# NOTE(review): dangling decorator -- its decorated method is not part of
# this chunk of the file.
@sweepargs(clf=clfswh[:] + regrswh[:])

@sweepargs(clf=clfswh['!sg', '!plr', '!meta'])
# testSingleClass fragment (def header not shown):
"""Test if binary and multiclass can handle single class training/testing
"""
# restrict to a single label; NOTE(review): ds appears unused below
ds = datasets['uni2small']['labels', (0,)]
try:
    err = TransferError(clf)(
        datasets['uni2small_test']['labels', (0,)],
        datasets['uni2small_train']['labels', (0,)])
except Exception, e:
    self.fail(str(e))
# with only one class present every prediction must be correct
self.failUnless(err == 0.)
219
220
# testSplitClassifier fragment (def header not shown): SplitClassifier must
# reproduce CrossValidatedTransferError results exactly on the
# deterministic dummy problem.
ds = self.data_bin_1
clf = SplitClassifier(clf=SameSignClassifier(),
                      splitter=NFoldSplitter(1),
                      enable_states=['confusion', 'training_confusion',
                                     'feature_ids'])
clf.train(ds)
error = clf.confusion.error
tr_error = clf.training_confusion.error

# equivalent explicit cross-validation using a fresh clone
clf2 = clf.clone()
cv = CrossValidatedTransferError(
    TransferError(clf2),
    NFoldSplitter(),
    enable_states=['confusion', 'training_confusion'])
cverror = cv(ds)
tr_cverror = cv.training_confusion.error

self.failUnlessEqual(error, cverror,
    msg="We should get the same error using split classifier as"
        " using CrossValidatedTransferError. Got %s and %s"
        % (error, cverror))

self.failUnlessEqual(tr_error, tr_cverror,
    msg="We should get the same training error using split classifier as"
        " using CrossValidatedTransferError. Got %s and %s"
        % (tr_error, tr_cverror))

self.failUnlessEqual(clf.confusion.percentCorrect,
                     100,
                     msg="Dummy clf should train perfectly")
# one confusion entry and one inner classifier per chunk/split
self.failUnlessEqual(len(clf.confusion.sets),
                     len(ds.uniquechunks),
                     msg="Should have 1 confusion per each split")
self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
    msg="Should have number of classifiers equal # of epochs")
self.failUnlessEqual(clf.predict(ds.samples), list(ds.labels),
    msg="Should classify correctly")

# the trained SplitClassifier summary must at least not blow up
summary = clf.summary()
273
274
@sweepargs(clf_=clfswh['binary', '!meta'])
# testSplitClassifierExtended fragment (def header not shown): same
# comparison as above but on a real dataset, so errors only need to agree
# approximately.
clf2 = clf_.clone()
ds = datasets['uni2medium']
clf = SplitClassifier(clf=clf_,
                      splitter=NFoldSplitter(1),
                      enable_states=['confusion', 'feature_ids'])
clf.train(ds)
error = clf.confusion.error

cv = CrossValidatedTransferError(
    TransferError(clf2),
    NFoldSplitter(),
    enable_states=['confusion', 'training_confusion'])
cverror = cv(ds)

self.failUnless(abs(error-cverror)<0.01,
    msg="We should get the same error using split classifier as"
        " using CrossValidatedTransferError. Got %s and %s"
        % (error, cverror))

# generalization check is "labile" -- only run when configured to
if cfg.getboolean('tests', 'labile', default='yes'):
    self.failUnless(error < 0.25,
        msg="clf should generalize more or less fine. "
            "Got error %s" % error)
# NOTE(review): original indentation lost in this chunk -- the two checks
# below are placed at method level; confirm against the full source.
self.failUnlessEqual(len(clf.confusion.sets), len(ds.uniquechunks),
    msg="Should have 1 confusion per each split")
self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
    msg="Should have number of classifiers equal # of epochs")
304
305
306
307
308
326
327
# testMappedClassifier fragment (def header not shown): a MaskMapper in
# front of the classifier must restrict which features it sees.
samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
# NOTE(review): testdata3 appears unused in the visible part of the fragment
testdata3 = Dataset(samples=samples, labels=1)
# expected predictions when only features {0,1}, {0,2}, {1,2} are exposed
res110 = [1, 1, 1, -1, -1]
res101 = [-1, 1, -1, -1, 1]
res011 = [-1, 1, -1, 1, -1]

clf110 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,1,0])))
clf101 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,0,1])))
clf011 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([0,1,1])))

self.failUnlessEqual(clf110.predict(samples), res110)
self.failUnlessEqual(clf101.predict(samples), res101)
self.failUnlessEqual(clf011.predict(samples), res011)
342
343
# testFeatureSelectionClassifier fragment (def header not in this chunk):
# FeatureSelectionClassifier must apply the feature selection before
# delegating prediction to the wrapped classifier.
from test_rfe import SillySensitivityAnalyzer
from mvpa.featsel.base import \
     SensitivityBasedFeatureSelection
from mvpa.featsel.helpers import \
     FixedNElementTailSelector

# sensitivity analyzers: one ranking features forward, one reversed
sens_ana = SillySensitivityAnalyzer()

sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

# discard the single lowest-ranked feature
feat_sel = SensitivityBasedFeatureSelection(sens_ana,
    FixedNElementTailSelector(1, mode='discard'))

feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
    FixedNElementTailSelector(1))

samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])

testdata3 = Dataset(samples=samples, labels=1)

traindata = Dataset(samples=N.array([ [0, 0,-1], [1,0,1] ]), labels=[1,2])

# expected predictions when features {0,1} resp. {1,2} survive selection
res110 = [1, 1, 1, -1, -1]
res011 = [-1, 1, -1, 1, -1]

clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
    enable_states=['feature_ids'])

self.clf_sign.states._changeTemporarily(enable_states=['values'])
clf011.train(traindata)

self.failUnlessEqual(clf011.predict(testdata3.samples), res011)

self.failUnless(len(clf011.values) == len(res110),
    msg="We need to pass values into ProxyClassifier")
self.clf_sign.states._resetEnabledTemporarily()

# BUG FIX: this explanatory text used to sit on the following line as a
# bare (no-op) string statement; attach it as the assertion message.
self.failUnlessEqual(len(clf011.feature_ids), 2,
    msg="Feature selection classifier had to be trained on 2 features")

# with the reversed sensitivity the complementary feature pair survives
clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
clf011.train(traindata)
self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
393
422
423
"""Basic tests for TreeClassifier
"""
ds = datasets['uni4medium']

clfs = clfswh['binary', '!plr']

# shuffle so no fixed classifier always lands at a given tree node
clfs = [clfs[i] for i in N.random.permutation(len(clfs))]

# label 2 listed in two groups -> must be rejected at train time
tclf = TreeClassifier(clfs[0], {
    'L0+2' : (('L0', 'L2'), clfs[1]),
    'L2+3' : ((2, 3), clfs[2])})
self.failUnlessRaises(ValueError, tclf.train, ds)
"""Should raise exception since label 2 is in both"""

# label L1 missing from all groups -> must also be rejected
tclf = TreeClassifier(clfs[0], {
    'L0+5' : (('L0', 'L5'), clfs[1]),
    'L2+3' : ((2, 3), clfs[2])})
self.failUnlessRaises(ValueError, tclf.train, ds)
"""Should raise exception since no group for L1"""

# a valid grouping covering all four labels
tclf = TreeClassifier(clfs[0], {
    'L0+1' : (('L0', 1), clfs[1]),
    'L2+3' : ((2, 3), clfs[2])})

cv = CrossValidatedTransferError(
    TransferError(tclf),
    OddEvenSplitter(),
    enable_states=['confusion', 'training_confusion'])
cverror = cv(ds)
# NOTE(review): bare `except:` also swallows KeyboardInterrupt/SystemExit,
# and rtclf is never used afterwards -- the try exists only to prove repr()
# does not raise.
try:
    rtclf = repr(tclf)
except:
    self.fail(msg="Could not obtain repr for TreeClassifier")

# group classifiers must be the very instances we passed in
self.failUnless(tclf.clfs['L0+1'] is clfs[1])
self.failUnless(tclf.clfs['L2+3'] is clfs[2])

cvtrc = cv.training_confusion
cvtc = cv.confusion
if cfg.getboolean('tests', 'labile', default='yes'):
    # training and transfer confusions should differ on real data
    self.failUnless(cvtrc != cvtc)
    self.failUnless(cverror < 0.3,
        msg="Got too high error = %s using %s"
            % (cverror, tclf))

# a group mapped to None: its labels are decided by the root alone
tclf = TreeClassifier(clfs[0], {
    'L0' : ((0,), None),
    'L1+2+3' : ((1, 2, 3), clfswh['multiclass'][0])})
cv = CrossValidatedTransferError(
    TransferError(tclf),
    OddEvenSplitter(),
    enable_states=['confusion', 'training_confusion'])
cverror = cv(ds)
if cfg.getboolean('tests', 'labile', default='yes'):
    self.failUnless(cverror < 0.3,
        msg="Got too high error = %s using %s"
            % (cverror, tclf))
489
490
# NOTE(review): dangling decorator -- its decorated method is not part of
# this chunk of the file.
@sweepargs(clf=clfswh[:])

@sweepargs(clf=clfswh['linear', 'svm', 'libsvm', '!meta'])
# testMulticlassClassifier fragment (def header not shown): on a 2-class
# problem MulticlassClassifier must match the wrapped classifier exactly.
oldC = None
# A negative C presumably means "use default scaling"; pin it to 1.0 so
# both clones train identically, restore at the end -- TODO confirm
if clf.params.isKnown('C') and clf.C<0:
    oldC = clf.C
    clf.C = 1.0

svm, svm2 = clf, clf.clone()
svm2.states.enable(['training_confusion'])

mclf = MulticlassClassifier(clf=svm,
                            enable_states=['training_confusion'])

svm2.train(datasets['uni2small_train'])
mclf.train(datasets['uni2small_train'])
s1 = str(mclf.training_confusion)
s2 = str(svm2.training_confusion)
self.failUnlessEqual(s1, s2,
    msg="Multiclass clf should provide same results as built-in "
        "libsvm's %s. Got %s and %s" % (svm2, s1, s2))

svm2.untrain()

self.failUnless(svm2.trained == False,
    msg="Un-Trained SVM should be untrained")

# untraining the plain clone must not affect the boosted one
self.failUnless(N.array([x.trained for x in mclf.clfs]).all(),
    msg="Trained Boosted classifier should have all primary classifiers trained")
self.failUnless(mclf.trained,
    msg="Trained Boosted classifier should be marked as trained")

mclf.untrain()

self.failUnless(not mclf.trained,
    msg="UnTrained Boosted classifier should not be trained")
self.failUnless(not N.array([x.trained for x in mclf.clfs]).any(),
    msg="UnTrained Boosted classifier should have no primary classifiers trained")

# restore the original C so the sweep does not leak state between clfs
if oldC is not None:
    clf.C = oldC
552
553
# NOTE(review): dangling decorator -- its decorated method is not part of
# this chunk of the file.
@sweepargs(clf=clfswh['svm', '!meta'])

@sweepargs(clf=clfswh['retrainable'])
# testRetrainables fragment (def header not shown): set up a plain clone
# and a retrainable clone plus private copies of train/test data.
clf = clf.clone()
clf.states._changeTemporarily(enable_states = ['values'],
                              disable_states=['training_confusion'])
clf_re = clf.clone()
# mark the second clone retrainable so it may reuse previous training
clf_re._setRetrainable(True)

# parameters of the synthetic dataset family used by this test
# NOTE(review): dsargs appears unused in the visible part of the fragment
dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
          'nonbogus_features':[2,4], 'snr': 5.0}

# deepcopy so the in-place label/sample permutations performed later do
# not corrupt the shared warehouse datasets
dstrain = deepcopy(datasets['uni2large_train'])
dstest = deepcopy(datasets['uni2large_test'])

clf.untrain()
clf_re.untrain()
trerr, trerr_re = TransferError(clf), \
                  TransferError(clf_re, disable_states=['training_confusion'])

# baseline run without any retraining involved
err_1 = trerr(dstest, dstrain)
self.failUnless(err_1<0.3,
    msg="We should test here on easy dataset. Got error of %s" % err_1)
values_1 = clf.values[:]

# tolerances used by the comparisons in batch_test below
eps = 0.05
corrcoef_eps = 0.85
612
613
def batch_test(retrain=True, retest=True, closer=True):
    """Run the transfer once per classifier clone and verify retraining.

    Checks that the retrainable clone retrained/repredicted exactly when
    expected and that its predictions stay close (by correlation) to the
    plain clone's; with ``closer`` also require them at least as close as
    to the pre-change values.
    """
    # transfer errors for the plain and the retrainable classifier
    full_err = trerr(dstest, dstrain)
    re_err = trerr_re(dstest, dstrain)
    # correlation with the current plain predictions, and with the
    # predictions recorded before any parameter/data change
    corr = N.corrcoef(clf.values, clf_re.values)[0, 1]
    corr_old = N.corrcoef(values_1, clf_re.values)[0, 1]
    if __debug__:
        debug('TEST', "Retraining stats: errors %g %g corr %g "
              "with old error %g corr %g" %
              (full_err, re_err, corr, err_1, corr_old))
    if retrain:
        train_msg = "Must retrain instead of full training"
    else:
        train_msg = "Must fully train"
    self.failUnless(clf_re.states.retrained == retrain, train_msg)
    if retest:
        test_msg = "Must retest instead of full testing"
    else:
        test_msg = "Must fully test"
    self.failUnless(clf_re.states.repredicted == retest, test_msg)
    self.failUnless(corr > corrcoef_eps,
        msg="Result must be close to the one without retraining."
            " Got corrcoef=%s" % (corr))
    if closer:
        self.failUnless(corr >= corr_old,
            msg="Result must be closer to current without retraining"
                " than to old one. Got corrcoef=%s" % (corr_old))
636
637
# Exercise retraining/retesting under various kinds of change.
for i in xrange(3):
    # first pass trains/tests fully; identical repeat passes must be
    # recognized as retrain/retest opportunities
    flag = bool(i!=0)
    batch_test(retrain=flag, retest=flag, closer=False)

# changing a scalar parameter must force full retraining again
if 'C' in clf.params.names:
    clf.params.C *= 0.1
    clf_re.params.C *= 0.1
    batch_test()
elif 'sigma_noise' in clf.params.names:
    clf.params.sigma_noise *= 100
    clf_re.params.sigma_noise *= 100
    batch_test()
else:
    raise RuntimeError, \
          'Please implement testing while changing some of the ' \
          'params for clf %s' % clf

# change a kernel parameter if the classifier exposes any
if hasattr(clf, 'kernel_params') and len(clf.kernel_params.names):
    clf.kernel_params.gamma = 0.1
    clf_re.kernel_params.gamma = 0.1
    # a changed gamma invalidates cached predictions as well
    batch_test(retest=not('gamma' in clf.kernel_params.names))

# permuted training labels force retraining
oldlabels = dstrain.labels[:]
dstrain.permuteLabels(status=True, assure_permute=True)
self.failUnless((oldlabels != dstrain.labels).any(),
    msg="We should succeed at permutting -- now got the same labels")
batch_test()

# permuted testing labels
oldlabels = dstest.labels[:]
dstest.permuteLabels(status=True, assure_permute=True)
self.failUnless((oldlabels != dstest.labels).any(),
    msg="We should succeed at permutting -- now got the same labels")
batch_test()

# perturb the training samples in place (GPR is excluded; the reason is
# not visible in this chunk -- confirm against full source)
if not clf.__class__.__name__ in ['GPR']:
    oldsamples = dstrain.samples.copy()
    dstrain.samples[:] += dstrain.samples*0.05
    self.failUnless((oldsamples != dstrain.samples).any())
    batch_test(retest=False)
clf.states._resetEnabledTemporarily()

# explicit retrain()/repredict() API checks
clf_re.retrain(dstrain); self.failUnless(clf_re.states.retrained)
clf_re.retrain(dstrain, labels=True); self.failUnless(clf_re.states.retrained)
clf_re.retrain(dstrain, traindataset=True); self.failUnless(clf_re.states.retrained)

clf_re.repredict(dstest.samples);
self.failUnless(clf_re.states.repredicted)
# NOTE(review): unittest's failUnlessRaises forwards extra **kwargs to the
# callable, so msg= here is handed to repredict() rather than used as an
# assertion message -- confirm intended behavior against the full source.
self.failUnlessRaises(RuntimeError, clf_re.repredict,
    dstest.samples, labels=True,
    msg="for now retesting with anything changed makes no sense")
clf_re._setRetrainable(False)
703
704
"""Test all classifiers for conformant behavior
"""
# every warehouse classifier must leave the training dataset untouched
for clf_, traindata in \
        [(clfswh['binary'], datasets['dumb2']),
         (clfswh['multiclass'], datasets['dumb'])]:
    traindata_copy = deepcopy(traindata)
    for clf in clf_:
        clf.train(traindata)
        self.failUnless(
            (traindata.samples == traindata_copy.samples).all(),
            "Training of a classifier shouldn't change original dataset")

# NOTE(review): original indentation lost in this chunk -- these str/repr
# sanity checks most likely belong inside the loop over classifiers;
# confirm placement against the full source.
self.failUnless(str(clf) != "")
self.failUnless(repr(clf) != "")
725
726
727
728
729
730
@sweepargs(clf=clfswh['!smlr', '!knn', '!gnb', '!lars', '!meta', '!ridge'])
# testCorrectDimensionsOrder fragment (def header not shown):
"""To check if known/present Classifiers are working properly
with samples being first dimension. Started to worry about
possible problems while looking at sg where samples are 2nd
dimension
"""
# two trivially separable datasets with different feature counts
traindatas = [
    Dataset(samples=N.array([ [0, 0, 1.0],
                              [1, 0, 0] ]), labels=[0, 1]),
    Dataset(samples=N.array([ [0, 0.0],
                              [1, 1] ]), labels=[0, 1])]

clf.states._changeTemporarily(enable_states = ['training_confusion'])
for traindata in traindatas:
    clf.train(traindata)
    self.failUnlessEqual(clf.training_confusion.percentCorrect, 100.0,
        "Classifier %s must have 100%% correct learning on %s. Has %f" %
        (`clf`, traindata.samples, clf.training_confusion.percentCorrect))

    # every training sample must be predicted back to its own label
    for i in xrange(traindata.nsamples):
        sample = traindata.samples[i,:]
        predicted = clf.predict([sample])
        # NOTE(review): wrapping `predicted` (already a sequence) in
        # another list before comparing to a scalar label looks
        # suspicious -- confirm against the full source.
        self.failUnlessEqual([predicted], traindata.labels[i],
            "We must be able to predict sample %s using " % sample +
            "classifier %s" % `clf`)
clf.states._resetEnabledTemporarily()
762
765
766
if __name__ == '__main__':
    # delegate execution to the project's shared unittest runner module
    import runner
769