
Source Code for Module mvpa.tests.test_clf

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Unit tests for PyMVPA basic Classifiers"""

from mvpa.support.copy import deepcopy
from mvpa.base import externals

from mvpa.datasets import Dataset
from mvpa.mappers.mask import MaskMapper
from mvpa.datasets.splitters import NFoldSplitter, OddEvenSplitter

from mvpa.misc.exceptions import UnknownStateError

from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
from mvpa.clfs.meta import CombinedClassifier, \
     BinaryClassifier, MulticlassClassifier, \
     SplitClassifier, MappedClassifier, FeatureSelectionClassifier, \
     TreeClassifier
from mvpa.clfs.transerror import TransferError
from mvpa.algorithms.cvtranserror import CrossValidatedTransferError

from tests_warehouse import *
from tests_warehouse_clfs import *

# What exceptions to allow while testing degenerate cases.
# If it pukes -- it is ok -- user will notice that something
# is wrong
_degenerate_allowed_exceptions = [DegenerateInputError, FailedToTrainError]
if externals.exists('rpy'):
    import rpy
    _degenerate_allowed_exceptions += [rpy.RPyRException]
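
# These exception classes are consumed by ClassifiersTests.testDegenerateUsage
# below, where raising any of them while training on degenerate (all-zero)
# input is treated as an acceptable outcome rather than a test failure.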


class ClassifiersTests(unittest.TestCase):

    def setUp(self):
        self.clf_sign = SameSignClassifier()
        self.clf_less1 = Less1Classifier()

        # simple binary dataset
        self.data_bin_1 = Dataset(
            samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
            labels=[1, 1, 1, -1, -1],   # labels
            chunks=[0, 1, 2, 2, 3])     # chunks
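        # NB: labels of data_bin_1 encode whether the two features share the
        #     same sign (1) or not (-1) -- exactly the relation the dummy
        #     SameSignClassifier (provided by the tests_warehouse* helper
        #     modules imported above) is expected to recover.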

    def testDummy(self):
        clf = SameSignClassifier(enable_states=['training_confusion'])
        clf.train(self.data_bin_1)
        self.failUnlessRaises(UnknownStateError, clf.states.__getattribute__,
                              "predictions")
        """Should have no predictions after training. Predictions
        state should be explicitly disabled"""

        if not _all_states_enabled:
            self.failUnlessRaises(UnknownStateError, clf.states.__getattribute__,
                                  "trained_dataset")

        self.failUnlessEqual(clf.training_confusion.percentCorrect,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(clf.predict(self.data_bin_1.samples),
                             list(self.data_bin_1.labels))

        self.failUnlessEqual(len(clf.predictions), self.data_bin_1.nsamples,
            msg="Trained classifier stores predictions by default")

        clf = SameSignClassifier(enable_states=['trained_dataset'])
        clf.train(self.data_bin_1)
        self.failUnless((clf.trained_dataset.samples ==
                         self.data_bin_1.samples).all())
        self.failUnless((clf.trained_dataset.labels ==
                         self.data_bin_1.labels).all())


    def testBoosted(self):
        # XXXXXXX
        # silly test if we get the same result with boosted as with a single one
        bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
                                        self.clf_sign.clone()])

        self.failUnlessEqual(list(bclf.predict(self.data_bin_1.samples)),
                             list(self.data_bin_1.labels),
                             msg="Boosted classifier should work")
        self.failUnlessEqual(bclf.predict(self.data_bin_1.samples),
                             self.clf_sign.predict(self.data_bin_1.samples),
                             msg="Boosted classifier should give the same "
                                 "predictions as a regular one")


    def testBoostedStatePropagation(self):
        """Check that enable_states are propagated to slave classifiers"""
        bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
                                        self.clf_sign.clone()],
                                  enable_states=['feature_ids'])

        # check states enabling propagation
        self.failUnlessEqual(self.clf_sign.states.isEnabled('feature_ids'),
                             _all_states_enabled)
        self.failUnlessEqual(bclf.clfs[0].states.isEnabled('feature_ids'), True)

        bclf2 = CombinedClassifier(clfs=[self.clf_sign.clone(),
                                         self.clf_sign.clone()],
                                   propagate_states=False,
                                   enable_states=['feature_ids'])

        self.failUnlessEqual(self.clf_sign.states.isEnabled('feature_ids'),
                             _all_states_enabled)
        self.failUnlessEqual(bclf2.clfs[0].states.isEnabled('feature_ids'),
                             _all_states_enabled)



    def testBinaryDecorator(self):
        ds = Dataset(samples=[ [0,0], [0,1], [1,100], [-1,0], [-1,-3], [ 0,-10] ],
                     labels=[ 'sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
        testdata = [ [0,0], [10,10], [-10, -1], [0.1, -0.1], [-0.2, 0.2] ]
        # labels [s]ame/[d]ifferent (sign), and [p]ositive/[n]egative first element

        clf = SameSignClassifier()
        # lets create a classifier to discriminate only between same/different,
        # which is the primary task of SameSignClassifier
        bclf1 = BinaryClassifier(clf=clf,
                                 poslabels=['sp', 'sn'],
                                 neglabels=['dp', 'dn'])

        orig_labels = ds.labels[:]
        bclf1.train(ds)

        self.failUnless(bclf1.predict(testdata) ==
                        [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
                         ['dn', 'dp'], ['dn', 'dp']])

        self.failUnless((ds.labels == orig_labels).all(),
                        msg="BinaryClassifier should not alter labels")


    @sweepargs(clf=clfswh['binary'])
    def testClassifierGeneralization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cve = te(datasets['uni2medium'])
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(cve < 0.25,
                            msg="Got transfer error %g" % (cve))


    @sweepargs(clf=clfswh[:] + regrswh[:])
    def testSummary(self, clf):
        """Basic testing of the clf summary
        """
        summary1 = clf.summary()
        self.failUnless('not yet trained' in summary1)
        clf.train(datasets['uni2small'])
        summary = clf.summary()
        # It should get bigger ;)
        self.failUnless(len(summary) > len(summary1))
        self.failUnless(not 'not yet trained' in summary)


    @sweepargs(clf=clfswh[:] + regrswh[:])
    def testDegenerateUsage(self, clf):
        """Test how clf handles degenerate cases
        """
        # Whenever we have only 1 feature with only 0s in it
        ds1 = datasets['uni2small'][:, [0]]
        # XXX this very line breaks LARS in many other unittests --
        #     very interesting effect. but screw it -- for now it will be
        #     this way
        ds1.samples[:] = 0.0            # all 0s

        #ds2 = datasets['uni2small'][[0], :]
        #ds2.samples[:] = 0.0           # all 0s

        clf.states._changeTemporarily(
            enable_states=['values', 'training_confusion'])

        # Good pukes are good ;-)
        # TODO XXX add
        #  - ", ds2):" to test degenerate ds with 1 sample
        #  - ds1 but without 0s -- just 1 feature... feature selections
        #    might lead to 'surprises' due to magic in combiners etc
        for ds in (ds1, ):
            try:
                clf.train(ds)           # should not crash or stall
                # could we still get those?
                summary = clf.summary()
                cm = clf.states.training_confusion
                # If training/prediction succeeded (due to
                # training_confusion) without error -- results better be
                # at "chance"; NB the checks below are currently disabled
                # by this `continue`
                continue
                if 'ACC' in cm.stats:
                    self.failUnlessEqual(cm.stats['ACC'], 0.5)
                else:
                    self.failUnless(N.isnan(cm.stats['CCe']))
            except tuple(_degenerate_allowed_exceptions):
                pass
        clf.states._resetEnabledTemporarily()


    # TODO: sg - remove our limitations, meta -- also
    @sweepargs(clf=clfswh['!sg', '!plr', '!meta'])
    def test_single_class(self, clf):
        """Test if binary and multiclass can handle single class training/testing
        """
        ds = datasets['uni2small']['labels', (0,)]
        try:
            err = TransferError(clf)(
                datasets['uni2small_test']['labels', (0,)],
                datasets['uni2small_train']['labels', (0,)])
        except Exception, e:
            self.fail(str(e))
        self.failUnless(err == 0.)

    # TODO: validate for regressions as well!!!
    def testSplitClassifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                              splitter=NFoldSplitter(1),
                              enable_states=['confusion', 'training_confusion',
                                             'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.confusion.error
        tr_error = clf.training_confusion.error

        clf2 = clf.clone()
        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cverror = cv(ds)
        tr_cverror = cv.training_confusion.error

        self.failUnlessEqual(error, cverror,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (error, cverror))

        self.failUnlessEqual(tr_error, tr_cverror,
            msg="We should get the same training error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (tr_error, tr_cverror))

        self.failUnlessEqual(clf.confusion.percentCorrect,
                             100,
                             msg="Dummy clf should train perfectly")
        self.failUnlessEqual(len(clf.confusion.sets),
                             len(ds.uniquechunks),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
                             msg="Should have number of classifiers equal # of epochs")
        self.failUnlessEqual(clf.predict(ds.samples), list(ds.labels),
                             msg="Should classify correctly")

        # feature_ids must be a list of lists, and since no feature-selecting
        # classifier is used -- we expect all features to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        #self.failUnlessEqual(len(clf.feature_ids), len(ds.uniquechunks))
        #self.failUnless(N.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()


    @sweepargs(clf_=clfswh['binary', '!meta'])
    def testSplitClassifierExtended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2medium']     #self.data_bin_1
        clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                              splitter=NFoldSplitter(1),
                              enable_states=['confusion', 'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.confusion.error

        cv = CrossValidatedTransferError(
            TransferError(clf2),
            NFoldSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cverror = cv(ds)

        self.failUnless(abs(error-cverror)<0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(error < 0.25,
                            msg="clf should generalize more or less fine. "
                                "Got error %s" % error)
        self.failUnlessEqual(len(clf.confusion.sets), len(ds.uniquechunks),
                             msg="Should have 1 confusion per each split")
        self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
                             msg="Should have number of classifiers equal # of epochs")
        #self.failUnlessEqual(clf.predict(ds.samples), list(ds.labels),
        #                     msg="Should classify correctly")


    def testHarvesting(self):
        """Basic testing of harvesting based on SplitClassifier
        """
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                              splitter=NFoldSplitter(1),
                              enable_states=['confusion', 'training_confusion',
                                             'feature_ids'],
                              harvest_attribs=['clf.feature_ids',
                                               'clf.training_time'],
                              descr="DESCR")
        clf.train(ds)                   # train the beast
        # Number of harvested items should be equal to number of chunks
        self.failUnlessEqual(len(clf.harvested['clf.feature_ids']),
                             len(ds.uniquechunks))
        # if we can blame multiple inheritance and ClassWithCollections.__init__
        self.failUnlessEqual(clf.descr, "DESCR")

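    # Each MappedClassifier below wraps the same SameSignClassifier but only
    # sees the two features selected by its MaskMapper mask (e.g. [1,1,0]
    # keeps features 0 and 1), so res110/res101/res011 are the expected
    # decisions for the corresponding feature pairs.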
    def testMappedClassifier(self):
        samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
        testdata3 = Dataset(samples=samples, labels=1)
        res110 = [1, 1, 1, -1, -1]
        res101 = [-1, 1, -1, -1, 1]
        res011 = [-1, 1, -1, 1, -1]

        clf110 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,1,0])))
        clf101 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,0,1])))
        clf011 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([0,1,1])))

        self.failUnlessEqual(clf110.predict(samples), res110)
        self.failUnlessEqual(clf101.predict(samples), res101)
        self.failUnlessEqual(clf011.predict(samples), res011)


    def testFeatureSelectionClassifier(self):
        from test_rfe import SillySensitivityAnalyzer
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
            FixedNElementTailSelector(1))

        samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])

        testdata3 = Dataset(samples=samples, labels=1)
        # dummy train data so proper mapper gets created
        traindata = Dataset(samples=N.array([ [0, 0,-1], [1,0,1] ]), labels=[1,2])

        # targets
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                            enable_states=['feature_ids'])

        self.clf_sign.states._changeTemporarily(enable_states=['values'])
        clf011.train(traindata)

        self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.failUnless(len(clf011.values) == len(res110),
                        msg="We need to pass values into ProxyClassifier")
        self.clf_sign.states._resetEnabledTemporarily()

        self.failUnlessEqual(len(clf011.feature_ids), 2)
        "Feature selection classifier had to be trained on 2 features"

        # second classifier -- the last feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf011.train(traindata)
        self.failUnlessEqual(clf011.predict(testdata3.samples), res110)

    def testFeatureSelectionClassifierWithRegression(self):
        from test_rfe import SillySensitivityAnalyzer
        from mvpa.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selection
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        # now test with a regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = Dataset(samples=N.random.randn(4,10), labels=[-1,-1,1,1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        res = clf_reg.predict(dat.samples)
        self.failIf((N.array(clf_reg.values)-clf_reg.predictions).sum()==0,
                    msg="Values were set to the predictions in %s." %
                        sample_clf_reg)


    def testTreeClassifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4medium']
        # excluding PLR since that one can deal only with 0,1 labels ATM
        clfs = clfswh['binary', '!plr']         # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers
        clfs = [clfs[i] for i in N.random.permutation(len(clfs))]
        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2' : (('L0', 'L2'), clfs[1]),
            'L2+3' : ((2, 3), clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5' : (('L0', 'L5'), clfs[1]),
            'L2+3' : ((2, 3), clfs[2])})
        self.failUnlessRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1' : (('L0', 1), clfs[1]),
            'L2+3' : ((2, 3), clfs[2])})

        # Lets test train/test cycle using CVTE
        cv = CrossValidatedTransferError(
            TransferError(tclf),
            OddEvenSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cverror = cv(ds)
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.failUnless(tclf.clfs['L0+1'] is clfs[1])
        self.failUnless(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.training_confusion
        cvtc = cv.confusion
        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.failUnless(cvtrc != cvtc)
            self.failUnless(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                                % (cverror, tclf))

        # Test trailing nodes with no classifier
        tclf = TreeClassifier(clfs[0], {
            'L0' : ((0,), None),
            'L1+2+3' : ((1, 2, 3), clfswh['multiclass'][0])})
        cv = CrossValidatedTransferError(
            TransferError(tclf),
            OddEvenSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cverror = cv(ds)
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.failUnless(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                                % (cverror, tclf))


    @sweepargs(clf=clfswh[:])
    def testValues(self, clf):
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        ds = datasets['uni2small']
        clf.states._changeTemporarily(enable_states = ['values'])
        cv = CrossValidatedTransferError(
            TransferError(clf),
            OddEvenSplitter(),
            enable_states=['confusion', 'training_confusion'])
        cverror = cv(ds)
        #print clf.descr, clf.values[0]
        # basic test whether we get 1 set of values per each sample
        self.failUnlessEqual(len(clf.values), ds.nsamples/2)

        clf.states._resetEnabledTemporarily()

    @sweepargs(clf=clfswh['linear', 'svm', 'libsvm', '!meta'])
    def testMulticlassClassifier(self, clf):
        oldC = None
        # XXX somewhat ugly way to force non-dataspecific C value.
        #     Otherwise multiclass libsvm builtin and our MultiClass would differ
        #     in results
        if clf.params.isKnown('C') and clf.C<0:
            oldC = clf.C
            clf.C = 1.0                 # reset C to be 1

        svm, svm2 = clf, clf.clone()
        svm2.states.enable(['training_confusion'])

        mclf = MulticlassClassifier(clf=svm,
                                    enable_states=['training_confusion'])

        svm2.train(datasets['uni2small_train'])
        mclf.train(datasets['uni2small_train'])
        s1 = str(mclf.training_confusion)
        s2 = str(svm2.training_confusion)
        self.failUnlessEqual(s1, s2,
            msg="Multiclass clf should provide same results as built-in "
                "libsvm's %s. Got %s and %s" % (svm2, s1, s2))

        svm2.untrain()

        self.failUnless(svm2.trained == False,
                        msg="Un-Trained SVM should be untrained")

        self.failUnless(N.array([x.trained for x in mclf.clfs]).all(),
            msg="Trained Boosted classifier should have all primary classifiers trained")
        self.failUnless(mclf.trained,
            msg="Trained Boosted classifier should be marked as trained")

        mclf.untrain()

        self.failUnless(not mclf.trained,
                        msg="UnTrained Boosted classifier should not be trained")
        self.failUnless(not N.array([x.trained for x in mclf.clfs]).any(),
            msg="UnTrained Boosted classifier should have no primary classifiers trained")

        if oldC is not None:
            clf.C = oldC

    # XXX meta should also work but TODO
    @sweepargs(clf=clfswh['svm', '!meta'])
    def testSVMs(self, clf):
        knows_probabilities = 'probabilities' in clf.states.names and clf.params.probability
        enable_states = ['values']
        if knows_probabilities: enable_states += ['probabilities']

        clf.states._changeTemporarily(enable_states = enable_states)
        for traindata, testdata in [
            (datasets['uni2small_train'], datasets['uni2small_test']) ]:
            clf.train(traindata)
            predicts = clf.predict(testdata.samples)
            # values should be different from predictions for SVMs we have
            self.failUnless( (predicts != clf.values).any() )

            if knows_probabilities and clf.states.isSet('probabilities'):
                # XXX test more thoroughly what we are getting here ;-)
                self.failUnlessEqual( len(clf.probabilities), len(testdata.samples) )
        clf.states._resetEnabledTemporarily()


    @sweepargs(clf=clfswh['retrainable'])
    def testRetrainables(self, clf):
        # we need a copy since we will tune its internals later on
        clf = clf.clone()
        clf.states._changeTemporarily(enable_states = ['values'],
                                      # ensure that it does do predictions
                                      # while training
                                      disable_states=['training_confusion'])
        clf_re = clf.clone()
        # TODO: .retrainable must have a callback to call smth like
        # _setRetrainable
        clf_re._setRetrainable(True)

        # need to have high snr so we don't 'cope' with problematic
        # datasets since otherwise unittests would fail.
        dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
                  'nonbogus_features':[2,4], 'snr': 5.0}

        ## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # NB datasets will be changed by the end of testing, so if we
        #    are to switch to generic datasets -- make sure to copy
        #    them here
        dstrain = deepcopy(datasets['uni2large_train'])
        dstest = deepcopy(datasets['uni2large_test'])

        clf.untrain()
        clf_re.untrain()
        trerr, trerr_re = TransferError(clf), \
                          TransferError(clf_re, disable_states=['training_confusion'])

        # Just check for correctness of retraining
        err_1 = trerr(dstest, dstrain)
        self.failUnless(err_1<0.3,
            msg="We should test here on easy dataset. Got error of %s" % err_1)
        values_1 = clf.values[:]
        # sometimes retraining gets into deeper optimization ;-)
        eps = 0.05
        corrcoef_eps = 0.85             # just to get no failures... usually > 0.95


        def batch_test(retrain=True, retest=True, closer=True):
            err = trerr(dstest, dstrain)
            err_re = trerr_re(dstest, dstrain)
            corr = N.corrcoef(clf.values, clf_re.values)[0,1]
            corr_old = N.corrcoef(values_1, clf_re.values)[0,1]
            if __debug__:
                debug('TEST', "Retraining stats: errors %g %g corr %g "
                              "with old error %g corr %g" %
                              (err, err_re, corr, err_1, corr_old))
            self.failUnless(clf_re.states.retrained == retrain,
                            ("Must fully train",
                             "Must retrain instead of full training")[retrain])
            self.failUnless(clf_re.states.repredicted == retest,
                            ("Must fully test",
                             "Must retest instead of full testing")[retest])
            self.failUnless(corr > corrcoef_eps,
                msg="Result must be close to the one without retraining."
                    " Got corrcoef=%s" % (corr))
            if closer:
                self.failUnless(corr >= corr_old,
                    msg="Result must be closer to current without retraining"
                        " than to old one. Got corrcoef=%s" % (corr_old))

        # Check sequential retraining/retesting
        for i in xrange(3):
            flag = bool(i!=0)
            # ok - on 1st call we should train/test, then retrain/retest
            # and we can't compare for closeness to the old result since
            # we are working on the same data/classifier
            batch_test(retrain=flag, retest=flag, closer=False)

        # should retrain nicely if we change a parameter
        if 'C' in clf.params.names:
            clf.params.C *= 0.1
            clf_re.params.C *= 0.1
            batch_test()
        elif 'sigma_noise' in clf.params.names:
            clf.params.sigma_noise *= 100
            clf_re.params.sigma_noise *= 100
            batch_test()
        else:
            raise RuntimeError, \
                  'Please implement testing while changing some of the ' \
                  'params for clf %s' % clf

        # should retrain nicely if we change kernel parameter
        if hasattr(clf, 'kernel_params') and len(clf.kernel_params.names):
            clf.kernel_params.gamma = 0.1
            clf_re.kernel_params.gamma = 0.1
            # retest is false since kernel got recomputed thus
            # can't expect to use the same kernel
            batch_test(retest=not('gamma' in clf.kernel_params.names))

        # should retrain nicely if we change labels
        oldlabels = dstrain.labels[:]
        dstrain.permuteLabels(status=True, assure_permute=True)
        self.failUnless((oldlabels != dstrain.labels).any(),
            msg="We should succeed at permuting -- now got the same labels")
        batch_test()

        # Change labels in testing
        oldlabels = dstest.labels[:]
        dstest.permuteLabels(status=True, assure_permute=True)
        self.failUnless((oldlabels != dstest.labels).any(),
            msg="We should succeed at permuting -- now got the same labels")
        batch_test()

        # should re-train if we change data
        # reuse trained SVM and its 'final' optimization point
        if not clf.__class__.__name__ in ['GPR']:  # on GPR everything depends on the data ;-)
            oldsamples = dstrain.samples.copy()
            dstrain.samples[:] += dstrain.samples*0.05
            self.failUnless((oldsamples != dstrain.samples).any())
            batch_test(retest=False)
        clf.states._resetEnabledTemporarily()

        # test retrain()
        # TODO XXX  -- check validity
        clf_re.retrain(dstrain);                     self.failUnless(clf_re.states.retrained)
        clf_re.retrain(dstrain, labels=True);        self.failUnless(clf_re.states.retrained)
        clf_re.retrain(dstrain, traindataset=True);  self.failUnless(clf_re.states.retrained)

        # test repredict()
        clf_re.repredict(dstest.samples);
        self.failUnless(clf_re.states.repredicted)
        self.failUnlessRaises(RuntimeError, clf_re.repredict,
                              dstest.samples, labels=True,
            msg="for now retesting with anything changed makes no sense")
        clf_re._setRetrainable(False)


    def testGenericTests(self):
        """Test all classifiers for conformant behavior
        """
        for clf_, traindata in \
                [(clfswh['binary'], datasets['dumb2']),
                 (clfswh['multiclass'], datasets['dumb'])]:
            traindata_copy = deepcopy(traindata) # full copy of dataset
            for clf in clf_:
                clf.train(traindata)
                self.failUnless(
                    (traindata.samples == traindata_copy.samples).all(),
                    "Training of a classifier shouldn't change original dataset")

            # TODO: enforce uniform return from predict??
            #predicted = clf.predict(traindata.samples)
            #self.failUnless(isinstance(predicted, N.ndarray))

            # Just a simple check that str and repr work for all of them
            self.failUnless(str(clf) != "")
            self.failUnless(repr(clf) != "")

    # TODO: unify str and repr for all classifiers

    # XXX TODO: should work on smlr, knn, ridgereg, lars as well! but now
    #           they fail to train
    # GNB -- cannot train since 1 sample isn't sufficient to assess variance
    @sweepargs(clf=clfswh['!smlr', '!knn', '!gnb', '!lars', '!meta', '!ridge'])
    def testCorrectDimensionsOrder(self, clf):
        """To check if known/present Classifiers are working properly
        with samples being first dimension. Started to worry about
        possible problems while looking at sg where samples are 2nd
        dimension
        """
        # specially crafted dataset -- if dimensions are flipped over
        # the same storage, the problem becomes inseparable. Like in this case
        # an incorrect order of dimensions leads to equal samples [0, 1, 0]
        traindatas = [
            Dataset(samples=N.array([ [0, 0, 1.0],
                                      [1, 0, 0] ]), labels=[0, 1]),
            Dataset(samples=N.array([ [0, 0.0],
                                      [1, 1] ]), labels=[0, 1])]

        clf.states._changeTemporarily(enable_states = ['training_confusion'])
        for traindata in traindatas:
            clf.train(traindata)
            self.failUnlessEqual(clf.training_confusion.percentCorrect, 100.0,
                "Classifier %s must have 100%% correct learning on %s. Has %f" %
                (`clf`, traindata.samples, clf.training_confusion.percentCorrect))

            # and we must be able to predict every original sample thus
            for i in xrange(traindata.nsamples):
                sample = traindata.samples[i,:]
                predicted = clf.predict([sample])
                self.failUnlessEqual([predicted], traindata.labels[i],
                    "We must be able to predict sample %s using " % sample +
                    "classifier %s" % `clf`)
        clf.states._resetEnabledTemporarily()

def suite():
    return unittest.makeSuite(ClassifiersTests)


if __name__ == '__main__':
    import runner
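
# A minimal sketch of running this suite directly with the stock unittest
# machinery (assuming the mvpa package and its test helper modules are
# importable from the current environment):
#
#   import unittest
#   from mvpa.tests import test_clf
#   unittest.TextTestRunner(verbosity=2).run(test_clf.suite())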