9 """Collection of classifiers to ease the exploration.
10 """
11
12 __docformat__ = 'restructuredtext'

import operator


from mvpa.clfs.meta import FeatureSelectionClassifier, SplitClassifier, \
     MulticlassClassifier
from mvpa.clfs.smlr import SMLR
from mvpa.clfs.knn import kNN
from mvpa.clfs.gnb import GNB
from mvpa.clfs.kernel import KernelLinear, KernelSquaredExponential


from mvpa.base import externals, cfg
from mvpa.measures.anova import OneWayAnova
from mvpa.misc.transformers import Absolute
from mvpa.clfs.smlr import SMLRWeights
from mvpa.featsel.helpers import FractionTailSelector, \
     FixedNElementTailSelector, RangeElementSelector

from mvpa.featsel.base import SensitivityBasedFeatureSelection

_KNOWN_INTERNALS = [ 'knn', 'binary', 'svm', 'linear',
    'smlr', 'does_feature_selection', 'has_sensitivity',
    'multiclass', 'non-linear', 'kernel-based', 'lars',
    'regression', 'libsvm', 'sg', 'meta', 'retrainable', 'gpr',
    'notrain2predict', 'ridge', 'blr', 'gnpp', 'enet', 'glmnet',
    'gnb', 'plr']
42 """Class to keep known instantiated classifiers
43
44 Should provide easy ways to select classifiers of needed kind:
45 clfswh['linear', 'svm'] should return all linear SVMs
46 clfswh['linear', 'multiclass'] should return all linear classifiers
47 capable of doing multiclass classification
48 """

    def __init__(self, known_tags=None, matches=None):
        """Initialize warehouse

        :Parameters:
          known_tags : list of basestring
            List of known tags
          matches : dict
            Optional dictionary of additional matches. E.g. since any
            regression can be used as a binary classifier,
            matches={'binary': ['regression']} would allow regressions
            to be returned whenever 'binary' classifiers are requested.
        """
        self._known_tags = set(known_tags)
        self.__items = []
        self.__keys = set()
        if matches is None:
            matches = {}
        self.__matches = matches

    def __getitem__(self, *args):
        if isinstance(args[0], tuple):
            args = args[0]

        # an empty slice (warehouse[:]) selects everything
        if args == (slice(None),):
            args = []

        # strip the optional '!' negation prefix before validating tags
        dargs = set([str(x).lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(dargs) > 0:
            raise ValueError, "Unknown internals %s requested. Known are %s" % \
                  (list(dargs), list(self._known_tags))

        result = []

        for item in self.__items:
            good = True
            for arg in args:
                # '!tag' rejects items carrying that tag
                if arg.startswith('!'):
                    if (arg[1:] in item._clf_internals):
                        good = False
                        break
                    else:
                        continue
                # otherwise require the tag itself or any of its matches
                found = False
                for arg in [arg] + self.__matches.get(arg, []):
                    if (arg in item._clf_internals):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        if operator.isSequenceType(item):
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '_clf_internals'):
                raise ValueError, "Cannot register %s " % item + \
                      "which has no _clf_internals defined"
            if len(item._clf_internals) == 0:
                raise ValueError, "Cannot register %s " % item + \
                      "which has empty _clf_internals"
            clf_internals = set(item._clf_internals)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError, 'Unknown clf internal(s) %s' % \
                      clf_internals.difference(self._known_tags)
        return self

    @property
    def internals(self):
        """Known internal tags of the classifiers
        """
        return self.__keys

    def listing(self):
        """Listing (description + internals) of registered items
        """
        return [(x.descr, x._clf_internals) for x in self.__items]

    @property
    def items(self):
        """Registered items
        """
        return self.__items

clfswh = Warehouse(known_tags=_KNOWN_INTERNALS)   # classifier warehouse
regrswh = Warehouse(known_tags=_KNOWN_INTERNALS)  # regression warehouse
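
# Both warehouses accept a single classifier or a (possibly nested) sequence
# of classifiers via '+=' (see Warehouse.__iadd__ above); each registered item
# must advertise its tags in _clf_internals.  Illustration only:
#
#   clfswh += SMLR(lm=1.0)                  # register one classifier
#   clfswh += [kNN(k=1), kNN(k=5)]          # ... or a whole list at once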

clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
            SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
            ]

clfswh += \
    [ MulticlassClassifier(clfswh['smlr'][0],
                           descr='Pairs+maxvote multiclass on ' + \
                           clfswh['smlr'][0].descr) ]

if externals.exists('libsvm'):
    from mvpa.clfs import libsvmc as libsvm
    clfswh._known_tags.update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    clfswh += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1),
               libsvm.SVM(
                   C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1),
               libsvm.SVM(
                   C=1.0, descr="libsvm.LinSVM(C=1)", probability=1),
               libsvm.SVM(svm_impl='NU_SVC',
                          descr="libsvm.LinNuSVM(nu=def)", probability=1)
               ]
    clfswh += [libsvm.SVM(kernel_type='RBF', descr="libsvm.RbfSVM()"),
               libsvm.SVM(kernel_type='RBF', svm_impl='NU_SVC',
                          descr="libsvm.RbfNuSVM(nu=def)"),
               libsvm.SVM(kernel_type='poly',
                          descr='libsvm.PolySVM()', probability=1),
               ]

    regrswh._known_tags.update(['EPSILON_SVR', 'NU_SVR'])
    regrswh += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR',
                           regression=True),
                libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR',
                           regression=True)]

if externals.exists('shogun'):
    from mvpa.clfs import sg
    clfswh._known_tags.update(sg.SVM._KNOWN_IMPLEMENTATIONS)

    # implementations that get skipped below
    bad_classifiers = [
        'mpd',
        'gpbt',
        'gmnp',
        'svrlight',
        'krr',
        ]
    if not externals.exists('sg_fixedcachesize'):
        bad_classifiers.append('gnpp')

    for impl in sg.SVM._KNOWN_IMPLEMENTATIONS:
        if impl in bad_classifiers:
            continue
        clfswh += [
            sg.SVM(
                descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl),
            ]
        clfswh += [
            sg.SVM(kernel_type='RBF',
                   descr="sg.RbfSVM()/%s" % impl, svm_impl=impl),
            ]

    _optional_regressions = []
    if externals.exists('shogun.krr'):
        _optional_regressions += ['krr']
    for impl in ['libsvr'] + _optional_regressions:
        regrswh._known_tags.update([impl])
        regrswh += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl,
                            regression=True),
                     ]

if len(clfswh['svm', 'linear']) > 0:
    # if any SVM implementation is available, import the default SVM classes
    # (e.g. LinearCSVMC, RbfCSVMC, used further below)
    from mvpa.clfs.svm import *


if externals.exists('lars'):
    import mvpa.clfs.lars as lars
    from mvpa.clfs.lars import LARS
    for model in lars.known_models:
        # classification version
        lars_clf = LARS(descr="LARS(%s)" % model, model_type=model)
        clfswh += lars_clf

        # regression version
        lars_regr = LARS(descr="_LARS(%s, regression=True)" % model,
                         regression=True, model_type=model)
        regrswh += lars_regr


if externals.exists('glmnet'):
    from mvpa.clfs.glmnet import GLMNET_C, GLMNET_R
    clfswh += GLMNET_C(descr="GLMNET_C()")
    regrswh += GLMNET_R(descr="GLMNET_R()")


clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C")),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")


clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \
    FeatureSelectionClassifier(
        GNB(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="GNB on 5%(ANOVA)")


# GPR
if externals.exists('scipy'):
    from mvpa.clfs.gpr import GPR

    clfswh += GPR(kernel=KernelLinear(), descr="GPR(kernel='linear')")
    clfswh += GPR(kernel=KernelSquaredExponential(),
                  descr="GPR(kernel='sqexp')")

    # BLR
    from mvpa.clfs.blr import BLR
    clfswh += BLR(descr="BLR()")

# PLR
from mvpa.clfs.plr import PLR
clfswh += PLR(descr="PLR()")
if externals.exists('scipy'):
    clfswh += PLR(reduced=0.05, descr="PLR(reduced=0.05)")


if len(clfswh['linear', 'svm']) > 0:
    # pick the linear SVM of the configured backend as the base classifier
    linearSVMC = clfswh['linear', 'svm',
                         cfg.get('svm', 'backend', default='libsvm').lower()
                         ][0]

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=0.1, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=0.1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            RbfCSVMC(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="RbfSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(SVM)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")

    # split-classifier wrapper around the default linear SVM
    rfesvm_split = SplitClassifier(linearSVMC)

    rfesvm = LinearCSVMC()
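
# A minimal smoke-test sketch: when this module is run as a script, list a few
# tag-based selections from the classifier warehouse.  The tag combinations
# below are only examples; selection itself is provided by
# Warehouse.__getitem__ above, with '!' excluding a tag.
if __name__ == '__main__':
    for tags in [('linear', 'svm'), ('smlr',), ('!meta',)]:
        print "%s:" % (tags,)
        for clf in clfswh[tags]:
            print "   ", clf.descr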