| Home | Trees | Indices | Help |
|
|---|
|
|
# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
9 """Unit tests for PyMVPA SplittingSensitivityAnalyzer"""
10
11 from mvpa.base import externals
12 from mvpa.featsel.base import FeatureSelectionPipeline, \
13 SensitivityBasedFeatureSelection, CombinedFeatureSelection
14 from mvpa.clfs.transerror import TransferError
15 from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
16 from mvpa.featsel.helpers import FixedNElementTailSelector, \
17 FractionTailSelector, RangeElementSelector
18
19 from mvpa.featsel.rfe import RFE
20
21 from mvpa.clfs.meta import SplitClassifier, MulticlassClassifier, \
22 FeatureSelectionClassifier
23 from mvpa.clfs.smlr import SMLR, SMLRWeights
24 from mvpa.misc.transformers import Absolute
25 from mvpa.datasets.splitters import NFoldSplitter, NoneSplitter
26
27 from mvpa.misc.transformers import Absolute, FirstAxisMean, \
28 SecondAxisSumOfAbs, DistPValue
29
30 from mvpa.measures.base import SplitFeaturewiseDatasetMeasure
31 from mvpa.measures.anova import OneWayAnova, CompoundOneWayAnova
32 from mvpa.measures.irelief import IterativeRelief, IterativeReliefOnline, \
33 IterativeRelief_Devel, IterativeReliefOnline_Devel
34
35 from tests_warehouse import *
36 from tests_warehouse_clfs import *
37
# Featurewise dataset measures to sweep the generic sensitivity tests over.
# Order matters only for reproducibility of sweep output, not for the tests.
_MEASURES_2_SWEEP = [
    OneWayAnova(),
    CompoundOneWayAnova(combiner=SecondAxisSumOfAbs),
    IterativeRelief(),
    IterativeReliefOnline(),
    IterativeRelief_Devel(),
    IterativeReliefOnline_Devel(),
]

# Correlation-based measure needs scipy, so it is appended conditionally.
if externals.exists('scipy'):
    from mvpa.measures.corrcoef import CorrCoef
    _MEASURES_2_SWEEP += [
        CorrCoef(),
        # that one is good when small... handle later
        #CorrCoef(pvalue=True)
    ]
51
53 self.dataset = datasets['uni2large']
54
55
56 @sweepargs(dsm=_MEASURES_2_SWEEP)
58 data = datasets['dumbinv']
59
60 datass = data.samples.copy()
61
62 # compute scores
63 f = dsm(data)
64
65 # check if nothing evil is done to dataset
66 self.failUnless(N.all(data.samples == datass))
67 self.failUnless(f.shape == (4,))
68 self.failUnless(abs(f[1]) <= 1e-12, # some small value
69 msg="Failed test with value %g instead of != 0.0" % f[1])
70 self.failUnless(f[0] > 0.1) # some reasonably large value
71
72 # we should not have NaNs
73 self.failUnless(not N.any(N.isnan(f)))
74
75
76 # XXX meta should work too but doesn't
77 # XXX also look below -- lars with stepwise segfaults if all states are enabled,
78 # disabled for now -- do not have enough juice to debug lars code
79 @sweepargs(clf=clfswh['has_sensitivity'])
81 """Test analyzers in split classifier
82 """
83 # We need to skip some LARSes here
84 _sclf = str(clf)
85 if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
86 return
87
88 # assumming many defaults it is as simple as
89 mclf = SplitClassifier(clf=clf,
90 enable_states=['training_confusion',
91 'confusion'])
92 sana = mclf.getSensitivityAnalyzer(transformer=Absolute,
93 enable_states=["sensitivities"])
94
95 # Test access to transformers and combiners
96 self.failUnless(sana.transformer is Absolute)
97 self.failUnless(sana.combiner is FirstAxisMean)
98 # and lets look at all sensitivities
99
100 # and we get sensitivity analyzer which works on splits
101 map_ = sana(self.dataset)
102 self.failUnlessEqual(len(map_), self.dataset.nfeatures)
103
104 if cfg.getboolean('tests', 'labile', default='yes'):
105 for conf_matrix in [sana.clf.training_confusion] \
106 + sana.clf.confusion.matrices:
107 self.failUnless(
108 conf_matrix.percentCorrect>75,
109 msg="We must have trained on each one more or " \
110 "less correctly. Got %f%% correct on %d labels" %
111 (conf_matrix.percentCorrect,
112 len(self.dataset.uniquelabels)))
113
114 errors = [x.percentCorrect
115 for x in sana.clf.confusion.matrices]
116
117 # XXX
118 # That is too much to ask if the dataset is easy - thus
119 # disabled for now
120 #self.failUnless(N.min(errors) != N.max(errors),
121 # msg="Splits should have slightly but different " \
122 # "generalization")
123
124 # lets go through all sensitivities and see if we selected the right
125 # features
126 # XXX yoh: disabled checking of each map separately since in
127 # BoostedClassifierSensitivityAnalyzer and
128 # ProxyClassifierSensitivityAnalyzer
129 # we don't have yet way to provide transformers thus internal call
130 # to getSensitivityAnalyzer in _call of them is not parametrized
131 if 'meta' in clf._clf_internals and len(map_.nonzero()[0])<2:
132 # Some meta classifiers (5% of ANOVA) are too harsh ;-)
133 return
134 for map__ in [map_]: # + sana.combined_analyzer.sensitivities:
135 selected = FixedNElementTailSelector(
136 self.dataset.nfeatures -
137 len(self.dataset.nonbogus_features))(map__)
138 if cfg.getboolean('tests', 'labile', default='yes'):
139 self.failUnlessEqual(
140 list(selected),
141 list(self.dataset.nonbogus_features),
142 msg="At the end we should have selected the right features")
143
144
145 @sweepargs(clf=clfswh['has_sensitivity'])
147 """Test sensitivity of the mapped classifier
148 """
149 # Assuming many defaults it is as simple as
150 mclf = FeatureSelectionClassifier(
151 clf,
152 SensitivityBasedFeatureSelection(
153 OneWayAnova(),
154 FractionTailSelector(0.5, mode='select', tail='upper')),
155 enable_states=['training_confusion'])
156
157 sana = mclf.getSensitivityAnalyzer(transformer=Absolute,
158 enable_states=["sensitivities"])
159 # and lets look at all sensitivities
160
161 dataset = datasets['uni2medium']
162 # and we get sensitivity analyzer which works on splits
163 map_ = sana(dataset)
164 self.failUnlessEqual(len(map_), dataset.nfeatures)
165
166
167
168 @sweepargs(svm=clfswh['linear', 'svm'])
170 # assumming many defaults it is as simple as
171 sana = svm.getSensitivityAnalyzer(enable_states=["sensitivities"] )
172
173 # and lets look at all sensitivities
174 map_ = sana(self.dataset)
175 # for now we can do only linear SVM, so lets check if we raise
176 # a concern
177 svmnl = clfswh['non-linear', 'svm'][0]
178 self.failUnlessRaises(NotImplementedError,
179 svmnl.getSensitivityAnalyzer)
180
181
182 @sweepargs(svm=clfswh['linear', 'svm'])
184 # assumming many defaults it is as simple as
185 sana = svm.getSensitivityAnalyzer(enable_states=["sensitivities"] )
186
187 # and lets look at all sensitivities
188 map_ = sana(self.dataset)
189 # for now we can do only linear SVM, so lets check if we raise
190 # a concern
191 svmnl = clfswh['non-linear', 'svm'][0]
192 self.failUnlessRaises(NotImplementedError,
193 svmnl.getSensitivityAnalyzer)
194
195 # XXX doesn't work easily with meta since it would need
196 # to be explicitely passed to the slave classifier's
197 # getSengetSensitivityAnalyzer
198 @sweepargs(svm=clfswh['linear', 'svm', 'libsvm', '!sg', '!meta'])
200 # assumming many defaults it is as simple as
201 kwargs = dict(combiner=None, transformer=None,
202 enable_states=["sensitivities"])
203 sana_split = svm.getSensitivityAnalyzer(
204 split_weights=True, **kwargs)
205 sana_full = svm.getSensitivityAnalyzer(
206 force_training=False, **kwargs)
207
208 # and lets look at all sensitivities
209 ds2 = datasets['uni4large'].copy()
210 ds2.zscore(baselinelabels = [2, 3])
211 ds2 = ds2['labels', [0,1]]
212
213 map_split = sana_split(ds2)
214 map_full = sana_full(ds2)
215
216 self.failUnlessEqual(map_split.shape, (ds2.nfeatures, 2))
217 self.failUnlessEqual(map_full.shape, (ds2.nfeatures, ))
218
219 # just to verify that we split properly and if we reconstruct
220 # manually we obtain the same
221 dmap = (-1*map_split[:, 1] + map_split[:, 0]) - map_full
222 self.failUnless((N.abs(dmap) <= 1e-10).all())
223 #print "____"
224 #print map_split
225 #print SMLR().getSensitivityAnalyzer(combiner=None)(ds2)
226
227 # for now we can do split weights for binary tasks only, so
228 # lets check if we raise a concern
229 self.failUnlessRaises(NotImplementedError,
230 sana_split, datasets['uni3medium'])
231
232
234 ds = datasets['uni3small']
235 sana = SplitFeaturewiseDatasetMeasure(
236 analyzer=SMLR(
237 fit_all_weights=True).getSensitivityAnalyzer(combiner=None),
238 splitter=NFoldSplitter(),
239 combiner=None)
240
241 sens = sana(ds)
242
243 self.failUnless(sens.shape == (
244 len(ds.uniquechunks), ds.nfeatures, len(ds.uniquelabels)))
245
246
247 # Lets try more complex example with 'boosting'
248 ds = datasets['uni3medium']
249 sana = SplitFeaturewiseDatasetMeasure(
250 analyzer=SMLR(
251 fit_all_weights=True).getSensitivityAnalyzer(combiner=None),
252 splitter=NoneSplitter(nperlabel=0.25, mode='first',
253 nrunspersplit=2),
254 combiner=None,
255 enable_states=['splits', 'sensitivities'])
256 sens = sana(ds)
257
258 self.failUnless(sens.shape == (2, ds.nfeatures, 3))
259 splits = sana.splits
260 self.failUnlessEqual(len(splits), 2)
261 self.failUnless(N.all([s[0].nsamples == ds.nsamples/4 for s in splits]))
262 # should have used different samples
263 self.failUnless(N.any([splits[0][0].origids != splits[1][0].origids]))
264 # and should have got different sensitivities
265 self.failUnless(N.any(sens[0] != sens[1]))
266
267
268 if not externals.exists('scipy'):
269 return
270 # Most evil example
271 ds = datasets['uni2medium']
272 plain_sana = SVM().getSensitivityAnalyzer(
273 combiner=None, transformer=DistPValue())
274 boosted_sana = SplitFeaturewiseDatasetMeasure(
275 analyzer=SVM().getSensitivityAnalyzer(
276 combiner=None, transformer=DistPValue(fpp=0.05)),
277 splitter=NoneSplitter(nperlabel=0.8, mode='first', nrunspersplit=2),
278 combiner=FirstAxisMean,
279 enable_states=['splits', 'sensitivities'])
280 # lets create feature selector
281 fsel = RangeElementSelector(upper=0.1, lower=0.9, inclusive=True)
282
283 sanas = dict(plain=plain_sana, boosted=boosted_sana)
284 for k,sana in sanas.iteritems():
285 clf = FeatureSelectionClassifier(SVM(),
286 SensitivityBasedFeatureSelection(sana, fsel),
287 descr='SVM on p=0.2(both tails) using %s' % k)
288 ce = CrossValidatedTransferError(TransferError(clf),
289 NFoldSplitter())
290 error = ce(ds)
291
292 sens = boosted_sana(ds)
293 sens_plain = plain_sana(ds)
294
295 # TODO: make a really unittest out of it -- not just runtime
296 # bugs catcher
297
298 # TODO -- unittests for sensitivity analyzers which use combiners
299 # (linsvmweights for multi-class SVMs and smlrweights for SMLR)
300
301
302 @sweepargs(basic_clf=clfswh['has_sensitivity'])
304 #basic_clf = LinearNuSVMC()
305 multi_clf = MulticlassClassifier(clf=basic_clf)
306 #svm_weigths = LinearSVMWeights(svm)
307
308 # Proper RFE: aggregate sensitivities across multiple splits,
309 # but also due to multi class those need to be aggregated
310 # somehow. Transfer error here should be 'leave-1-out' error
311 # of split classifier itself
312 sclf = SplitClassifier(clf=basic_clf)
313 rfe = RFE(sensitivity_analyzer=
314 sclf.getSensitivityAnalyzer(
315 enable_states=["sensitivities"]),
316 transfer_error=trans_error,
317 feature_selector=FeatureSelectionPipeline(
318 [FractionTailSelector(0.5),
319 FixedNElementTailSelector(1)]),
320 train_clf=True)
321
322 # and we get sensitivity analyzer which works on splits and uses
323 # sensitivity
324 selected_features = rfe(self.dataset)
325
327 # two methods: 5% highes F-scores, non-zero SMLR weights
328 fss = [SensitivityBasedFeatureSelection(
329 OneWayAnova(),
330 FractionTailSelector(0.05, mode='select', tail='upper')),
331 SensitivityBasedFeatureSelection(
332 SMLRWeights(SMLR(lm=1, implementation="C")),
333 RangeElementSelector(mode='select'))]
334
335 fs = CombinedFeatureSelection(fss, combiner='union',
336 enable_states=['selected_ids',
337 'selections_ids'])
338
339 od, otd = fs(self.dataset)
340
341 self.failUnless(fs.combiner == 'union')
342 self.failUnless(len(fs.selections_ids))
343 self.failUnless(len(fs.selections_ids) <= self.dataset.nfeatures)
344 # should store one set per methods
345 self.failUnless(len(fs.selections_ids) == len(fss))
346 # no individual can be larger than union
347 for s in fs.selections_ids:
348 self.failUnless(len(s) <= len(fs.selected_ids))
349 # check output dataset
350 self.failUnless(od.nfeatures == len(fs.selected_ids))
351 for i, id in enumerate(fs.selected_ids):
352 self.failUnless((od.samples[:,i]
353 == self.dataset.samples[:,id]).all())
354
355 # again for intersection
356 fs = CombinedFeatureSelection(fss, combiner='intersection',
357 enable_states=['selected_ids',
358 'selections_ids'])
359 # simply run it for now -- can't think of additional tests
360 od, otd = fs(self.dataset)
361
365 return unittest.makeSuite(SensitivityAnalysersTests)
366
367
if __name__ == '__main__':
    # Delegate execution to PyMVPA's shared unittest runner module,
    # which discovers and runs the suite() defined above on import.
    import runner
370
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Apr 23 23:09:47 2012 | http://epydoc.sourceforge.net |