1
2
3
4
5
6
7
8
9 """Recursive feature elimination."""
10
11 __docformat__ = 'restructuredtext'
12
13 from mvpa.clfs.transerror import ClassifierError
14 from mvpa.measures.base import Sensitivity
15 from mvpa.featsel.base import FeatureSelection
16 from mvpa.featsel.helpers import BestDetector, \
17 NBackHistoryStopCrit, \
18 FractionTailSelector
19 from numpy import arange
20 from mvpa.misc.state import StateVariable
21
22 if __debug__:
23 from mvpa.base import debug
24
25
26
27
28
29
30
31
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on some
    testdataset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.

    Such strategy after
      Guyon, I., Weston, J., Barnhill, S., & Vapnik, V. (2002). Gene
      selection for cancer classification using support vector
      machines. Mach. Learn., 46(1-3), 389--422.
    was applied to SVM-based analysis of fMRI data in
      Hanson, S. J. & Halchenko, Y. O. (2008). Brain reading using
      full brain support vector machines for object recognition:
      there is no "face identification area". Neural Computation, 20,
      486--503.
    """

    # transfer error recorded at each elimination step
    errors = StateVariable()
    # number of features remaining at each step
    nfeatures = StateVariable()
    # per original feature: last step at which it was still present
    history = StateVariable()
    # sensitivity map per step; disabled by default (can be memory-hungry)
    sensitivities = StateVariable(enabled=False)
63
74
75
        """Initialize recursive feature elimination

        :Parameters:
          sensitivity_analyzer : FeaturewiseDatasetMeasure object
          transfer_error : TransferError object
            used to compute the transfer error of a classifier based on a
            certain feature set on the test dataset.
            NOTE: If sensitivity analyzer is based on the same
            classifier as transfer_error is using, make sure you
            initialize transfer_error with train=False, otherwise
            it would train classifier twice without any necessity.
          feature_selector : Functor
            Given a sensitivity map it has to return the ids of those
            features that should be kept.
          bestdetector : Functor
            Given a list of error values it has to return a boolean that
            signals whether the latest error value is the total minimum.
          stopping_criterion : Functor
            Given a list of error values it has to return whether the
            criterion is fulfilled.
          train_clf : bool
            Flag whether the classifier in `transfer_error` should be
            trained before computing the error. In general this is
            required, but if the `sensitivity_analyzer` and
            `transfer_error` share and make use of the same classifier it
            can be switched off to save CPU cycles. Default `None` checks
            if sensitivity_analyzer is based on a classifier and doesn't
            train if so.
          update_sensitivity : bool
            If False the sensitivity map is only computed once and reused
            for each iteration. Otherwise the sensitivities are
            recomputed at each selection step.
        """

        # let the base class set up state variables and other machinery
        FeatureSelection.__init__(self, **kargs)

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer used to call at each step."""

        self.__transfer_error = transfer_error
        """Compute transfer error for each feature set."""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

        self.__stopping_criterion = stopping_criterion

        self.__bestdetector = bestdetector

        # default (None): train only if the sensitivity analyzer is NOT
        # classifier-based -- a classifier-based analyzer already trains it
        if train_clf is None:
            self.__train_clf = isinstance(sensitivity_analyzer,
                                          Sensitivity)
        else:
            self.__train_clf = train_clf
            """Flag whether training classifier is required."""

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        # Sanity check: with a frozen sensitivity map the classifier inside
        # transfer_error never gets (re)trained via the analyzer, so force
        # training to keep the reported errors meaningful.
        if not self.__update_sensitivity \
               and isinstance(self.__transfer_error, ClassifierError) \
               and not self.__train_clf:
            if __debug__:
                debug("RFEC", "Forcing training of classifier since " +
                      "sensitivities aren't updated at each step")
            self.__train_clf = True
145
146
147 - def __call__(self, dataset, testdataset):
148 """Proceed and select the features recursively eliminating less
149 important ones.
150
151 :Parameters:
152 dataset : Dataset
153 used to compute sensitivity maps and train a classifier
154 to determine the transfer error
155 testdataset : Dataset
156 used to test the trained classifer to determine the
157 transfer error
158
159 Returns a tuple of two new datasets with the feature subset of
160 `dataset` that had the lowest transfer error of all tested
161 sets until the stopping criterion was reached. The first
162 dataset is the feature subset of the training data and the
163 second the selection of the test dataset.
164 """
165 errors = []
166 """Computed error for each tested features set."""
167
168 self.nfeatures = []
169 """Number of features at each step. Since it is not used by the
170 algorithm it is stored directly in the state variable"""
171
172 self.history = arange(dataset.nfeatures)
173 """Store the last step # when the feature was still present
174 """
175
176 self.sensitivities = []
177
178 stop = False
179 """Flag when RFE should be stopped."""
180
181 results = None
182 """Will hold the best feature set ever."""
183
184 wdataset = dataset
185 """Operate on working dataset initially identical."""
186
187 wtestdataset = testdataset
188 """Same feature selection has to be performs on test dataset as well.
189 This will hold the current testdataset."""
190
191 step = 0
192 """Counter how many selection step where done."""
193
194 orig_feature_ids = arange(dataset.nfeatures)
195 """List of feature Ids as per original dataset remaining at any given
196 step"""
197
198 sensitivity = None
199 """Contains the latest sensitivity map."""
200
201 result_selected_ids = orig_feature_ids
202 """Resultant ids of selected features. Since the best is not
203 necessarily is the last - we better keep this one around. By
204 default -- all features are there"""
205 selected_ids = result_selected_ids
206
207 while wdataset.nfeatures > 0:
208
209 if __debug__:
210 debug('RFEC',
211 "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))
212
213
214
215
216 self.history[orig_feature_ids] = step
217
218
219 if self.__update_sensitivity or sensitivity == None:
220 sensitivity = self.__sensitivity_analyzer(wdataset)
221
222 if self.states.isEnabled("sensitivities"):
223 self.sensitivities.append(sensitivity)
224
225
226 if self.__train_clf:
227 error = self.__transfer_error(wtestdataset, wdataset)
228 else:
229 error = self.__transfer_error(wtestdataset, None)
230
231
232 errors.append(error)
233
234
235
236 stop = self.__stopping_criterion(errors)
237 isthebest = self.__bestdetector(errors)
238
239 nfeatures = wdataset.nfeatures
240
241 if self.states.isEnabled("nfeatures"):
242 self.nfeatures.append(wdataset.nfeatures)
243
244
245 if isthebest:
246 results = (wdataset, wtestdataset)
247 result_selected_ids = orig_feature_ids
248
249 if __debug__:
250 debug('RFEC',
251 "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d " %
252 (step, nfeatures, error, isthebest, stop))
253
254
255 if nfeatures == 1 or stop:
256 break
257
258
259 selected_ids = self.__feature_selector(sensitivity)
260
261 if __debug__:
262 debug('RFEC_',
263 "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s" %
264 (sensitivity, len(selected_ids), selected_ids))
265
266
267
268 wdataset = wdataset.selectFeatures(selected_ids)
269
270
271
272 if not self.__update_sensitivity:
273 sensitivity = sensitivity[selected_ids]
274
275
276
277
278
279
280
281
282 if not testdataset is None:
283 wtestdataset = wtestdataset.selectFeatures(selected_ids)
284
285 step += 1
286
287
288 selected_ids.sort()
289 if self.states.isEnabled("history") or self.states.isEnabled('selected_ids'):
290 orig_feature_ids = orig_feature_ids[selected_ids]
291
292
293 if hasattr(self.__transfer_error, "clf"):
294 self.__transfer_error.clf.untrain()
295
296 self.errors = errors
297 self.selected_ids = result_selected_ids
298
299
300 return results
301