1
2
3
4
5
6
7
8
9 """Recursive feature elimination."""
10
11 __docformat__ = 'restructuredtext'
12
13 from mvpa.clfs.transerror import ClassifierError
14 from mvpa.measures.base import Sensitivity
15 from mvpa.featsel.base import FeatureSelection
16 from mvpa.featsel.helpers import BestDetector, \
17 NBackHistoryStopCrit, \
18 FractionTailSelector
19 from numpy import arange
20 from mvpa.misc.state import StateVariable
21
22 if __debug__:
23 from mvpa.base import debug
24
25
26
27
28
29
30
31
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on some
    testdataset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.

    Such strategy after
      Guyon, I., Weston, J., Barnhill, S., & Vapnik, V. (2002). Gene
      selection for cancer classification using support vector
      machines. Mach. Learn., 46(1-3), 389--422.
    was applied to SVM-based analysis of fMRI data in
      Hanson, S. J. & Halchenko, Y. O. (2008). Brain reading using
      full brain support vector machines for object recognition:
      there is no "face identification area". Neural Computation, 20,
      486--503.
    """

    # transfer error recorded at each elimination step
    errors = StateVariable()
    # number of features remaining at each step
    nfeatures = StateVariable()
    # per original feature: last step at which it was still present
    history = StateVariable()
    # sensitivity map per step; disabled by default (can be memory-hungry)
    sensitivities = StateVariable(enabled=False)
63
74
75
        """Initialize recursive feature elimination

        :Parameters:
          sensitivity_analyzer : FeaturewiseDatasetMeasure object
          transfer_error : TransferError object
            used to compute the transfer error of a classifier based on a
            certain feature set on the test dataset.
            NOTE: If sensitivity analyzer is based on the same
            classifier as transfer_error is using, make sure you
            initialize transfer_error with train=False, otherwise
            it would train classifier twice without any necessity.
          feature_selector : Functor
            Given a sensitivity map it has to return the ids of those
            features that should be kept.
          bestdetector : Functor
            Given a list of error values it has to return a boolean that
            signals whether the latest error value is the total minimum.
          stopping_criterion : Functor
            Given a list of error values it has to return whether the
            criterion is fulfilled.
          train_clf : bool
            Flag whether the classifier in `transfer_error` should be
            trained before computing the error. In general this is
            required, but if the `sensitivity_analyzer` and
            `transfer_error` share and make use of the same classifier it
            can be switched off to save CPU cycles. Default `None` checks
            if sensitivity_analyzer is based on a classifier and doesn't
            train if so.
          update_sensitivity : bool
            If False the sensitivity map is only computed once and reused
            for each iteration. Otherwise the sensitivities are
            recomputed at each selection step.
        """

        # let the base class set up state variables and other machinery
        FeatureSelection.__init__(self, **kargs)

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer used to call at each step."""

        self.__transfer_error = transfer_error
        """Compute transfer error for each feature set."""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

        self.__stopping_criterion = stopping_criterion

        self.__bestdetector = bestdetector

        # default (None): train only if the sensitivity analyzer is NOT
        # classifier-based -- a classifier-based analyzer already trains it
        if train_clf is None:
            self.__train_clf = isinstance(sensitivity_analyzer,
                                          Sensitivity)
        else:
            self.__train_clf = train_clf
            """Flag whether training classifier is required."""

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        # Sanity check: with a frozen sensitivity map the classifier inside
        # transfer_error never gets (re)trained via the analyzer, so force
        # training to keep the reported errors meaningful.
        if not self.__update_sensitivity \
               and isinstance(self.__transfer_error, ClassifierError) \
               and not self.__train_clf:
            if __debug__:
                debug("RFEC", "Forcing training of classifier since " +
                      "sensitivities aren't updated at each step")
            self.__train_clf = True
145
146
147 - def __call__(self, dataset, testdataset):
148 """Proceed and select the features recursively eliminating less
149 important ones.
150
151 :Parameters:
152 dataset : Dataset
153 used to compute sensitivity maps and train a classifier
154 to determine the transfer error
155 testdataset : Dataset
156 used to test the trained classifer to determine the
157 transfer error
158
159 Returns a tuple of two new datasets with the feature subset of
160 `dataset` that had the lowest transfer error of all tested
161 sets until the stopping criterion was reached. The first
162 dataset is the feature subset of the training data and the
163 second the selection of the test dataset.
164 """
165 errors = []
166 """Computed error for each tested features set."""
167
168 self.nfeatures = []
169 """Number of features at each step. Since it is not used by the
170 algorithm it is stored directly in the state variable"""
171
172 self.history = arange(dataset.nfeatures)
173 """Store the last step # when the feature was still present
174 """
175
176 self.sensitivities = []
177
178 stop = False
179 """Flag when RFE should be stopped."""
180
181 results = None
182 """Will hold the best feature set ever."""
183
184 wdataset = dataset
185 """Operate on working dataset initially identical."""
186
187 wtestdataset = testdataset
188 """Same feature selection has to be performs on test dataset as well.
189 This will hold the current testdataset."""
190
191 step = 0
192 """Counter how many selection step where done."""
193
194 orig_feature_ids = arange(dataset.nfeatures)
195 """List of feature Ids as per original dataset remaining at any given
196 step"""
197
198 sensitivity = None
199 """Contains the latest sensitivity map."""
200
201 result_selected_ids = orig_feature_ids
202 """Resultant ids of selected features. Since the best is not
203 necessarily is the last - we better keep this one around. By
204 default -- all features are there"""
205 selected_ids = result_selected_ids
206
207 while wdataset.nfeatures > 0:
208
209 if __debug__:
210 debug('RFEC',
211 "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))
212
213
214
215
216 self.history[orig_feature_ids] = step
217
218
219 if self.__update_sensitivity or sensitivity == None:
220 sensitivity = self.__sensitivity_analyzer(wdataset)
221
222 if self.states.isEnabled("sensitivities"):
223 self.sensitivities.append(sensitivity)
224
225
226 if self.__train_clf:
227 error = self.__transfer_error(wtestdataset, wdataset)
228 else:
229 error = self.__transfer_error(wtestdataset, None)
230
231
232 errors.append(error)
233
234
235
236 stop = self.__stopping_criterion(errors)
237 isthebest = self.__bestdetector(errors)
238
239 nfeatures = wdataset.nfeatures
240
241 if self.states.isEnabled("nfeatures"):
242 self.nfeatures.append(wdataset.nfeatures)
243
244
245 if isthebest:
246 results = (wdataset, wtestdataset)
247 result_selected_ids = orig_feature_ids
248
249 if __debug__:
250 debug('RFEC',
251 "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d " %
252 (step, nfeatures, error, isthebest, stop))
253
254
255 if nfeatures == 1 or stop:
256 break
257
258
259 selected_ids = self.__feature_selector(sensitivity)
260
261 if __debug__:
262 debug('RFEC_',
263 "Sensitivity: %s, nfeatures_selected=%d, selected_ids: %s" %
264 (sensitivity, len(selected_ids), selected_ids))
265
266
267
268 wdataset = wdataset.selectFeatures(selected_ids)
269
270
271
272 if not self.__update_sensitivity:
273 sensitivity = sensitivity[selected_ids]
274
275
276
277
278
279
280
281
282 if not testdataset is None:
283 wtestdataset = wtestdataset.selectFeatures(selected_ids)
284
285 step += 1
286
287
288 selected_ids.sort()
289 if self.states.isEnabled("history") or self.states.isEnabled('selected_ids'):
290 orig_feature_ids = orig_feature_ids[selected_ids]
291
292
293 if hasattr(self.__transfer_error, "clf"):
294 self.__transfer_error.clf.untrain()
295
296 self.errors = errors
297 self.selected_ids = result_selected_ids
298
299
300 return results
301