Package mvpa :: Package featsel :: Module base
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.featsel.base

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Feature selection base class and related stuff base classes and helpers.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  from mvpa.featsel.helpers import FractionTailSelector 
 14  from mvpa.misc.state import StateVariable, ClassWithCollections 
 15   
 16  if __debug__: 
 17      from mvpa.base import debug 
 18   
class FeatureSelection(ClassWithCollections):
    """Abstract functor interface for selecting features from datasets.

    Concrete subclasses implement :meth:`__call__` to pick a feature
    subset from a training dataset (and, optionally, from a matching
    test dataset).
    """

    # ids of the features chosen by the most recent invocation
    # (disabled by default -- subclasses fill it in __call__)
    selected_ids = StateVariable(enabled=False)

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the collections base class."""
        ClassWithCollections.__init__(self, **kwargs)

    def __call__(self, dataset, testdataset=None):
        """Perform the feature selection.

        :Parameters:
          dataset : Dataset
            dataset used to select features
          testdataset : Dataset
            dataset that might be used to compute a stopping criterion

        Returns a tuple with the dataset containing the selected features.
        If present, the tuple also contains the selected features of the
        test dataset. Derived classes must provide an interface to access
        other information relevant to the feature selection process
        (e.g. mask, elimination step (in RFE), etc.)
        """
        raise NotImplementedError

    def untrain(self):
        """'Untrain' the feature selection.

        Required for full 'untraining' of classifiers. A no-op here;
        subclasses override it to forward untraining to their
        sensitivity analyzers.
        """
        pass
59 60
class SensitivityBasedFeatureSelection(FeatureSelection):
    """Feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features.
    """

    # sensitivity map computed during the last __call__ (disabled by default)
    sensitivity = StateVariable(enabled=False)

    def __init__(self,
                 sensitivity_analyzer,
                 # NOTE: this default instance is created once at class
                 # definition and shared by all instances that rely on it
                 feature_selector=FractionTailSelector(0.05),
                 **kwargs
                 ):
        """Initialize feature selection

        :Parameters:
          sensitivity_analyzer : FeaturewiseDatasetMeasure
            sensitivity analyzer to come up with sensitivity
          feature_selector : Functor
            Given a sensitivity map it has to return the ids of those
            features that should be kept.
        """
        # base init first
        FeatureSelection.__init__(self, **kwargs)

        # measure used to compute featurewise sensitivities
        self.__sensitivity_analyzer = sensitivity_analyzer
        # functor deciding which feature ids survive a sensitivity map
        self.__feature_selector = feature_selector

    def untrain(self):
        """Forward untraining to the embedded sensitivity analyzer."""
        if __debug__:
            debug("FS_", "Untraining sensitivity-based FS: %s" % self)
        self.__sensitivity_analyzer.untrain()

    def __call__(self, dataset, testdataset=None):
        """Select the most important features

        :Parameters:
          dataset : Dataset
            used to compute sensitivity maps
          testdataset : Dataset
            optional dataset to select features on

        Returns a tuple of two new datasets with the selected feature
        subset of `dataset` (second element is None when no test dataset
        was supplied).
        """
        # compute the sensitivity map and expose it through the state
        sensitivity = self.__sensitivity_analyzer(dataset)
        self.sensitivity = sensitivity

        # select features to preserve
        selected_ids = self.__feature_selector(sensitivity)

        if __debug__:
            debug("FS_", "Sensitivity: %s Selected ids: %s" %
                  (sensitivity, selected_ids))

        # create a dataset only with selected features
        wdataset = dataset.selectFeatures(selected_ids)

        # FIX: idiomatic identity test instead of `not ... is None`
        if testdataset is not None:
            wtestdataset = testdataset.selectFeatures(selected_ids)
        else:
            wtestdataset = None

        # Differ from the order in RFE when actually error reported is for
        results = (wdataset, wtestdataset)

        # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids --
        # sort() mutates in place and the selector-provided order was
        # needed above
        selected_ids.sort()
        self.selected_ids = selected_ids

        # datasets with selected features are returned
        return results

    # make the measure accessible from outside
    sensitivity_analyzer = property(
        fget=lambda self: self.__sensitivity_analyzer,
        doc="Measure which was used to do selection")
149 150
class FeatureSelectionPipeline(FeatureSelection):
    """Feature elimination through a list of FeatureSelection's.

    Given a list of FeatureSelections it applies them in turn, feeding
    each step's output datasets into the next step.
    """

    nfeatures = StateVariable(
        doc="Number of features before each step in pipeline")
    # TODO: may be we should also append resultant number of features?

    def __init__(self,
                 feature_selections,
                 **kwargs
                 ):
        """Initialize feature selection pipeline

        :Parameters:
          feature_selections : list of FeatureSelection
            selections which to use. Order matters
        """
        # base init first
        FeatureSelection.__init__(self, **kwargs)

        # selectors applied in turn
        self.__feature_selections = feature_selections

    def untrain(self):
        """Untrain every feature selection in the pipeline."""
        if __debug__:
            debug("FS_", "Untraining FS pipeline: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def __call__(self, dataset, testdataset=None, **kwargs):
        """Invocation of the feature selection

        Returns the (dataset, testdataset) pair produced by the last
        pipeline step.
        """
        wdataset = dataset
        wtestdataset = testdataset

        self.selected_ids = None

        # number of features at each step (before running selection)
        self.nfeatures = []

        for fs in self.__feature_selections:

            # enable selected_ids state if it was requested from this class
            fs.states._changeTemporarily(
                enable_states=["selected_ids"], other=self)
            if self.states.isEnabled("nfeatures"):
                self.nfeatures.append(wdataset.nfeatures)

            if __debug__:
                debug('FSPL', 'Invoking %s on (%s, %s)' %
                      (fs, wdataset, wtestdataset))
            wdataset, wtestdataset = fs(wdataset, wtestdataset, **kwargs)

            if self.states.isEnabled("selected_ids"):
                # FIX: identity test instead of `== None` -- if
                # selected_ids is an array, equality with None does not
                # produce a usable truth value
                if self.selected_ids is None:
                    self.selected_ids = fs.selected_ids
                else:
                    # map this step's ids back into the original dataset's
                    # feature ids (assumes selected_ids supports indexing
                    # by an id sequence, e.g. a numpy array -- TODO confirm)
                    self.selected_ids = self.selected_ids[fs.selected_ids]

            fs.states._resetEnabledTemporarily()

        return (wdataset, wtestdataset)

    feature_selections = property(fget=lambda self: self.__feature_selections,
                                  doc="List of `FeatureSelections`")
221 222 223
class CombinedFeatureSelection(FeatureSelection):
    """Meta feature selection utilizing several embedded selection methods.

    Each embedded feature selection method is computed individually.
    Afterwards all feature sets are combined by either taking the union or
    intersection of all sets.

    The individual feature sets of all embedded methods are optionally
    available from the `selections_ids` state variable.
    """
    selections_ids = StateVariable(
        doc="List of feature id sets for each performed method.")

    def __init__(self, feature_selections, combiner, **kwargs):
        """
        :Parameters:
          feature_selections : list
            FeatureSelection instances to run. Order is not important.
          combiner : 'union', 'intersection'
            which method to be used to combine the feature selection set of
            all computed methods.
        """
        FeatureSelection.__init__(self, **kwargs)

        # embedded selection methods, each run independently on the input
        self.__feature_selections = feature_selections
        # set combination mode; validated on invocation
        self.__combiner = combiner

    def untrain(self):
        """Untrain all embedded feature selections."""
        if __debug__:
            debug("FS_", "Untraining combined FS: %s" % self)
        for fs in self.__feature_selections:
            fs.untrain()

    def __call__(self, dataset, testdataset=None):
        """Run all embedded selections and combine their feature sets.

        :Parameters:
          dataset : Dataset
            used to compute the individual feature selections
          testdataset : Dataset
            optional dataset reduced to the same combined feature set

        Returns a (dataset, testdataset) tuple restricted to the combined
        feature set; the second element is None when no test dataset was
        supplied.
        """
        # FIX: validate the combiner up front -- previously an unknown
        # combiner was only detected from the second selection onward, so
        # a single-method setup silently accepted any value
        if self.__combiner not in ('union', 'intersection'):
            # FIX: Python-2-only `raise ValueError, "..."` statement
            # replaced with the call form valid in both Python 2 and 3
            raise ValueError("Unknown combiner '%s'" % self.__combiner)

        # running combination of all selections
        selected_ids = None
        # to hold the individuals
        self.selections_ids = []

        for fs in self.__feature_selections:
            # we need the feature ids that were selected by each method,
            # so enable them temporarily
            fs.states._changeTemporarily(
                enable_states=["selected_ids"], other=self)

            # compute feature selection, but ignore returned datasets
            fs(dataset, testdataset)

            # fold this method's ids into the combined set
            # FIX: identity test instead of `== None`
            if selected_ids is None:
                selected_ids = set(fs.selected_ids)
            elif self.__combiner == 'union':
                selected_ids.update(fs.selected_ids)
            else:
                # 'intersection' -- validated above
                selected_ids.intersection_update(fs.selected_ids)

            # store individual set in state
            self.selections_ids.append(fs.selected_ids)

            # restore states to previous settings
            fs.states._resetEnabledTemporarily()

        # finally apply feature set combination to original datasets
        selected_ids = sorted(list(selected_ids))

        # take care of optional second dataset
        td_sel = None
        if testdataset is not None:
            # FIX: previously indexed with self.selected_ids, i.e. the
            # state from a *previous* invocation (or unset), instead of
            # the ids just computed -- use the local result
            td_sel = testdataset.selectFeatures(selected_ids)

        # and main dataset
        d_sel = dataset.selectFeatures(selected_ids)

        # finally store ids in state
        self.selected_ids = selected_ids

        return (d_sel, td_sel)

    feature_selections = property(fget=lambda self: self.__feature_selections,
                                  doc="List of `FeatureSelections`")
    combiner = property(fget=lambda self: self.__combiner,
                        doc="Selection set combination method.")
314