Source Code for Module mvpa.clfs.glmnet

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""GLM-Net (GLMNET) regression classifier."""

__docformat__ = 'restructuredtext'

# system imports
import numpy as N

import mvpa.base.externals as externals

# do conditional to be able to build module reference
if externals.exists('rpy', raiseException=True) and \
   externals.exists('glmnet', raiseException=True):
    import rpy
    rpy.r.library('glmnet')

# local imports
from mvpa.clfs.base import Classifier
from mvpa.measures.base import Sensitivity
from mvpa.misc.param import Parameter
# `warning` is used by GLMNET_R and GLMNET_C below
from mvpa.base import warning

if __debug__:
    from mvpa.base import debug
def _label2indlist(labels, ulabels):
    """Convert labels to a list of unique label indices starting at 1.
    """
    # allocate for the new one-of-M labels
    new_labels = N.zeros(len(labels), dtype=N.int)

    # loop and convert to one-of-M
    for i, c in enumerate(ulabels):
        new_labels[labels == c] = i + 1

    return [str(l) for l in new_labels.tolist()]


class _GLMNET(Classifier):
    """GLM-Net regression (GLMNET) `Classifier`.

    GLM-Net is the model selection algorithm from:

    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
    Paths for Generalized Linear Models via Coordinate
    Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf

    To make use of GLMNET, you must have R and RPy installed as well
    as the glmnet contributed package. You can install R and RPy with
    the following command on Debian-based machines:

      sudo aptitude install python-rpy python-rpy-doc r-base-dev

    You can then install the glmnet package by running R as root and
    calling:

      install.packages('glmnet')

    """

    _clf_internals = [ 'glmnet', 'linear', 'has_sensitivity',
                       'does_feature_selection'
                     ]

    family = Parameter('gaussian',
                       allowedtype='basestring',
                       choices=["gaussian", "multinomial"],
                       doc="""Response type of your labels (either 'gaussian'
                       for regression or 'multinomial' for classification).""")

    alpha = Parameter(1.0, min=0.01, max=1.0, allowedtype='float',
                      doc="""The elastic net mixing parameter.
                      Larger values give rise to less L2 regularization,
                      with alpha=1.0 being a pure LASSO penalty.""")

    nlambda = Parameter(100, allowedtype='int', min=1,
                        doc="""Maximum number of lambdas to calculate
                        before stopping if not converged.""")

    standardize = Parameter(True, allowedtype='bool',
                            doc="""Whether to standardize the variables
                            prior to fitting.""")

    thresh = Parameter(1e-4, min=1e-10, max=1.0, allowedtype='float',
                       doc="""Convergence threshold for coordinate descent.""")

    pmax = Parameter(None, min=1, allowedtype='None or int',
                     doc="""Limit the maximum number of variables ever to be
                     nonzero.""")

    maxit = Parameter(100, min=10, allowedtype='int',
                      doc="""Maximum number of outer-loop iterations for
                      'multinomial' families.""")

    model_type = Parameter('covariance',
                           allowedtype='basestring',
                           choices=["covariance", "naive"],
                           doc="""'covariance' saves all inner-products ever
                           computed and can be much faster than 'naive'. The
                           latter can be more efficient for
                           nfeatures>>nsamples situations.""")

    def __init__(self, **kwargs):
        """
        Initialize GLM-Net.

        See the help in R for further details on the parameters.
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        self.__weights = None
        """The beta weights for each feature."""
        self.__trained_model = None
        """The model object after training that will be used for
        predictions."""
        self.__trained_model_dict = None
        """The model object in dict form after training that will be
        used for predictions."""
        self.__last_lambda = None
        """Lambda of the final training step, reused at prediction time."""
        self.__ulabels = None
        """Unique labels of the training dataset (multinomial only)."""

        # It does not make sense to calculate a confusion matrix for a
        # regression
        # YOH: sorry for the unclear semantics... pymvpa is evolving;
        # regressions will store RegressionStatistics within the
        # confusion, so it is ok to have training_confusion
        # enabled, but the .regression parameter needs to be set to True,
        # therefore the conditioning above and the tuneup of kwargs in _R
        #if self.params.family == 'gaussian':
        #    self.states.enable('training_confusion', False)

    # def __repr__(self):
    #     """String summary of the object
    #     """
    #     return """ENET(lm=%s, normalize=%s, intercept=%s, trace=%s, max_steps=%s, enable_states=%s)""" % \
    #            (self.__lm,
    #             self.__normalize,
    #             self.__intercept,
    #             self.__trace,
    #             self.__max_steps,
    #             str(self.states.enabled))

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        # process the labels based on the model family
        if self.params.family == 'gaussian':
            # do nothing, just save the labels as a list
            labels = dataset.labels.tolist()
        elif self.params.family == 'multinomial':
            # turn labels into a list of range values starting at 1
            labels = _label2indlist(dataset.labels,
                                    dataset.uniquelabels)
            self.__ulabels = dataset.uniquelabels.copy()

        # process the pmax
        if self.params.pmax is None:
            # set it to the number of features
            pmax = dataset.nfeatures
        else:
            # use the provided value
            pmax = self.params.pmax

        # train, specifying the maximum number of steps;
        # must not convert the trained model to a dict or we'll get a segfault
        rpy.set_default_mode(rpy.NO_CONVERSION)
        self.__trained_model = rpy.r.glmnet(dataset.samples,
                                            labels,
                                            family=self.params.family,
                                            alpha=self.params.alpha,
                                            nlambda=self.params.nlambda,
                                            standardize=self.params.standardize,
                                            thresh=self.params.thresh,
                                            pmax=pmax,
                                            maxit=self.params.maxit,
                                            type=self.params.model_type)
        rpy.set_default_mode(rpy.NO_DEFAULT)

        # get a dict version of the model
        self.__trained_model_dict = rpy.r.as_list(self.__trained_model)

        # save the lambda of the last step
        self.__last_lambda = self.__trained_model_dict['lambda'][-1]

        # set the weights to the last step
        weights = rpy.r.coef(self.__trained_model, s=self.__last_lambda)
        if self.params.family == 'multinomial':
            # one weight column per class; drop the intercept row of each
            # per-class coefficient matrix
            self.__weights = N.hstack([rpy.r.as_matrix(weights[str(i)])[1:]
                                       for i in range(1, len(self.__ulabels)+1)])
        elif self.params.family == 'gaussian':
            # drop the intercept row of the coefficient matrix
            self.__weights = rpy.r.as_matrix(weights)[1:]

    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        # predict with the standard method
        values = rpy.r.predict(self.__trained_model,
                               newx=data,
                               type='link',
                               s=self.__last_lambda)

        # predict with the final state (i.e., the last step)
        classes = None
        if self.params.family == 'multinomial':
            # remove the last dimension of values
            values = values[:, :, 0]

            # get the classes too (they are 1-indexed)
            rpy.set_default_mode(rpy.NO_CONVERSION)
            class_ind = rpy.r.predict(self.__trained_model,
                                      newx=data,
                                      type='class',
                                      s=self.__last_lambda)
            rpy.set_default_mode(rpy.NO_DEFAULT)
            class_ind = rpy.r.as_vector(class_ind)

            # convert the strings to ints and subtract 1
            class_ind = N.array([int(float(c)) - 1 for c in class_ind])

            # convert to actual labels
            classes = self.__ulabels[class_ind]
        else:
            # is gaussian, so just remove the last dim of values
            values = values[:, 0]

        # values need to be set anyway if the values state is enabled
        self.values = values
        if classes is not None:
            # return the predicted class labels
            return classes
        else:
            # return the values as predictions
            return values

    def _getFeatureIds(self):
        """Return ids of the used features.
        """
        return N.where(N.abs(self.__weights) > 0)[0]

    def getSensitivityAnalyzer(self, **kwargs):
        """Returns a sensitivity analyzer for GLMNET."""
        return GLMNETWeights(self, **kwargs)

    weights = property(lambda self: self.__weights)
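
# Usage sketch (hypothetical example, assuming R, rpy and the glmnet R
# package are installed and `dataset` is a PyMVPA `Dataset`):
#
#   from mvpa.clfs.glmnet import GLMNET_C
#   clf = GLMNET_C(alpha=1.0, nlambda=100)
#   clf.train(dataset)
#   predictions = clf.predict(dataset.samples)
#   betas = clf.weights  # per-feature (per-class) beta weights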


class GLMNETWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GLMNET trained
    on a given `Dataset`.
    """

    _LEGAL_CLFS = [ _GLMNET ]

    def _call(self, dataset=None):
        """Extract weights from the GLMNET classifier.

        GLMNET always has weights available, so nothing has to be
        computed here.
        """
        clf = self.clf
        weights = clf.weights

        if __debug__:
            debug('GLMNET',
                  "Extracting weights for GLMNET - "
                  "Result: min=%f max=%f" %
                  (N.min(weights), N.max(weights)))

        return weights


class GLMNET_R(_GLMNET):
    """
    GLM-NET Gaussian Regression Classifier.

    This is the GLM-NET algorithm from

    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
    Paths for Generalized Linear Models via Coordinate
    Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf

    parameterized to be a regression.

    See GLMNET_C for the multinomial classifier version.

    """

    _clf_internals = _GLMNET._clf_internals + ['regression']

    def __init__(self, **kwargs):
        """
        Initialize GLM-Net.

        See the help in R for further details on the parameters.
        """
        # make sure they didn't specify an incompatible model family
        regr_family = 'gaussian'
        family = kwargs.pop('family', regr_family).lower()
        if family != regr_family:
            warning('You specified the parameter family=%s, but we '
                    'force this to be "%s" for regression.'
                    % (family, regr_family))
            family = regr_family

        regression = kwargs.pop('regression', None)
        if regression is None:
            # enforce regression by default, but this classifier might be
            # used as a binary classifier as well, so leave `regression`
            # as-is if it was explicitly specified
            regression = True

        # init base class, forcing regression
        _GLMNET.__init__(self, family=family, regression=regression, **kwargs)


class GLMNET_C(_GLMNET):
    """
    GLM-NET Multinomial Classifier.

    This is the GLM-NET algorithm from

    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
    Paths for Generalized Linear Models via Coordinate
    Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf

    parameterized to be a multinomial classifier.

    See GLMNET_R for the Gaussian regression version.

    """

    _clf_internals = _GLMNET._clf_internals + ['multiclass', 'binary']

    def __init__(self, **kwargs):
        """
        Initialize the GLM-Net multinomial classifier.

        See the help in R for further details on the parameters.
        """
        # make sure they didn't specify a conflicting family
        if kwargs.pop('family', None) is not None:
            warning('You specified the "family" parameter, but we '
                    'force this to be "multinomial".')

        # init base class, forcing a multinomial family
        _GLMNET.__init__(self, family='multinomial', **kwargs)