9 """GLM-Net (GLMNET) regression classifier."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 import numpy as N
15
16 import mvpa.base.externals as externals
17
18
19 if externals.exists('rpy', raiseException=True) and \
20 externals.exists('glmnet', raiseException=True):
21 import rpy
22 rpy.r.library('glmnet')
23
24
# local imports
from mvpa.base import warning
from mvpa.clfs.base import Classifier
from mvpa.measures.base import Sensitivity
from mvpa.misc.param import Parameter

if __debug__:
    from mvpa.base import debug

33 """Convert labels to list of unique label indicies starting at 1.
34 """
35
36
37 new_labels = N.zeros(len(labels), dtype=N.int)
38
39
40 for i, c in enumerate(ulabels):
41 new_labels[labels == c] = i+1
42
43 return [str(l) for l in new_labels.tolist()]
44
45
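# Illustrative sketch (not part of the original module): what the helper
# above produces for a small, made-up label set.
def _example_label2indlist():
    """Return ['2', '1', '2', '3'] for labels b, a, b, c with classes a, b, c."""
    labels = N.array(['b', 'a', 'b', 'c'])
    ulabels = N.array(['a', 'b', 'c'])
    # each label is replaced by the one-based position of its class in
    # ulabels and rendered as a string
    return _label2indlist(labels, ulabels)
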
47 """GLM-Net regression (GLMNET) `Classifier`.
48
49 GLM-Net is the model selection algorithm from:
50
51 Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
52 Paths for Generalized Linear Models via Coordinate
53 Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf
54
55 To make use of GLMNET, you must have R and RPy installed as well
56 as both the glmnet contributed package. You can install the R and
57 RPy with the following command on Debian-based machines:
58
59 sudo aptitude install python-rpy python-rpy-doc r-base-dev
60
61 You can then install the glmnet package by running R
62 as root and calling:
63
64 install.packages()
65
66 """

    _clf_internals = [ 'glmnet', 'linear', 'has_sensitivity',
                       'does_feature_selection'
                     ]

    family = Parameter('gaussian',
                       allowedtype='basestring',
                       choices=["gaussian", "multinomial"],
                       doc="""Response type of your labels (either 'gaussian'
                       for regression or 'multinomial' for classification).""")

    alpha = Parameter(1.0, min=0.01, max=1.0, allowedtype='float',
                      doc="""The elastic net mixing parameter.
                      Larger values will give rise to less L2 regularization,
                      with alpha=1.0 as a true LASSO penalty.""")

    nlambda = Parameter(100, allowedtype='int', min=1,
                        doc="""Maximum number of lambdas to calculate
                        before stopping if not converged.""")

    standardize = Parameter(True, allowedtype='bool',
                            doc="""Whether to standardize the variables
                            prior to fitting.""")

    thresh = Parameter(1e-4, min=1e-10, max=1.0, allowedtype='float',
                       doc="""Convergence threshold for coordinate descent.""")

    pmax = Parameter(None, min=1, allowedtype='None or int',
                     doc="""Limit the maximum number of variables ever to be
                     nonzero.""")

    maxit = Parameter(100, min=10, allowedtype='int',
                      doc="""Maximum number of outer-loop iterations for
                      'multinomial' families.""")

    model_type = Parameter('covariance',
                           allowedtype='basestring',
                           choices=["covariance", "naive"],
                           doc="""'covariance' saves all inner-products ever
                           computed and can be much faster than 'naive'. The
                           latter can be more efficient for
                           nfeatures>>nsamples situations.""")

112 """
113 Initialize GLM-Net.
114
115 See the help in R for further details on the parameters
116 """
117
118 Classifier.__init__(self, **kwargs)
119
120
121 self.__weights = None
122 """The beta weights for each feature."""
123 self.__trained_model = None
124 """The model object after training that will be used for
125 predictions."""
126 self.__trained_model_dict = None
127 """The model object in dict form after training that will be
128 used for predictions."""
129
153 """Train the classifier using `data` (`Dataset`).
154 """
155
156 if self.params.family == 'gaussian':
157
158 labels = dataset.labels.tolist()
159 pass
160 elif self.params.family == 'multinomial':
161
162 labels = _label2indlist(dataset.labels,
163 dataset.uniquelabels)
164 self.__ulabels = dataset.uniquelabels.copy()
165
166
167 if self.params.pmax is None:
168
169 pmax = dataset.nfeatures
170 else:
171
172 pmax = self.params.pmax
173
174
175
176 rpy.set_default_mode(rpy.NO_CONVERSION)
177 self.__trained_model = rpy.r.glmnet(dataset.samples,
178 labels,
179 family=self.params.family,
180 alpha=self.params.alpha,
181 nlambda=self.params.nlambda,
182 standardize=self.params.standardize,
183 thresh=self.params.thresh,
184 pmax=pmax,
185 maxit=self.params.maxit,
186 type=self.params.model_type)
187 rpy.set_default_mode(rpy.NO_DEFAULT)
188
189
190 self.__trained_model_dict = rpy.r.as_list(self.__trained_model)
191
192
193 self.__last_lambda = self.__trained_model_dict['lambda'][-1]
194
195
196 weights = rpy.r.coef(self.__trained_model, s=self.__last_lambda)
197 if self.params.family == 'multinomial':
198 self.__weights = N.hstack([rpy.r.as_matrix(weights[str(i)])[1:]
199 for i in range(1,len(self.__ulabels)+1)])
200 elif self.params.family == 'gaussian':
201 self.__weights = rpy.r.as_matrix(weights)[1:]
202
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        # predict at the last lambda of the fitted path
        values = rpy.r.predict(self.__trained_model,
                               newx=data,
                               type='link',
                               s=self.__last_lambda)

        classes = None
        if self.params.family == 'multinomial':
            # remove the last (singleton) dimension of values
            values = values[:, :, 0]

            # also get the predicted classes (they are 1-indexed)
            rpy.set_default_mode(rpy.NO_CONVERSION)
            class_ind = rpy.r.predict(self.__trained_model,
                                      newx=data,
                                      type='class',
                                      s=self.__last_lambda)
            rpy.set_default_mode(rpy.NO_DEFAULT)
            class_ind = rpy.r.as_vector(class_ind)

            # convert to zero-based integer indices
            class_ind = N.array([int(float(c))-1 for c in class_ind])

            # map back to the actual labels
            classes = self.__ulabels[class_ind]
        else:
            # gaussian regression, so just remove the last dimension
            values = values[:, 0]

        # save the raw predictions and return classes if we have them
        self.values = values
        if classes is not None:
            return classes
        else:
            return values

    def _getFeatureIds(self):
        """Return ids of the used features
        """
        return N.where(N.abs(self.__weights) > 0)[0]


    def getSensitivityAnalyzer(self, **kwargs):
        """Returns a sensitivity analyzer for GLMNET."""
        return GLMNETWeights(self, **kwargs)

    weights = property(lambda self: self.__weights)

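# Illustrative sketch (not part of the original module): how a sparse
# weight vector maps onto the "used" feature ids reported above.
def _example_used_features():
    """Return array([1, 3]) for a made-up weight vector with two nonzeros."""
    weights = N.array([0.0, 0.31, 0.0, -0.02, 0.0])
    # only features with nonzero coefficients count as used
    return N.where(N.abs(weights) > 0)[0]
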

class GLMNETWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GLMNET trained
    on a given `Dataset`.
    """

    _LEGAL_CLFS = [ _GLMNET ]

    def _call(self, dataset=None):
        """Extract weights from the GLMNET classifier.

        GLMNET always has weights available, so nothing has to be
        computed here.
        """
        clf = self.clf
        weights = clf.weights

        if __debug__:
            debug('GLMNET',
                  "Extracting weights for GLMNET - "+
                  "Result: min=%f max=%f" %\
                  (N.min(weights), N.max(weights)))

        return weights

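# Usage sketch (not part of the original module; assumes a PyMVPA `Dataset`
# named `dataset` and a working R/rpy/glmnet setup): obtaining per-feature
# weights through the sensitivity analyzer defined above.
def _example_sensitivity(dataset):
    """Illustrative only: train a multinomial GLMNET and report its weights."""
    clf = GLMNET_C()
    clf.train(dataset)
    sens = clf.getSensitivityAnalyzer()
    # GLMNETWeights._call() simply returns clf.weights
    return sens(dataset)
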
287 """
288 GLM-NET Gaussian Regression Classifier.
289
290 This is the GLM-NET algorithm from
291
292 Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
293 Paths for Generalized Linear Models via Coordinate
294 Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf
295
296 parameterized to be a regression.
297
298 See GLMNET_C for the multinomial classifier version.
299
300 """
301
302 _clf_internals = _GLMNET._clf_internals + ['regression']
303
305 """
306 Initialize GLM-Net.
307
308 See the help in R for further details on the parameters
309 """
310
311 regr_family = 'gaussian'
312 family = kwargs.pop('family', regr_family).lower()
313 if family != regr_family:
314 warning('You specified the parameter family=%s, but we '
315 'force this to be "%s" for regression.'
316 % (family, regr_family))
317 family = regr_family
318
319 regression = kwargs.pop('regression', None)
320 if regression is None:
321
322
323
324 regression = True
325
326
327 _GLMNET.__init__(self, family=family, regression=regression, **kwargs)
328

class GLMNET_C(_GLMNET):
    """
    GLM-NET Multinomial Classifier.

    This is the GLM-NET algorithm from

    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
    Paths for Generalized Linear Models via Coordinate
    Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf

    parameterized to be a multinomial classifier.

    See GLMNET_R for the gaussian regression version.
    """

    _clf_internals = _GLMNET._clf_internals + ['multiclass', 'binary']

    def __init__(self, **kwargs):
        """
        Initialize GLM-Net multinomial classifier.

        See the help in R for further details on the parameters.
        """
        # the family is fixed for this classifier
        if kwargs.pop('family', None) is not None:
            warning('You specified the "family" parameter, but we '
                    'force this to be "multinomial".')

        # init base class first
        _GLMNET.__init__(self, family='multinomial', **kwargs)

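
# Usage sketch (not part of the original module): how the two classifiers
# above are typically driven. Assumes a PyMVPA `Dataset` with .samples and
# .labels and a working R/rpy/glmnet installation (see the _GLMNET
# docstring).
def _example_usage(dataset):
    """Illustrative only: fit both GLMNET variants and predict."""
    # multinomial classification on discrete labels
    clf = GLMNET_C()
    clf.train(dataset)
    predictions = clf.predict(dataset.samples)

    # gaussian regression on continuous labels
    regr = GLMNET_R()
    regr.train(dataset)
    estimates = regr.predict(dataset.samples)

    return predictions, estimates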