9 """GLM-Net (GLMNET) regression classifier."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 import numpy as N
15
16 import mvpa.base.externals as externals
17
18
19 if externals.exists('rpy', raiseException=True) and \
20 externals.exists('glmnet', raiseException=True):
21 import rpy
22 rpy.r.library('glmnet')
23
24
# local imports
from mvpa.base import warning
from mvpa.clfs.base import Classifier
from mvpa.measures.base import Sensitivity
from mvpa.misc.param import Parameter

if __debug__:
    from mvpa.base import debug

33 """Convert labels to list of unique label indicies starting at 1.
34 """
35
36
37 new_labels = N.zeros(len(labels), dtype=N.int)
38
39
40 for i, c in enumerate(ulabels):
41 new_labels[labels == c] = i+1
42
43 return [str(l) for l in new_labels.tolist()]
44
45
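# Illustrative sketch (not part of the original module): what the helper
# above produces for a small, made-up label set.
def _example_label2indlist():
    """Return ['2', '1', '2', '3'] for labels b, a, b, c with classes a, b, c."""
    labels = N.array(['b', 'a', 'b', 'c'])
    ulabels = N.array(['a', 'b', 'c'])
    # each label is replaced by the one-based position of its class in
    # ulabels and rendered as a string
    return _label2indlist(labels, ulabels)
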
47 """GLM-Net regression (GLMNET) `Classifier`.
48
49 GLM-Net is the model selection algorithm from:
50
51 Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
52 Paths for Generalized Linear Models via Coordinate
53 Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf
54
55 To make use of GLMNET, you must have R and RPy installed as well
56 as both the glmnet contributed package. You can install the R and
57 RPy with the following command on Debian-based machines:
58
59 sudo aptitude install python-rpy python-rpy-doc r-base-dev
60
61 You can then install the glmnet package by running R
62 as root and calling:
63
64 install.packages()
65
66 """

    _clf_internals = [ 'glmnet', 'linear', 'has_sensitivity',
                       'does_feature_selection'
                     ]

    family = Parameter('gaussian',
                       allowedtype='basestring',
                       choices=["gaussian", "multinomial"],
                       doc="""Response type of your labels (either 'gaussian'
                       for regression or 'multinomial' for classification).""")

    alpha = Parameter(1.0, min=0.01, max=1.0, allowedtype='float',
                      doc="""The elastic net mixing parameter.
                      Larger values will give rise to less L2 regularization,
                      with alpha=1.0 as a true LASSO penalty.""")

    nlambda = Parameter(100, allowedtype='int', min=1,
                        doc="""Maximum number of lambdas to calculate
                        before stopping if not converged.""")

    standardize = Parameter(True, allowedtype='bool',
                            doc="""Whether to standardize the variables
                            prior to fitting.""")

    thresh = Parameter(1e-4, min=1e-10, max=1.0, allowedtype='float',
                       doc="""Convergence threshold for coordinate descent.""")

    pmax = Parameter(None, min=1, allowedtype='None or int',
                     doc="""Limit the maximum number of variables ever to be
                     nonzero.""")

    maxit = Parameter(100, min=10, allowedtype='int',
                      doc="""Maximum number of outer-loop iterations for
                      'multinomial' families.""")

    model_type = Parameter('covariance',
                           allowedtype='basestring',
                           choices=["covariance", "naive"],
                           doc="""'covariance' saves all inner-products ever
                           computed and can be much faster than 'naive'. The
                           latter can be more efficient for
                           nfeatures>>nsamples situations.""")

112 """
113 Initialize GLM-Net.
114
115 See the help in R for further details on the parameters
116 """
117
118 Classifier.__init__(self, **kwargs)
119
120
121 self.__weights = None
122 """The beta weights for each feature."""
123 self.__trained_model = None
124 """The model object after training that will be used for
125 predictions."""
126 self.__trained_model_dict = None
127 """The model object in dict form after training that will be
128 used for predictions."""
129
153 """Train the classifier using `data` (`Dataset`).
154 """
155
156 if self.params.family == 'gaussian':
157
158 labels = dataset.labels.tolist()
159 pass
160 elif self.params.family == 'multinomial':
161
162 labels = _label2indlist(dataset.labels,
163 dataset.uniquelabels)
164 self.__ulabels = dataset.uniquelabels.copy()
165
166
167 if self.params.pmax is None:
168
169 pmax = dataset.nfeatures
170 else:
171
172 pmax = self.params.pmax
173
174
175
176 rpy.set_default_mode(rpy.NO_CONVERSION)
177 self.__trained_model = rpy.r.glmnet(dataset.samples,
178 labels,
179 family=self.params.family,
180 alpha=self.params.alpha,
181 nlambda=self.params.nlambda,
182 standardize=self.params.standardize,
183 thresh=self.params.thresh,
184 pmax=pmax,
185 maxit=self.params.maxit,
186 type=self.params.model_type)
187 rpy.set_default_mode(rpy.NO_DEFAULT)
188
189
190 self.__trained_model_dict = rpy.r.as_list(self.__trained_model)
191
192
193 self.__last_lambda = self.__trained_model_dict['lambda'][-1]
194
195
196 weights = rpy.r.coef(self.__trained_model, s=self.__last_lambda)
197 if self.params.family == 'multinomial':
198 self.__weights = N.hstack([rpy.r.as_matrix(weights[str(i)])[1:]
199 for i in range(1,len(self.__ulabels)+1)])
200 elif self.params.family == 'gaussian':
201 self.__weights = rpy.r.as_matrix(weights)[1:]
202
    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        # predict at the last lambda of the fitted path
        values = rpy.r.predict(self.__trained_model,
                               newx=data,
                               type='link',
                               s=self.__last_lambda)

        classes = None
        if self.params.family == 'multinomial':
            # remove the last (singleton) dimension of values
            values = values[:, :, 0]

            # also get the predicted classes (they are 1-indexed)
            rpy.set_default_mode(rpy.NO_CONVERSION)
            class_ind = rpy.r.predict(self.__trained_model,
                                      newx=data,
                                      type='class',
                                      s=self.__last_lambda)
            rpy.set_default_mode(rpy.NO_DEFAULT)
            class_ind = rpy.r.as_vector(class_ind)

            # convert to zero-based integer indices
            class_ind = N.array([int(float(c))-1 for c in class_ind])

            # map back to the actual labels
            classes = self.__ulabels[class_ind]
        else:
            # gaussian regression, so just remove the last dimension
            values = values[:, 0]

        # save the raw predictions and return classes if we have them
        self.values = values
        if classes is not None:
            return classes
        else:
            return values

    def _getFeatureIds(self):
        """Return ids of the used features
        """
        return N.where(N.abs(self.__weights) > 0)[0]


    def getSensitivityAnalyzer(self, **kwargs):
        """Returns a sensitivity analyzer for GLMNET."""
        return GLMNETWeights(self, **kwargs)

    weights = property(lambda self: self.__weights)

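# Illustrative sketch (not part of the original module): how a sparse
# weight vector maps onto the "used" feature ids reported above.
def _example_used_features():
    """Return array([1, 3]) for a made-up weight vector with two nonzeros."""
    weights = N.array([0.0, 0.31, 0.0, -0.02, 0.0])
    # only features with nonzero coefficients count as used
    return N.where(N.abs(weights) > 0)[0]
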

class GLMNETWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GLMNET trained
    on a given `Dataset`.
    """

    _LEGAL_CLFS = [ _GLMNET ]

    def _call(self, dataset=None):
        """Extract weights from the GLMNET classifier.

        GLMNET always has weights available, so nothing has to be
        computed here.
        """
        clf = self.clf
        weights = clf.weights

        if __debug__:
            debug('GLMNET',
                  "Extracting weights for GLMNET - "+
                  "Result: min=%f max=%f" %\
                  (N.min(weights), N.max(weights)))

        return weights

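# Usage sketch (not part of the original module; assumes a PyMVPA `Dataset`
# named `dataset` and a working R/rpy/glmnet setup): obtaining per-feature
# weights through the sensitivity analyzer defined above.
def _example_sensitivity(dataset):
    """Illustrative only: train a multinomial GLMNET and report its weights."""
    clf = GLMNET_C()
    clf.train(dataset)
    sens = clf.getSensitivityAnalyzer()
    # GLMNETWeights._call() simply returns clf.weights
    return sens(dataset)
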
287 """
288 GLM-NET Gaussian Regression Classifier.
289
290 This is the GLM-NET algorithm from
291
292 Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
293 Paths for Generalized Linear Models via Coordinate
294 Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf
295
296 parameterized to be a regression.
297
298 See GLMNET_C for the multinomial classifier version.
299
300 """
301
302 _clf_internals = _GLMNET._clf_internals + ['regression']
303
305 """
306 Initialize GLM-Net.
307
308 See the help in R for further details on the parameters
309 """
310
311 regr_family = 'gaussian'
312 family = kwargs.pop('family', regr_family).lower()
313 if family != regr_family:
314 warning('You specified the parameter family=%s, but we '
315 'force this to be "%s" for regression.'
316 % (family, regr_family))
317 family = regr_family
318
319 regression = kwargs.pop('regression', None)
320 if regression is None:
321
322
323
324 regression = True
325
326
327 _GLMNET.__init__(self, family=family, regression=regression, **kwargs)
328

class GLMNET_C(_GLMNET):
    """
    GLM-NET Multinomial Classifier.

    This is the GLM-NET algorithm from

    Friedman, J., Hastie, T. and Tibshirani, R. (2008) Regularization
    Paths for Generalized Linear Models via Coordinate
    Descent. http://www-stat.stanford.edu/~hastie/Papers/glmnet.pdf

    parameterized to be a multinomial classifier.

    See GLMNET_R for the gaussian regression version.
    """

    _clf_internals = _GLMNET._clf_internals + ['multiclass', 'binary']

    def __init__(self, **kwargs):
        """
        Initialize GLM-Net multinomial classifier.

        See the help in R for further details on the parameters.
        """
        # the family is fixed for this classifier
        if kwargs.pop('family', None) is not None:
            warning('You specified the "family" parameter, but we '
                    'force this to be "multinomial".')

        # init base class first
        _GLMNET.__init__(self, family='multinomial', **kwargs)

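
# Usage sketch (not part of the original module): how the two classifiers
# above are typically driven. Assumes a PyMVPA `Dataset` with .samples and
# .labels and a working R/rpy/glmnet installation (see the _GLMNET
# docstring).
def _example_usage(dataset):
    """Illustrative only: fit both GLMNET variants and predict."""
    # multinomial classification on discrete labels
    clf = GLMNET_C()
    clf.train(dataset)
    predictions = clf.predict(dataset.samples)

    # gaussian regression on continuous labels
    regr = GLMNET_R()
    regr.train(dataset)
    estimates = regr.predict(dataset.samples)

    return predictions, estimates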