1
2
3
4
5
6
7
8
9 """Wrap the libsvm package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26 from mvpa import _random_seed
27
28
29 from mvpa.base import externals, warning
30 if externals.exists('shogun', raiseException=True):
31 import shogun.Features
32 import shogun.Classifier
33 import shogun.Regression
34 import shogun.Kernel
35 import shogun.Library
36
37
38 if hasattr(shogun.Kernel, 'M_DEBUG'):
39 _M_DEBUG = shogun.Kernel.M_DEBUG
40 _M_ERROR = shogun.Kernel.M_ERROR
41 elif hasattr(shogun.Kernel, 'MSG_DEBUG'):
42 _M_DEBUG = shogun.Kernel.MSG_DEBUG
43 _M_ERROR = shogun.Kernel.MSG_ERROR
44 else:
45 _M_DEBUG, _M_ERROR = None, None
46 warning("Could not figure out debug IDs within shogun. "
47 "No control over shogun verbosity would be provided")
48
49 try:
50
51 shogun.Library.Math_init_random(_random_seed)
52
53
54 shogun.Library.Math_init_random(_random_seed)
55 except Exception, e:
56 warning('Shogun cannot be seeded due to %s' % (e,))
57
58 import operator
59
60 from mvpa.misc.param import Parameter
61 from mvpa.base import warning
62
63 from mvpa.clfs.base import FailedToTrainError
64 from mvpa.clfs.meta import MulticlassClassifier
65 from mvpa.clfs._svmbase import _SVM
66 from mvpa.misc.state import StateVariable
67 from mvpa.measures.base import Sensitivity
68
69 from sens import *
70
71 if __debug__:
72 from mvpa.base import debug
73
74
76 """Helper to set level of debugging output for SG
77 :Parameters:
78 obj
79 In SG debug output seems to be set per every object
80 partname : basestring
81 For what kind of object we are talking about... could be automated
82 later on (TODO)
83 """
84 if _M_DEBUG is None:
85 return
86 debugname = "SG_%s" % partname.upper()
87
88 switch = {True: (_M_DEBUG, 'M_DEBUG', "enable"),
89 False: (_M_ERROR, 'M_ERROR', "disable")}
90
91 key = __debug__ and debugname in debug.active
92
93 sglevel, slevel, progressfunc = switch[key]
94
95 if __debug__:
96 debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
97 (partname, `obj`, slevel))
98 obj.io.set_loglevel(sglevel)
99 try:
100 exec "obj.io.%s_progress()" % progressfunc
101 except:
102 warning("Shogun version installed has no way to enable progress" +
103 " reports")
104
105
107 """Draft helper function to convert data we have into SG suitable format
108
109 TODO: Support different datatypes
110 """
111
112 if __debug__:
113 debug("SG_", "Converting data for shogun into RealFeatures")
114
115 features = shogun.Features.RealFeatures(data.astype('double').T)
116
117 if __debug__:
118 debug("SG__", "Done converting data for shogun into RealFeatures")
119 _setdebug(features, 'Features')
120 return features
121
122
124 """Support Vector Machine Classifier(s) based on Shogun
125
126 This is a simple base interface
127 """
128
129 num_threads = Parameter(1,
130 min=1,
131 doc='Number of threads to utilize')
132
133
134 _KERNELS = {}
135 if externals.exists('shogun', raiseException=True):
136 _KERNELS = { "linear": (shogun.Kernel.LinearKernel,
137 ('scale',), LinearSVMWeights),
138 "rbf" : (shogun.Kernel.GaussianKernel,
139 ('gamma',), None),
140 "rbfshift": (shogun.Kernel.GaussianShiftKernel,
141 ('gamma', 'max_shift', 'shift_step'), None),
142 "sigmoid": (shogun.Kernel.SigmoidKernel,
143 ('cache_size', 'gamma', 'coef0'), None),
144 }
145
146 _KNOWN_PARAMS = [ 'epsilon' ]
147 _KNOWN_KERNEL_PARAMS = [ ]
148
149 _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]
150
151 if externals.exists('sg ge 0.6.4'):
152 _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)
153
154
155
156 """
157 If you'd like to train linear SVMs use SGD or OCAS. These are (I am
158 serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
159 with standard additive bias, but will L2 reqularize it - though it
160 should not matter much in practice (although it will give slightly
161 different solutions)). Note that SGD has no stopping criterion (you
162 simply have to specify the number of iterations) and that OCAS has a
163 different stopping condition than svmlight for example which may be more
164 tight and more loose depending on the problem - I sugeest 1e-2 or 1e-3
165 for epsilon.
166
167 If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
168 depending on the problem one is faster than the other (hard to say when,
169 I *think* when your dataset is very unbalanced chunking methods like
170 svmlight/gpdt are better), for smaller problems definitely libsvm.
171
172 If you use string kernels then gpdt/svmlight have a special 'linadd'
173 speedup for this (requires sg 0.6.2 - there was some inefficiency in the
174 code for python-modular before that). This is effective for big datasets
175 and (I trained on 10 million strings based on this).
176
177 And yes currently we only implemented parallel training for svmlight,
178 however all SVMs can be evaluated in parallel.
179 """
180 _KNOWN_IMPLEMENTATIONS = {}
181 if externals.exists('shogun', raiseException=True):
182 _KNOWN_IMPLEMENTATIONS = {
183 "libsvm" : (shogun.Classifier.LibSVM, ('C',),
184 ('multiclass', 'binary'),
185 "LIBSVM's C-SVM (L2 soft-margin SVM)"),
186 "gmnp" : (shogun.Classifier.GMNPSVM, ('C',),
187 ('multiclass', 'binary'),
188 "Generalized Nearest Point Problem SVM"),
189
190 "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
191 "Gradient Projection Decomposition Technique for " \
192 "large-scale SVM problems"),
193 "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
194 "Generalized Nearest Point Problem SVM"),
195
196
197
198
199
200
201
202
203
204
205 "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
206 ('regression',),
207 "LIBSVM's epsilon-SVR"),
208 }
209
210
211 - def __init__(self,
212 kernel_type='linear',
213 **kwargs):
214 """Interface class to Shogun's classifiers and regressions.
215
216 Default implementation is 'libsvm'.
217 """
218
219 svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
220 kwargs['svm_impl'] = svm_impl
221
222
223 _SVM.__init__(self, kernel_type=kernel_type, **kwargs)
224
225 self.__svm = None
226 """Holds the trained svm."""
227 self.__svm_apply = None
228 """Compatibility convenience to bind to the classify/apply method
229 of __svm"""
230
231
232
233 self.__traindataset = None
234
235
236 self.__traindata = None
237 self.__kernel = None
238 self.__kernel_test = None
239 self.__testdata = None
240
241
243
244
245
246 if self._svm_impl in ['svrlight', 'lightsvm']:
247 try:
248 kernel.set_precompute_matrix(True, True)
249 except Exception, e:
250
251 if __debug__:
252 debug('SG_', "Failed call to set_precompute_matrix for %s: %s"
253 % (self, e))
254
255
257 """Train SVM
258 """
259
260
261 newkernel, newsvm = False, False
262
263 retrainable = self.params.retrainable
264
265 if retrainable:
266 _changedData = self._changedData
267
268
269 ul = None
270 self.__traindataset = dataset
271
272
273
274
275
276
277 if __debug__:
278 debug("SG_", "Creating labels instance")
279
280 if 'regression' in self._clf_internals:
281 labels_ = N.asarray(dataset.labels, dtype='double')
282 else:
283 ul = dataset.uniquelabels
284 ul.sort()
285
286 if len(ul) == 2:
287
288 _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
289 elif len(ul) < 2:
290 raise FailedToTrainError, \
291 "We do not have 1-class SVM brought into SG yet"
292 else:
293
294 _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])
295
296
297 _labels_dict_rev = dict([(x[1], x[0])
298 for x in _labels_dict.items()])
299
300
301 self._labels_dict = _labels_dict
302 self._labels_dict_rev = _labels_dict_rev
303
304
305
306
307
308 if __debug__:
309 debug("SG__", "Mapping labels using dict %s" % _labels_dict)
310 labels_ = N.asarray([ _labels_dict[x] for x in dataset.labels ], dtype='double')
311
312 labels = shogun.Features.Labels(labels_)
313 _setdebug(labels, 'Labels')
314
315
316
317 if not retrainable or _changedData['traindata'] or _changedData['kernel_params']:
318
319
320 kargs = []
321 for arg in self._KERNELS[self._kernel_type_literal][1]:
322 value = self.kernel_params[arg].value
323
324 if arg == 'gamma' and value == 0.0:
325 value = self._getDefaultGamma(dataset)
326 kargs += [value]
327
328 if retrainable and __debug__:
329 if _changedData['traindata']:
330 debug("SG",
331 "Re-Creating kernel since training data has changed")
332
333 if _changedData['kernel_params']:
334 debug("SG",
335 "Re-Creating kernel since params %s has changed" %
336 _changedData['kernel_params'])
337
338
339 if __debug__: debug("SG_", "Converting input data for shogun")
340 self.__traindata = _tosg(dataset.samples)
341
342 if __debug__:
343 debug("SG", "Creating kernel instance of %s giving arguments %s" %
344 (`self._kernel_type`, kargs))
345
346 self.__kernel = kernel = \
347 self._kernel_type(self.__traindata, self.__traindata,
348 *kargs)
349
350 if externals.exists('sg ge 0.6.4'):
351 kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())
352
353 newkernel = True
354 self.kernel_params.reset()
355 _setdebug(kernel, 'Kernels')
356
357 self.__condition_kernel(kernel)
358 if retrainable:
359 if __debug__:
360 debug("SG_", "Resetting test kernel for retrainable SVM")
361 self.__kernel_test = None
362 self.__kernel_args = kargs
363
364
365
366 Cs = None
367 if not retrainable or self.__svm is None or _changedData['params']:
368
369 if self.params.isKnown('C'):
370 C = self.params.C
371 if not operator.isSequenceType(C):
372
373 C = [C]
374
375 Cs = list(C[:])
376 for i in xrange(len(Cs)):
377 if Cs[i]<0:
378 Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
379 if __debug__:
380 debug("SG_", "Default C for %s was computed to be %s" %
381 (C[i], Cs[i]))
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403 svm_impl_class = self.__get_implementation(ul)
404
405 if __debug__:
406 debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)
407
408 if self._svm_impl in ['libsvr', 'svrlight']:
409
410 self.__svm = svm_impl_class(Cs[0], self.params.epsilon, self.__kernel, labels)
411 elif self._svm_impl in ['krr']:
412 self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
413 else:
414 self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
415 self.__svm.set_epsilon(self.params.epsilon)
416
417
418 self.__svm_apply = hasattr(self.__svm, 'apply') \
419 and self.__svm.apply \
420 or self.__svm.classify
421
422
423 if self.params.isKnown('shrinking'):
424 shrinking = self.params.shrinking
425 if __debug__:
426 debug("SG_", "Setting shrinking to %s" % shrinking)
427 self.__svm.set_shrinking_enabled(shrinking)
428
429 if Cs is not None and len(Cs) == 2:
430 if __debug__:
431 debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
432 self.__svm.set_C(Cs[0], Cs[1])
433
434 self.params.reset()
435 newsvm = True
436 _setdebug(self.__svm, 'SVM')
437
438 if self.params.isKnown('tube_epsilon') and \
439 hasattr(self.__svm, 'set_tube_epsilon'):
440 self.__svm.set_tube_epsilon(self.params.tube_epsilon)
441 self.__svm.parallel.set_num_threads(self.params.num_threads)
442 else:
443 if __debug__:
444 debug("SG_", "SVM instance is not re-created")
445 if _changedData['labels']:
446 if __debug__: debug("SG__", "Assigning new labels")
447 self.__svm.set_labels(labels)
448 if newkernel:
449 if __debug__: debug("SG__", "Assigning new kernel")
450 self.__svm.set_kernel(self.__kernel)
451 assert(_changedData['params'] is False)
452
453 if retrainable:
454
455 self.states.retrained = not newsvm or not newkernel
456
457
458 if __debug__ and 'SG' in debug.active:
459 if not self.regression:
460 lstr = " with labels %s" % dataset.uniquelabels
461 else:
462 lstr = ""
463 debug("SG", "%sTraining %s on data%s" %
464 (("","Re-")[retrainable and self.states.retrained],
465 self, lstr))
466
467 self.__svm.train()
468
469 if __debug__:
470 debug("SG_", "Done training SG_SVM %s" % self._kernel_type)
471
472
473 if (__debug__ and 'SG__' in debug.active) or \
474 self.states.isEnabled('training_confusion'):
475 trained_labels = self.__svm_apply().get_labels()
476 else:
477 trained_labels = None
478
479 if __debug__ and "SG__" in debug.active:
480 debug("SG__", "Original labels: %s, Trained labels: %s" %
481 (dataset.labels, trained_labels))
482
483
484
485
486
487
488
489
490
491 if self.regression and self.states.isEnabled('training_confusion'):
492 self.states.training_confusion = self._summaryClass(
493 targets=dataset.labels,
494 predictions=trained_labels)
495
497 """Predict values for the data
498 """
499
500 retrainable = self.params.retrainable
501
502 if retrainable:
503 changed_testdata = self._changedData['testdata'] or \
504 self.__kernel_test is None
505
506 if not retrainable or changed_testdata:
507 testdata = _tosg(data)
508
509 if not retrainable:
510 if __debug__:
511 debug("SG__",
512 "Initializing SVMs kernel of %s with training/testing samples"
513 % self)
514
515 self.__kernel.init(self.__traindata, testdata)
516 self.__condition_kernel(self.__kernel)
517 else:
518 if changed_testdata:
519 if __debug__:
520 debug("SG__",
521 "Re-creating testing kernel of %s giving "
522 "arguments %s" %
523 (`self._kernel_type`, self.__kernel_args))
524 kernel_test = self._kernel_type(self.__traindata, testdata,
525 *self.__kernel_args)
526 _setdebug(kernel_test, 'Kernels')
527
528 custk_args = ([self.__traindata, testdata], [])[
529 int(externals.exists('sg ge 0.6.4'))]
530 if __debug__:
531 debug("SG__",
532 "Re-creating custom testing kernel giving "
533 "arguments %s" % (str(custk_args)))
534 kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)
535
536 _setdebug(kernel_test_custom, 'Kernels')
537 self.__kernel_test = kernel_test_custom
538 self.__kernel_test.set_full_kernel_matrix_from_full(
539 kernel_test.get_kernel_matrix())
540 elif __debug__:
541 debug("SG__", "Re-using testing kernel")
542
543 assert(self.__kernel_test is not None)
544 self.__svm.set_kernel(self.__kernel_test)
545
546 if __debug__:
547 debug("SG_", "Classifying testing data")
548
549
550
551 values_ = self.__svm_apply()
552 if values_ is None:
553 raise RuntimeError, "We got empty list of values from %s" % self
554
555 values = values_.get_labels()
556
557 if retrainable:
558
559 self.states.repredicted = repredicted = not changed_testdata
560 if __debug__:
561 debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
562 % repredicted)
563
564 self.__svm.set_kernel(self.__kernel)
565
566 if __debug__:
567 debug("SG__", "Got values %s" % values)
568
569 if ('regression' in self._clf_internals):
570 predictions = values
571 else:
572
573 _labels_dict = self._labels_dict
574 _labels_dict_rev = self._labels_dict_rev
575
576 if len(_labels_dict) == 2:
577 predictions = 1.0 - 2*N.signbit(values)
578 else:
579 predictions = values
580
581
582 label_type = type(_labels_dict.values()[0])
583
584
585 predictions = [_labels_dict_rev[label_type(x)]
586 for x in predictions]
587
588 if __debug__:
589 debug("SG__", "Tuned predictions %s" % predictions)
590
591
592
593
594 self.values = values
595
596
597 if not retrainable:
598 try:
599 testdata.free_features()
600 except:
601 pass
602
603 return predictions
604
605
607 super(SVM, self).untrain()
608 if not self.params.retrainable:
609 if __debug__:
610 debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
611 msgargs={'clf':self})
612
613
614
615 if True:
616 if True:
617
618 if self.__kernel is not None:
619 del self.__kernel
620 self.__kernel = None
621
622 if self.__kernel_test is not None:
623 del self.__kernel_test
624 self.__kernel_test = None
625
626 if self.__svm is not None:
627 del self.__svm
628 self.__svm = None
629 self.__svm_apply = None
630
631 if self.__traindata is not None:
632
633
634
635
636 self.__traindata.free_features()
637 del self.__traindata
638 self.__traindata = None
639
640 self.__traindataset = None
641
642
643
644
645
646 if __debug__:
647 debug("SG__",
648 "Done untraining %(self)s and destroying sg's SVM",
649 msgargs=locals())
650 elif __debug__:
651 debug("SG__", "Not untraining %(self)s since it is retrainable",
652 msgargs=locals())
653
654
656 if 'regression' in self._clf_internals or len(ul) == 2:
657 svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
658 else:
659 if self._svm_impl == 'libsvm':
660 svm_impl_class = shogun.Classifier.LibSVMMultiClass
661 elif self._svm_impl == 'gmnp':
662 svm_impl_class = shogun.Classifier.GMNPSVM
663 else:
664 raise RuntimeError, \
665 "Shogun: Implementation %s doesn't handle multiclass " \
666 "data. Got labels %s. Use some other classifier" % \
667 (self._svm_impl, self.__traindataset.uniquelabels)
668 if __debug__:
669 debug("SG_", "Using %s for multiclass data of %s" %
670 (svm_impl_class, self._svm_impl))
671
672 return svm_impl_class
673
674
675 svm = property(fget=lambda self: self.__svm)
676 """Access to the SVM model."""
677
678 traindataset = property(fget=lambda self: self.__traindataset)
679 """Dataset which was used for training
680
681 TODO -- might better become state variable I guess"""
682
683
684
685
686
# Conditionally make some of the implementations available if they are
# present in the installed shogun.  Class paths and parameter tuples are
# kept as strings so that merely mentioning an absent class does not raise;
# eval of these module-local literals replaces the previous exec-on-a-
# formatted-string, avoiding quoting pitfalls.
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/"),
         ('krr', "shogun.Regression.KRR", "('tau',), ('regression',)",
          "Kernel Ridge Regression"),
         ]:
    if externals.exists('shogun.%s' % name):
        SVM._KNOWN_IMPLEMENTATIONS[name] = \
            (eval(item),) + eval(params) + (descr,)

# Assign SVM class to the limited set of classifiers
# LinearSVMWeights may operate on
LinearSVMWeights._LEGAL_CLFS = [SVM]
702