
Source Code for Module mvpa.clfs.gnb

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Gaussian Naive Bayes Classifier

   EXPERIMENTAL ;)
   Basic implementation of Gaussian Naive Bayes classifier.
"""

__docformat__ = 'restructuredtext'

import numpy as N

from numpy import ones, zeros, sum, abs, isfinite, dot
from mvpa.base import warning, externals
from mvpa.clfs.base import Classifier
from mvpa.misc.param import Parameter
from mvpa.misc.state import StateVariable
#from mvpa.measures.base import Sensitivity
#from mvpa.misc.transformers import SecondAxisMaxOfAbs # XXX ?


if __debug__:
    from mvpa.base import debug

class GNB(Classifier):
    """Gaussian Naive Bayes `Classifier`.

    GNB is a probabilistic classifier relying on Bayes rule to
    estimate posterior probabilities of labels given the data.  Its
    naive assumption is the independence of the features, which allows
    per-feature likelihoods to be combined by a simple product across
    the likelihoods of the "independent" features.
    See http://en.wikipedia.org/wiki/Naive_bayes for more information.

    The implementation provided here is "naive" on its own -- various
    aspects could be improved, but it has its own advantages:

    - implementation is simple and straightforward
    - no data copying while considering samples of a specific class
    - provides alternative ways to assess the prior distribution of the
      classes in the case of unbalanced sets of samples (see parameter
      `prior`)
    - makes use of the NumPy broadcasting mechanism, so should be
      relatively efficient
    - should work for any dimensionality of samples

    GNB is listed both as a linear and a non-linear classifier, since
    the specifics of the separating boundary depend on the data and/or
    parameters: linear separation is achieved whenever samples are
    balanced (or prior='uniform') and features have the same variance
    across different classes (i.e. if common_variance=True to enforce
    this).

    Whenever decisions are made based on log-probabilities (parameter
    logprob=True, which is the default), the state variable `values`,
    if enabled, would also contain log-probabilities.  Also note
    that normalization by the evidence (P(data)) is disabled by
    default since it has no impact per se on the classification
    decision.  You might like to set the parameter normalize to True
    if you want to access properly scaled probabilities in the
    `values` state variable.
    """
    # XXX decide when we should set the corresponding internal,
    # since it actually depends on the data -- no clear way,
    # so set both linear and non-linear
    _clf_internals = [ 'gnb', 'linear', 'non-linear',
                       'binary', 'multiclass' ]

    common_variance = Parameter(False, allowedtype='bool',
             doc="""Use the same variance across all classes.""")
    prior = Parameter('laplacian_smoothing',
             allowedtype='basestring',
             choices=["laplacian_smoothing", "uniform", "ratio"],
             doc="""How to compute prior distribution.""")

    logprob = Parameter(True, allowedtype='bool',
             doc="""Operate on log probabilities.  Preferable to avoid unneeded
             exponentiation and losing precision.
             If set, logprobs are stored in `values`""")
    normalize = Parameter(False, allowedtype='bool',
             doc="""Normalize (log)prob by P(data).  Requires probabilities, thus
             for the `logprob` case it would require exponentiation of the
             'logprob's; disabled by default since it does not impact the
             classification output.
             """)

    def __init__(self, **kwargs):
        """Initialize a GNB classifier.
        """

        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        self.means = None
        """Means of features per class"""
        self.variances = None
        """Variances per class, but "vars" is taken ;)"""
        self.ulabels = None
        """Labels classifier was trained on"""
        self.priors = None
        """Class probabilities"""

        # Define internal state of classifier
        self._norm_weight = None

    def _train(self, dataset):
        """Train the classifier using `dataset` (`Dataset`).
        """
        params = self.params

        # get the dataset information into easy vars
        X = dataset.samples
        labels = dataset.labels
        self.ulabels = ulabels = dataset.uniquelabels
        nlabels = len(ulabels)
        #params = self.params            # for quicker access
        label2index = dict((l, il) for il, l in enumerate(ulabels))

        # set the feature dimensions
        nsamples = len(X)
        s_shape = X.shape[1:]            # shape of a single sample

        self.means = means = \
                     N.zeros((nlabels, ) + s_shape)
        self.variances = variances = \
                     N.zeros((nlabels, ) + s_shape)
        # degenerate dimensions are added for easy broadcasting later on
        nsamples_per_class = N.zeros((nlabels,) + (1,)*len(s_shape))

        # Estimate means and number of samples per each label
        for s, l in zip(X, labels):
            il = label2index[l]          # index of the label
            nsamples_per_class[il] += 1
            means[il] += s

        # helper function -- squash all dimensions but 1
        squash = lambda x: N.atleast_1d(x.squeeze())
        ## Actually compute the means
        non0labels = (squash(nsamples_per_class) != 0)
        means[non0labels] /= nsamples_per_class[non0labels]

        # Estimate variances
        # better a loop than repmat! ;)
        for s, l in zip(X, labels):
            il = label2index[l]          # index of the label
            variances[il] += (s - means[il])**2

        ## Actually compute the variances
        if params.common_variance:
            # we need to get the global std
            cvar = N.sum(variances, axis=0)/nsamples # sum across labels
            # broadcast the same variance across labels
            variances[:] = cvar
        else:
            variances[non0labels] /= nsamples_per_class[non0labels]

        # Store prior probabilities
        prior = params.prior
        if prior == 'uniform':
            self.priors = N.ones((nlabels,))/nlabels
        elif prior == 'laplacian_smoothing':
            self.priors = (1+squash(nsamples_per_class)) \
                          / (float(nsamples) + nlabels)
        elif prior == 'ratio':
            self.priors = squash(nsamples_per_class) / float(nsamples)
        else:
            raise ValueError(
                "No idea how to handle the '%s' way to compute priors"
                % params.prior)

        # Precompute and store the weighting coefficient for the Gaussian
        if params.logprob:
            # it would be added to the exponent
            self._norm_weight = -0.5 * N.log(2*N.pi*variances)
        else:
            self._norm_weight = 1.0/N.sqrt(2*N.pi*variances)

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "training finished on data.shape=%s " % (X.shape, )
                  + "min:max(data)=%f:%f" % (N.min(X), N.max(X)))


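    # Editor's note -- a worked example of the prior schemes computed above
    # (illustrative numbers only): with nsamples=10 split as 8 samples of
    # class A and 2 of class B (nlabels=2),
    #
    #   'ratio'               -> priors = [8/10, 2/10]              = [0.8, 0.2]
    #   'laplacian_smoothing' -> priors = [(8+1)/(10+2), (2+1)/(10+2)]
    #                                   = [0.75, 0.25]
    #   'uniform'             -> priors = [1/2, 1/2]                = [0.5, 0.5]
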
    def untrain(self):
        """Untrain classifier and reset all learnt params
        """
        self.means = None
        self.variances = None
        self.ulabels = None
        self.priors = None
        super(GNB, self).untrain()


    def _predict(self, data):
        """Predict the output for the provided data.
        """
        params = self.params
        # argument of exponentiation
        scaled_distances = \
            -0.5 * (((data - self.means[:, N.newaxis, ...])**2) \
                    / self.variances[:, N.newaxis, ...])
        if params.logprob:
            # if self.params.common_variance:
            # XXX YOH:
            # For decision there is no need to actually compute
            # properly scaled p, ie 1/sqrt(2pi * sigma_i) could be
            # simply discarded since it is common across features AND
            # classes
            # For completeness -- computing everything now even in logprob
            lprob_csfs = self._norm_weight[:, N.newaxis, ...] + scaled_distances

            # XXX for now just cut/paste with different operators, but
            #     could just bind them and reuse in the same equations
            # Naive part -- just a product of probabilities across features
            ## First we need to reshape to get class x samples x features
            lprob_csf = lprob_csfs.reshape(
                lprob_csfs.shape[:2] + (-1,))
            ## Now -- sum across features
            lprob_cs = lprob_csf.sum(axis=2)

            # Incorporate class probabilities:
            prob_cs_cp = lprob_cs + N.log(self.priors[:, N.newaxis])

        else:
            # Just a regular Normal distribution with per
            # feature/class mean and variances
            prob_csfs = \
                self._norm_weight[:, N.newaxis, ...] * N.exp(scaled_distances)

            # Naive part -- just a product of probabilities across features
            ## First we need to reshape to get class x samples x features
            prob_csf = prob_csfs.reshape(
                prob_csfs.shape[:2] + (-1,))
            ## Now -- product across features
            prob_cs = prob_csf.prod(axis=2)

            # Incorporate class probabilities:
            prob_cs_cp = prob_cs * self.priors[:, N.newaxis]

        # Normalize by evidence P(data)
        if params.normalize:
            if params.logprob:
                prob_cs_cp_real = N.exp(prob_cs_cp)
            else:
                prob_cs_cp_real = prob_cs_cp
            prob_s_cp_marginals = N.sum(prob_cs_cp_real, axis=0)
            if params.logprob:
                prob_cs_cp -= N.log(prob_s_cp_marginals)
            else:
                prob_cs_cp /= prob_s_cp_marginals

        # Take the class with maximal (log)probability
        winners = prob_cs_cp.argmax(axis=0)
        predictions = [self.ulabels[c] for c in winners]


        self.values = prob_cs_cp.T        # set to the probabilities per class

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " %
                  (data.shape, N.min(data), N.max(data)))

        return predictions


    # XXX Later come up with some
    #     could be a simple t-test maps using distributions
    #     per each class
    #def getSensitivityAnalyzer(self, **kwargs):
    #    """Returns a sensitivity analyzer for GNB."""
    #    return GNBWeights(self, **kwargs)


    # XXX Is there any reason to use properties?
    #means = property(lambda self: self.__biases)
    #variances = property(lambda self: self.__weights)



## class GNBWeights(Sensitivity):
##     """`SensitivityAnalyzer` that reports the weights GNB trained
##     on a given `Dataset`.

##     """

##     _LEGAL_CLFS = [ GNB ]

##     def _call(self, dataset=None):
##         """Extract weights from GNB classifier.

##         GNB always has weights available, so nothing has to be computed here.
##         """
##         clf = self.clf
##         means = clf.means
##         # XXX we can do something better ;)
##         return means