9 """Gaussian Naive Bayes Classifier
10
11 EXPERIMENTAL ;)
12 Basic implementation of Gaussian Naive Bayes classifier.
13 """
14
15 __docformat__ = 'restructuredtext'

import numpy as N

from numpy import ones, zeros, sum, abs, isfinite, dot
from mvpa.base import warning, externals
from mvpa.clfs.base import Classifier
from mvpa.misc.param import Parameter
from mvpa.misc.state import StateVariable

if __debug__:
    from mvpa.base import debug


class GNB(Classifier):
    """Gaussian Naive Bayes `Classifier`.

    GNB is a probabilistic classifier relying on Bayes rule to
    estimate posterior probabilities of labels given the data.  Its
    naive assumption is the independence of the features, which
    allows per-feature likelihoods to be combined by a simple product
    across the likelihoods of the "independent" features.
    See http://en.wikipedia.org/wiki/Naive_bayes for more information.

    The implementation provided here is "naive" in its own right --
    various aspects could be improved, but it has its advantages:

    - implementation is simple and straightforward
    - no data copying while considering samples of a specific class
    - provides alternative ways to assess the prior distribution of the
      classes in the case of unbalanced sets of samples (see parameter
      `prior`)
    - makes use of the NumPy broadcasting mechanism, so should be
      relatively efficient
    - should work for any dimensionality of samples

    GNB is listed both as a linear and a non-linear classifier, since
    the specifics of the separating boundary depend on the data and/or
    parameters: linear separation is achieved whenever samples are
    balanced (or prior='uniform') and features have the same variance
    across different classes (e.g. with common_variance=True to enforce
    this).

    Whenever decisions are made based on log-probabilities (parameter
    logprob=True, which is the default), the state variable `values`,
    if enabled, also contains log-probabilities.  Note that
    normalization by the evidence (P(data)) is disabled by default,
    since it has no impact per se on the classification decision.  Set
    the parameter normalize to True if you want to access properly
    scaled probabilities in the `values` state variable.

    A usage sketch is provided at the end of this module.
    """

    _clf_internals = [ 'gnb', 'linear', 'non-linear',
                       'binary', 'multiclass' ]

    common_variance = Parameter(False, allowedtype='bool',
             doc="""Use the same variance across all classes.""")
    prior = Parameter('laplacian_smoothing',
             allowedtype='basestring',
             choices=["laplacian_smoothing", "uniform", "ratio"],
             doc="""How to compute the prior distribution.""")

    logprob = Parameter(True, allowedtype='bool',
             doc="""Operate on log probabilities.  Preferable to avoid unneeded
             exponentiation and loss of precision.
             If set, log-probabilities are stored in `values`.""")
    normalize = Parameter(False, allowedtype='bool',
             doc="""Normalize (log)prob by P(data).  Requires probabilities, thus
             for the `logprob` case it would require exponentiation of the
             'logprob's; disabled by default since it does not impact the
             classification output.
             """)

    def __init__(self, **kwargs):
        """Initialize a GNB classifier.
        """

        # init base class first
        Classifier.__init__(self, **kwargs)

        # pylint friendly initializations
        self.means = None
        """Means of features per class"""
        self.variances = None
        """Variances per class, but "vars" is taken ;)"""
        self.ulabels = None
        """Labels classifier was trained on"""
        self.priors = None
        """Class probabilities"""

        # Define internal state of classifier
        self._norm_weight = None
113 """Train the classifier using `dataset` (`Dataset`).
114 """
115 params = self.params
116
117
118 X = dataset.samples
119 labels = dataset.labels
120 self.ulabels = ulabels = dataset.uniquelabels
121 nlabels = len(ulabels)
122
123 label2index = dict((l, il) for il, l in enumerate(ulabels))
124
125
126 nsamples = len(X)
127 s_shape = X.shape[1:]
128
129 self.means = means = \
130 N.zeros((nlabels, ) + s_shape)
131 self.variances = variances = \
132 N.zeros((nlabels, ) + s_shape)
133
134 nsamples_per_class = N.zeros((nlabels,) + (1,)*len(s_shape))
135
136
137 for s, l in zip(X, labels):
138 il = label2index[l]
139 nsamples_per_class[il] += 1
140 means[il] += s
141
142
143 squash = lambda x: N.atleast_1d(x.squeeze())
144
145 non0labels = (squash(nsamples_per_class) != 0)
146 means[non0labels] /= nsamples_per_class[non0labels]
147
148
149
150 for s, l in zip(X, labels):
151 il = label2index[l]
152 variances[il] += (s - means[il])**2
153
154
155 if params.common_variance:
156
157 cvar = N.sum(variances, axis=0)/nsamples
158
159 variances[:] = cvar
160 else:
161 variances[non0labels] /= nsamples_per_class[non0labels]
162
163
164 prior = params.prior
165 if prior == 'uniform':
166 self.priors = N.ones((nlabels,))/nlabels
167 elif prior == 'laplacian_smoothing':
168 self.priors = (1+squash(nsamples_per_class)) \
169 / (float(nsamples) + nlabels)
170 elif prior == 'ratio':
171 self.priors = squash(nsamples_per_class) / float(nsamples)
172 else:
173 raise ValueError(
174 "No idea on how to handle '%s' way to compute priors"
175 % params.prior)
176
177
178 if params.logprob:
179
180 self._norm_weight = -0.5 * N.log(2*N.pi*variances)
181 else:
182 self._norm_weight = 1.0/N.sqrt(2*N.pi*variances)
183
184 if __debug__ and 'GNB' in debug.active:
185 debug('GNB', "training finished on data.shape=%s " % (X.shape, )
186 + "min:max(data)=%f:%f" % (N.min(X), N.max(X)))
187

    def untrain(self):
        """Untrain the classifier and reset all learnt parameters.
        """
        self.means = None
        self.variances = None
        self.ulabels = None
        self.priors = None
        super(GNB, self).untrain()


    def _predict(self, data):
        """Predict the output for the provided data.
        """
        params = self.params
        # exponent of the Gaussian: -(x - mean)^2 / (2 * variance),
        # per class x sample x feature
        scaled_distances = \
            -0.5 * (((data - self.means[:, N.newaxis, ...])**2) \
                    / self.variances[:, N.newaxis, ...])
        if params.logprob:
            # For the decision itself the normalization weight could be
            # dropped when it is common across classes, but it is kept
            # here for completeness
            # per class x sample x feature log-likelihoods
            lprob_csfs = self._norm_weight[:, N.newaxis, ...] + scaled_distances

            # Naive part -- combine across features, i.e. sum the
            # log-likelihoods.  First reshape to class x samples x features
            lprob_csf = lprob_csfs.reshape(
                lprob_csfs.shape[:2] + (-1,))
            # now sum across features
            lprob_cs = lprob_csf.sum(axis=2)

            # Incorporate class prior probabilities
            prob_cs_cp = lprob_cs + N.log(self.priors[:, N.newaxis])

        else:
            # Regular Normal density with per feature/class mean and variance
            prob_csfs = \
                 self._norm_weight[:, N.newaxis, ...] * N.exp(scaled_distances)

            # Naive part -- combine across features, i.e. take the
            # product.  First reshape to class x samples x features
            prob_csf = prob_csfs.reshape(
                prob_csfs.shape[:2] + (-1,))
            # now take the product across features
            prob_cs = prob_csf.prod(axis=2)

            # Incorporate class prior probabilities
            prob_cs_cp = prob_cs * self.priors[:, N.newaxis]

        # Normalize by the evidence P(data)
        if params.normalize:
            if params.logprob:
                prob_cs_cp_real = N.exp(prob_cs_cp)
            else:
                prob_cs_cp_real = prob_cs_cp
            prob_s_cp_marginals = N.sum(prob_cs_cp_real, axis=0)
            if params.logprob:
                prob_cs_cp -= N.log(prob_s_cp_marginals)
            else:
                prob_cs_cp /= prob_s_cp_marginals

        # Take the class with the maximum (log)probability
        winners = prob_cs_cp.argmax(axis=0)
        predictions = [self.ulabels[c] for c in winners]

        # store per-class (log)probabilities as samples x classes
        self.values = prob_cs_cp.T

        if __debug__ and 'GNB' in debug.active:
            debug('GNB', "predict on data.shape=%s min:max(data)=%f:%f " %
                  (data.shape, N.min(data), N.max(data)))

        return predictions
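

# A minimal usage sketch, kept at module level rather than in the class
# docstring.  It assumes the PyMVPA `Dataset` constructor taking `samples`
# and `labels` (the import path is assumed as well) and the standard
# `Classifier.train`/`.predict` interface; the random data is purely
# illustrative.
if __name__ == '__main__':
    from mvpa.datasets import Dataset

    # two classes of 10 samples with 4 features each, second class shifted
    samples = N.vstack((N.random.randn(10, 4),
                        N.random.randn(10, 4) + 1.0))
    ds = Dataset(samples=samples, labels=[0] * 10 + [1] * 10)

    clf = GNB(prior='uniform', common_variance=False)
    clf.train(ds)
    # predictions are returned as a list of labels, one per sample
    print clf.predict(ds.samples)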