9 """Misc function performing operations on datasets.
10
11 All the functions defined in this module must accept dataset as the
12 first argument since they are bound to Dataset class in the trailer.
13 """
14
15 __docformat__ = 'restructuredtext'
16
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.base.dochelpers import table2string
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
38 """Z-Score the samples of a `Dataset` (in-place).
39
40 `mean` and `std` can be used to pass custom values to the z-scoring.
41 Both may be scalars or arrays.
42
43 All computations are done *in place*. Data upcasting is done
44 automatically if necessary into `targetdtype`
45
46 If `baselinelabels` provided, and `mean` or `std` aren't provided, it would
47 compute the corresponding measure based only on labels in `baselinelabels`
48
49 If `perchunk` is True samples within the same chunk are z-scored independent
50 of samples from other chunks, e.i. mean and standard deviation are
51 calculated individually.
52 """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() <= 2:
        warning("Z-scoring chunk-wise with a chunk of less than three "
                "samples will set features in those samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    # integer data would silently truncate in-place operations,
    # so upcast to `targetdtype` first
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if no dedicated stats samples are provided, compute mean/std
            # on all samples
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # compute mean if not provided
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # compute standard deviation if not provided
        if std is None:
            std = statsamples.std(**axisarg)

        if pervoxel:
            if N.isscalar(std):
                # broadcast a scalar std into a per-feature array
                std = std * N.ones(samples.shape[1])
            else:
                std = N.asanyarray(std)
            # leave invariant features (std == 0) untouched to avoid
            # division by zero
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = set(dataset.idsbylabels(baselinelabels))

    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                # restrict the stats samples to the baseline samples
                # within this chunk
                statslicer = list(statids.intersection(set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
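
# Usage sketch (illustrative; assumes a hypothetical Dataset `ds` with
# samples, labels and chunks -- not defined in this module):
#
#   zscore(ds)                      # chunk-wise z-scoring, per feature
#   zscore(ds, perchunk=False)      # one mean/std across all samples
#   zscore(ds, baselinelabels=[0])  # estimate mean/std from label-0 samples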


@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in
    the way that NumPy uses it (e.g. NumPy's mean, var).

    :Returns:
      a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)
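
# Usage sketch (illustrative; `ds` is a hypothetical Dataset): collapse each
# sample into its across-feature mean, yielding a single-feature dataset:
#
#   ds_mean = aggregateFeatures(ds, N.mean)   # ds_mean.nfeatures == 1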


@datasetmethod
def removeInvariantFeatures(dataset):
    """Returns a new dataset with all invariant features removed.
    """
    return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
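
# Usage sketch (illustrative; `ds` is a hypothetical Dataset): a feature with
# zero variance across samples carries no information and is dropped:
#
#   ds.samples[:, 0] = 42                    # make feature 0 invariant
#   ds_clean = removeInvariantFeatures(ds)   # feature 0 is removed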


@datasetmethod
def coarsenChunks(source, nchunks=4):
    """Change the chunking of a dataset.

    Group chunks to match the desired number of chunks. This makes sense if
    originally there was no strong grouping into chunks, or if each sample
    was independent and thus belonged to its own chunk.

    :Parameters:
      source : Dataset or list of chunk ids
        dataset or list of chunk ids to operate on. If a Dataset, its
        chunks get modified in-place
      nchunks : int
        desired number of chunks
    """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # number of samples per original chunk
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # split the original chunks into groups carrying roughly the same
    # number of samples
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks_found = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # close the current group once it reaches the average size,
        # or when we run out of original chunks
        if (samples_counted + cur_chunk_nsamples
            >= (nchunks_found + 1) * avg_chunk_size) or i == nchunks_orig - 1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks_found += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently the logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap original chunk ids onto new group ids
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, nchunks, source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
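
# Usage sketch (illustrative): collapse eight singleton chunks into two
# groups of four; with a Dataset as `source` its chunks attribute would be
# rewritten in-place instead:
#
#   coarsenChunks([0, 1, 2, 3, 4, 5, 6, 7], nchunks=2)
#   # -> [0, 0, 0, 0, 1, 1, 1, 1]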


@datasetmethod
def getSamplesPerChunkLabel(dataset):
    """Returns an array with the number of samples per label in each chunk.

    Array shape is (chunks x labels).

    :Parameters:
      dataset : Dataset
        Source dataset.
    """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count
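
# Usage sketch (illustrative; `ds` is a hypothetical Dataset): for chunks
# [0, 0, 1, 1] and labels [0, 1, 0, 1], getSamplesPerChunkLabel(ds) returns
# [[1, 1], [1, 1]] -- one sample per (chunk, label) cell.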


class SequenceStats(dict):
    """Simple helper to provide representation of sequence statistics

    Matlab analog:
    http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m

    WARNING: Experimental -- the API might change without warning!
    The current implementation is ugly!
    """

    def __init__(self, seq, order=2):
        """Initialize SequenceStats

        :Parameters:
          seq : list or ndarray
            Actual sequence of labels

        :Keywords:
          order : int
            Maximal order of counter-balancing check. For perfect
            counter-balancing all matrices should be identical.
        """
        # TODO: support `chunks` (None, list or ndarray -- chunks to use
        # if `perchunk` is True) and a `perchunk` mode
        dict.__init__(self)
        self.order = order
        self._seq = seq
        self.stats = None
        self._str_stats = None
        self.__compute()
302 """Representation of SequenceStats
303 """
304 return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)
305
307 return self._str_stats
308
310 """Compute stats and string representation
311 """
312
313 order = self.order
314 seq = list(self._seq)
315 nsamples = len(seq)
316 ulabels = sorted(list(set(seq)))
317 nlabels = len(ulabels)
318
319
320 labels_map = dict([(l, i) for i,l in enumerate(ulabels)])
321
322
323 seqm = [labels_map[i] for i in seq]
324 nperlabel = N.bincount(seqm)
325
326 res = dict(ulabels=ulabels)
327
328 cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
329 for cb in xrange(order):
330 for i,j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
331 cbcounts[cb, i, j] += 1
332 res['cbcounts'] = cbcounts
333
334 """
335 Lets compute relative counter-balancing
336 Ideally, nperlabel[i]/nlabels should precede each label
337 """
338
339 corr = []
340
341 for shift in xrange(1, nsamples):
342 shifted = seqm[shift:] + seqm[:shift]
343
344 corr += [N.corrcoef(seqm, shifted)[0, 1]]
345
346 res['corrcoef'] = corr = N.array(corr)
347 res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
348 self.update(res)
349
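
        # Worked example (illustrative, not executed): for
        # seq = ['a', 'b', 'a', 'b'] and order=1, seqm == [0, 1, 0, 1];
        # the lag-1 pairs are (0, 1), (1, 0), (0, 1), hence
        # cbcounts[0] == [[0, 2], [1, 0]] and corr == [-1.0, 1.0, -1.0].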

        # Build the printable counter-balance table: one sub-table per order
        t = [[""] * (1 + self.order * (nlabels + 1))
             for i in xrange(nlabels + 1)]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1 + cb * (nlabels + 1)] = "O%d" % (cb + 1)
            for i in xrange(nlabels + 1):
                t[i][(cb + 1) * (nlabels + 1)] = " | "
            m = cbcounts[cb]
            # fill in the counts
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1 + i][1 + cb * (nlabels + 1) + j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout

    def plot(self):
        """Plot correlation coefficients
        """
        externals.exists('pylab', raiseException=True)
        import pylab as P
        P.plot(self['corrcoef'])
        P.title('Auto-correlation of the sequence')
        P.xlabel('Offset')
        P.ylabel('Correlation Coefficient')
        P.show()
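

# Usage sketch (illustrative): summarize the counter-balancing of a label
# sequence:
#
#   ss = SequenceStats([1, 2, 1, 2, 2, 1], order=2)
#   print ss                # counter-balance tables and correlation summary
#   ss['cbcounts']          # raw (order x nlabels x nlabels) count matrices
#   ss.plot()               # auto-correlation plot (requires pylab)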