Source Code for Module mvpa.datasets.miscfx

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Misc functions performing operations on datasets.

All the functions defined in this module must accept a dataset as the
first argument since they are bound to the Dataset class in the trailer.
"""

__docformat__ = 'restructuredtext'

from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.base.dochelpers import table2string
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
    """Z-Score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done *in place*. Data upcasting is done
    automatically if necessary into `targetdtype`.

    If `baselinelabels` is provided and `mean` or `std` are not, the
    corresponding measure is computed only from the samples with labels in
    `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated per chunk.
    """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() <= 2:
        warning("Z-scoring chunk-wise and one chunk with less than three "
                "samples will set features in these samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    # cast to floating point datatype if necessary
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if nothing provided -- mean/std on all samples
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # calculate mean if necessary
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # calculate std-deviation if necessary
        # XXX YOH: would that be actually what we want?
        #          may be we want actually estimate of deviation from the mean,
        #          which per se might be not statsamples.mean (see above)?
        #          if logic to be changed -- adjust ZScoreMapper as well
        if std is None:
            std = statsamples.std(**axisarg)

        # do the z-scoring
        if pervoxel:
            # Assure std being an array (preserving a scalar value if one
            # was supplied)
            if N.isscalar(std):
                std = N.ones(samples.shape[1]) * std
            else:
                # and so we don't perform list comparison to 0
                std = N.asanyarray(std)
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
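
# Usage sketch for `zscore` (illustrative only; assumes the usual
# Dataset(samples=..., labels=..., chunks=...) constructor and that the
# @datasetmethod binding also exposes it as ds.zscore(...)):
#
#   ds = Dataset(samples=N.random.normal(10, 2, size=(8, 5)),
#                labels=[0, 1] * 4,
#                chunks=[0] * 4 + [1] * 4)
#   zscore(ds, perchunk=True)   # in-place: within each chunk every feature
#                               # ends up with mean ~0 and std ~1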


@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in
    the way NumPy uses it (e.g. `numpy.mean`, `numpy.var`).

    :Returns:
      a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)
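
# Usage sketch for `aggregateFeatures` (illustrative only): collapse all
# features of each sample into a single value, here the per-sample mean.
#
#   ds = Dataset(samples=N.arange(12).reshape(4, 3),
#                labels=[0, 0, 1, 1], chunks=[0, 1, 0, 1])
#   agg = aggregateFeatures(ds, N.mean)
#   # agg.samples has shape (4, 1): one aggregated feature per sample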


@datasetmethod
def removeInvariantFeatures(dataset):
    """Returns a new dataset with all invariant features removed.
    """
    return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
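
# Usage sketch for `removeInvariantFeatures` (illustrative only): features
# whose values never vary across samples (std == 0) are dropped.
#
#   samples = N.array([[1, 5, 0], [2, 5, 0], [3, 5, 0]])
#   ds = Dataset(samples=samples, labels=[0, 1, 0], chunks=[0, 1, 2])
#   ds2 = removeInvariantFeatures(ds)
#   # only the first column varies, so ds2.nfeatures == 1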


@datasetmethod
def coarsenChunks(source, nchunks=4):
    """Change chunking of the dataset.

    Group chunks into groups to match the desired number of chunks. Makes
    sense if originally there was no strong grouping into chunks, or each
    sample was independent and thus belonged to its own chunk.

    :Parameters:
      source : Dataset or list of chunk ids
        dataset or list of chunk ids to operate on. If Dataset, then its
        chunks get modified
      nchunks : int
        desired number of chunks
    """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # figure out number of samples per each chunk
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # now we need to group chunks to get more or less equalized number
    # of samples per chunk. No sophistication is done -- just
    # consecutively group to get close to desired number of samples
    # per chunk
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks_used = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # time to get a new chunk?
        if (samples_counted + cur_chunk_nsamples
            >= (nchunks_used + 1) * avg_chunk_size) or i == nchunks_orig - 1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks_used += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap using groups
    # create dictionary
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, len(chunks_groups), source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
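
# Usage sketch for `coarsenChunks` (illustrative only): eight singleton
# chunks consecutively grouped into four chunks of two samples each.
#
#   coarsenChunks([0, 1, 2, 3, 4, 5, 6, 7], nchunks=4)
#   # -> [0, 0, 1, 1, 2, 2, 3, 3]
#   # passing a Dataset instead rewrites its .chunks in-place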


@datasetmethod
def getSamplesPerChunkLabel(dataset):
    """Returns an array with the number of samples per label in each chunk.

    Array shape is (chunks x labels).

    :Parameters:
      dataset : Dataset
        Source dataset.
    """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count
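
# Usage sketch for `getSamplesPerChunkLabel` (illustrative only):
#
#   ds = Dataset(samples=N.zeros((6, 2)),
#                labels=[0, 0, 1, 0, 1, 1],
#                chunks=[0, 0, 0, 1, 1, 1])
#   getSamplesPerChunkLabel(ds)
#   # rows correspond to chunks, columns to labels:
#   # [[2, 1],
#   #  [1, 2]]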


class SequenceStats(dict):
    """Simple helper to provide representation of sequence statistics

    Matlab analog:
    http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m

    WARNING: Experimental -- API might change without warning!
    Current implementation is ugly!
    """

    def __init__(self, seq, order=2): #, chunks=None, perchunk=False):
        """Initialize SequenceStats

        :Parameters:
          seq : list or ndarray
            Actual sequence of labels

        :Keywords:
          order : int
            Maximal order of counter-balancing check. For perfect
            counterbalancing all matrices should be identical
        """
        """
        chunks : None or list or ndarray
          Chunks to use if `perchunk`=True
        perchunk .... TODO
        """
        dict.__init__(self)
        self.order = order
        self._seq = seq
        self.stats = None
        self._str_stats = None
        self.__compute()


    def __repr__(self):
        """Representation of SequenceStats
        """
        return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)

    def __str__(self):
        return self._str_stats

    def __compute(self):
        """Compute stats and string representation
        """
        # Do actual computation
        order = self.order
        seq = list(self._seq)             # assure list
        nsamples = len(seq)               # number of samples/labels
        ulabels = sorted(list(set(seq)))  # unique labels
        nlabels = len(ulabels)            # number of labels

        # mapping for labels
        labels_map = dict([(l, i) for i, l in enumerate(ulabels)])

        # map sequence first
        seqm = [labels_map[i] for i in seq]
        nperlabel = N.bincount(seqm)

        res = dict(ulabels=ulabels)
        # Estimate counter-balance
        cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
        for cb in xrange(order):
            for i, j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
                cbcounts[cb, i, j] += 1
        res['cbcounts'] = cbcounts

        """
        Let's compute relative counter-balancing
        Ideally, nperlabel[i]/nlabels should precede each label
        """
        # Autocorrelation
        corr = []
        # for all possible shifts:
        for shift in xrange(1, nsamples):
            shifted = seqm[shift:] + seqm[:shift]
            # ??? Use pearsonr with p may be?
            corr += [N.corrcoef(seqm, shifted)[0, 1]]
        # ??? report high (anti)correlations?
        res['corrcoef'] = corr = N.array(corr)
        res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
        self.update(res)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        t = [[""] * (1 + self.order*(nlabels+1)) for i in xrange(nlabels+1)]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1+cb*(nlabels+1)] = "O%d" % (cb+1)
            for i in xrange(nlabels+1):
                t[i][(cb+1)*(nlabels+1)] = " | "
            m = cbcounts[cb]
            # ??? there should be a better way to get indexes
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1+i][1+cb*(nlabels+1)+j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout


    def plot(self):
        """Plot correlation coefficients
        """
        externals.exists('pylab', raiseException=True)
        import pylab as P
        P.plot(self['corrcoef'])
        P.title('Auto-correlation of the sequence')
        P.xlabel('Offset')
        P.ylabel('Correlation Coefficient')
        P.show()
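
# Usage sketch for `SequenceStats` (illustrative only): it is a dict, so the
# computed measures ('cbcounts', 'corrcoef', 'sumabscorr', 'ulabels') can be
# accessed by key, while str() gives the textual summary.
#
#   ss = SequenceStats([0, 1, 0, 1, 1, 0, 0, 1], order=2)
#   print ss                  # counter-balance table and correlation summary
#   ss['cbcounts'].shape      # -> (2, 2, 2): order x labels x labels
#   ss.plot()                 # auto-correlation plot (requires pylab)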