Package mvpa :: Package misc :: Module data_generators
[hide private]
[frames] | no frames]

Source Code for Module mvpa.misc.data_generators

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Miscelaneous data generators for unittests and demos""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import numpy as N 
 14   
 15  from mvpa.datasets import Dataset 
 16   
 17  if __debug__: 
 18      from mvpa.base import debug 
 19   
20 -def multipleChunks(func, n_chunks, *args, **kwargs):
21 """Replicate datasets multiple times raising different chunks 22 23 Given some randomized (noisy) generator of a dataset with a single 24 chunk call generator multiple times and place results into a 25 distinct chunks 26 """ 27 for chunk in xrange(n_chunks): 28 dataset_ = func(*args, **kwargs) 29 dataset_.chunks[:] = chunk + 1 30 if chunk == 0: 31 dataset = dataset_ 32 else: 33 dataset += dataset_ 34 35 return dataset
36 37
38 -def dumbFeatureDataset():
39 """Create a very simple dataset with 2 features and 3 labels 40 """ 41 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1], 42 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1], 43 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0], 44 [12, 1]] 45 regs = ([1] * 8) + ([2] * 8) + ([3] * 8) 46 47 return Dataset(samples=data, labels=regs)
48 49
50 -def dumbFeatureBinaryDataset():
51 """Very simple binary (2 labels) dataset 52 """ 53 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1], 54 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1], 55 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0], 56 [12, 1]] 57 regs = ([0] * 12) + ([1] * 12) 58 59 return Dataset(samples=data, labels=regs)
60 61 62
63 -def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5, 64 means=None, nonbogus_features=None, snr=3.0):
65 """Generate a univariate dataset with normal noise and specified means. 66 67 :Keywords: 68 perlabel : int 69 Number of samples per each label 70 nlabels : int 71 Number of labels in the dataset 72 nfeatures : int 73 Total number of features (including bogus features which carry 74 no label-related signal) 75 nchunks : int 76 Number of chunks (perlabel should be multiple of nchunks) 77 means : None or list of float or ndarray 78 Specified means for each of features among nfeatures. 79 nonbogus_features : None or list of int 80 Indexes of non-bogus features (1 per label) 81 snr : float 82 Signal-to-noise ration assuming that signal has std 1.0 so we 83 just divide random normal noise by snr 84 85 Probably it is a generalization of pureMultivariateSignal where 86 means=[ [0,1], [1,0] ] 87 88 Specify either means or nonbogus_features so means get assigned 89 accordingly 90 """ 91 92 data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr) 93 if (means is None) and (not nonbogus_features is None): 94 if len(nonbogus_features) > nlabels: 95 raise ValueError, "Can't assign simply a feature to a " + \ 96 "class: more nonbogus_features than labels" 97 means = N.zeros((len(nonbogus_features), nfeatures)) 98 # pure multivariate -- single bit per feature 99 for i in xrange(len(nonbogus_features)): 100 means[i, nonbogus_features[i]] = 1.0 101 if not means is None: 102 # add mean 103 data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0) 104 # bring it 'under 1', since otherwise some classifiers have difficulties 105 # during optimization 106 data = 1.0/(N.max(N.abs(data))) * data 107 labels = N.concatenate([N.repeat('L%d' % i, perlabel) 108 for i in range(nlabels)]) 109 chunks = N.concatenate([N.repeat(range(nchunks), 110 perlabel/nchunks) for i in range(nlabels)]) 111 ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True) 112 ds.nonbogus_features = nonbogus_features 113 return ds
114
115 -def pureMultivariateSignal(patterns, signal2noise = 1.5, chunks=None):
116 """ Create a 2d dataset with a clear multivariate signal, but no 117 univariate information. 118 119 :: 120 121 %%%%%%%%% 122 % O % X % 123 %%%%%%%%% 124 % X % O % 125 %%%%%%%%% 126 """ 127 128 # start with noise 129 data = N.random.normal(size=(4*patterns, 2)) 130 131 # add signal 132 data[:2*patterns, 1] += signal2noise 133 134 data[2*patterns:4*patterns, 1] -= signal2noise 135 data[:patterns, 0] -= signal2noise 136 data[2*patterns:3*patterns, 0] -= signal2noise 137 data[patterns:2*patterns, 0] += signal2noise 138 data[3*patterns:4*patterns, 0] += signal2noise 139 140 # two conditions 141 regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns)) 142 143 return Dataset(samples=data, labels=regs, chunks=chunks)
144 145
146 -def normalFeatureDataset__(dataset=None, labels=None, nchunks=None, 147 perlabel=50, activation_probability_steps=1, 148 randomseed=None, randomvoxels=False):
149 """ NOT FINISHED """ 150 raise NotImplementedError 151 152 if dataset is None and labels is None: 153 raise ValueError, \ 154 "Provide at least labels or a background dataset" 155 156 if dataset is None: 157 nlabels = len(labels) 158 else: 159 nchunks = len(dataset.uniquechunks) 160 161 N.random.seed(randomseed) 162 163 # Create a sequence of indexes from which to select voxels to be used 164 # for features 165 if randomvoxels: 166 indexes = N.random.permutation(dataset.nfeatures) 167 else: 168 indexes = N.arange(dataset.nfeatures) 169 170 allind, maps = [], [] 171 if __debug__: 172 debug('DG', "Ugly creation of the copy of background") 173 174 dtype = dataset.samples.dtype 175 if not N.issubdtype(dtype, N.float): 176 dtype = N.float 177 totalsignal = N.zeros(dataset.samples.shape, dtype=dtype) 178 179 for l in xrange(len(labels)): 180 label = labels[l] 181 if __debug__: 182 debug('DG', "Simulating independent labels for %s" % label) 183 184 # What sample ids belong to this label 185 labelids = dataset.idsbylabels(label) 186 187 188 # What features belong here and what is left over 189 nfeatures = perlabel * activation_probability_steps 190 ind, indexes = indexes[0:nfeatures], \ 191 indexes[nfeatures+1:] 192 allind += list(ind) # store what indexes we used 193 194 # Create a dataset only for 'selected' features NB there is 195 # sideeffect that selectFeatures will sort those ind provided 196 ds = dataset.selectFeatures(ind) 197 ds.samples[:] = 0.0 # zero them out 198 199 # assign data 200 prob = [1.0 - x*1.0/activation_probability_steps 201 for x in xrange(activation_probability_steps)] 202 203 # repeat so each feature gets itw own 204 probabilities = N.repeat(prob, perlabel) 205 if __debug__: 206 debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities)) 207 208 for chunk in ds.uniquechunks: 209 chunkids = ds.idsbychunks(chunk) # samples in this chunk 210 ids = list(set(chunkids).intersection(set(labelids))) 211 chunkvalue = N.random.uniform() # random number to decide either 212 # to 'activate' the voxel 213 for id_ in ids: 214 ds.samples[id_, :] = \ 215 (chunkvalue <= probabilities).astype('float') 216 217 maps.append(N.array(probabilities, copy=True)) 218 219 signal = ds.map2Nifti(ds.samples) 220 totalsignal[:, ind] += ds.samples 221 222 # figure out average variance across all 'working' features 223 wfeatures = dataset.samples[:, allind] 224 meanstd = N.mean(N.std(wfeatures, 1)) 225 if __debug__: 226 debug('DG', "Mean deviation is %f" % meanstd) 227 228 totalsignal *= meanstd * options.snr 229 # add signal on top of background 230 dataset.samples += totalsignal 231 232 return dataset
233
234 -def getMVPattern(s2n):
235 """Simple multivariate dataset""" 236 return multipleChunks(pureMultivariateSignal, 6, 237 5, s2n, 1)
238 239
240 -def wr1996(size=200):
241 """Generate '6d robot arm' dataset (Williams and Rasmussen 1996) 242 243 Was originally created in order to test the correctness of the 244 implementation of kernel ARD. For full details see: 245 http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard 246 247 x_1 picked randomly in [-1.932, -0.453] 248 x_2 picked randomly in [0.534, 3.142] 249 r_1 = 2.0 250 r_2 = 1.3 251 f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025) 252 etc. 253 254 Expected relevances: 255 ell_1 1.804377 256 ell_2 1.963956 257 ell_3 8.884361 258 ell_4 34.417657 259 ell_5 1081.610451 260 ell_6 375.445823 261 sigma_f 2.379139 262 sigma_n 0.050835 263 """ 264 intervals = N.array([[-1.932, -0.453], [0.534, 3.142]]) 265 r = N.array([2.0, 1.3]) 266 x = N.random.rand(size, 2) 267 x *= N.array(intervals[:, 1]-intervals[:, 0]) 268 x += N.array(intervals[:, 0]) 269 if __debug__: 270 for i in xrange(2): 271 debug('DG', '%d columnt Min: %g Max: %g' % 272 (i, x[:, i].min(), x[:, i].max())) 273 y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \ 274 N.random.randn(size)*N.sqrt(0.0025) 275 y -= y.mean() 276 x34 = x + N.random.randn(size, 2)*0.02 277 x56 = N.random.randn(size, 2) 278 x = N.hstack([x, x34, x56]) 279 return Dataset(samples=x, labels=y)
280 281
282 -def sinModulated(n_instances, n_features, 283 flat=False, noise=0.4):
284 """ Generate a (quite) complex multidimensional non-linear dataset 285 286 Used for regression testing. In the data label is a sin of a x^2 + 287 uniform noise 288 """ 289 if flat: 290 data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi) 291 data.resize(n_instances, n_features) 292 else: 293 data = N.random.rand(n_instances, n_features)*N.pi 294 label = N.sin((data**2).sum(1)).round() 295 label += N.random.rand(label.size)*noise 296 return Dataset(samples=data, labels=label)
297
298 -def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2, 299 data_noise=0.4, noise=0.1):
300 """ Generates simple dataset for linear regressions 301 302 Generates chirp signal, populates n_nonbogus_features out of 303 n_features with it with different noise level and then provides 304 signal itself with additional noise as labels 305 """ 306 x = N.linspace(0, 1, n_instances) 307 y = N.sin((10 * N.pi * x **2)) 308 309 data = N.random.normal(size=(n_instances, n_features ))*data_noise 310 for i in xrange(n_nonbogus_features): 311 data[:, i] += y[:] 312 313 labels = y + N.random.normal(size=(n_instances,))*noise 314 315 return Dataset(samples=data, labels=labels)
316 317
318 -def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
319 """Generate a dataset from a linear function with AWGN 320 (Added White Gaussian Noise). 321 322 It can be multidimensional if 'slope' is a vector. If flat is True 323 (in 1 dimesion) generate equally spaces samples instead of random 324 ones. This is useful for the test phase. 325 """ 326 dimensions = 1 327 if isinstance(slope, N.ndarray): 328 dimensions = slope.size 329 330 if flat and dimensions == 1: 331 x = N.linspace(0, 1, size)[:, N.newaxis] 332 else: 333 x = N.random.rand(size, dimensions) 334 335 y = N.dot(x, slope)[:, N.newaxis] \ 336 + (N.random.randn(*(x.shape[0], 1)) * noise_std) + intercept 337 338 return Dataset(samples=x, labels=y)
339 340
341 -def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
342 """ 343 """ 344 x = [] 345 y = [] 346 labels = [] 347 for fx in sfx: 348 nx = N.random.normal(size=size_per_fx) 349 ny = fx(nx) + N.random.normal(size=nx.shape, scale=noise_std) 350 x.append(nx) 351 y.append(ny) 352 353 # whenever larger than first function value 354 labels.append(N.array(ny < dfx(nx), dtype='int')) 355 356 samples = N.array((N.hstack(x), N.hstack(y))).squeeze().T 357 labels = N.hstack(labels).squeeze().T 358 359 samples += N.array(center) 360 361 return Dataset(samples=samples, labels=labels)
362 363
364 -def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0, 365 x_min=-2.0, x_max=3.0, sigma=0.2):
366 """A straight line with some Gaussian noise. 367 """ 368 x = N.linspace(start=x_min, stop=x_max, num=size) 369 noise = N.random.randn(size)*sigma 370 y = x * slope + intercept + noise 371 return Dataset(samples=x.reshape((-1, 1)), labels=y)
372