1
2
3
4
5
6
7
8
9 """Miscelaneous data generators for unittests and demos"""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.datasets import Dataset
16
17 if __debug__:
18 from mvpa.base import debug
19
21 """Replicate datasets multiple times raising different chunks
22
23 Given some randomized (noisy) generator of a dataset with a single
24 chunk call generator multiple times and place results into a
25 distinct chunks
26 """
27 for chunk in xrange(n_chunks):
28 dataset_ = func(*args, **kwargs)
29 dataset_.chunks[:] = chunk + 1
30 if chunk == 0:
31 dataset = dataset_
32 else:
33 dataset += dataset_
34
35 return dataset
36
37
39 """Create a very simple dataset with 2 features and 3 labels
40 """
41 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
42 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
43 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
44 [12, 1]]
45 regs = ([1] * 8) + ([2] * 8) + ([3] * 8)
46
47 return Dataset(samples=data, labels=regs)
48
49
51 """Very simple binary (2 labels) dataset
52 """
53 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
54 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
55 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
56 [12, 1]]
57 regs = ([0] * 12) + ([1] * 12)
58
59 return Dataset(samples=data, labels=regs)
60
61
62
63 -def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
64 means=None, nonbogus_features=None, snr=3.0):
65 """Generate a univariate dataset with normal noise and specified means.
66
67 :Keywords:
68 perlabel : int
69 Number of samples per each label
70 nlabels : int
71 Number of labels in the dataset
72 nfeatures : int
73 Total number of features (including bogus features which carry
74 no label-related signal)
75 nchunks : int
76 Number of chunks (perlabel should be multiple of nchunks)
77 means : None or list of float or ndarray
78 Specified means for each of features among nfeatures.
79 nonbogus_features : None or list of int
80 Indexes of non-bogus features (1 per label)
81 snr : float
82 Signal-to-noise ration assuming that signal has std 1.0 so we
83 just divide random normal noise by snr
84
85 Probably it is a generalization of pureMultivariateSignal where
86 means=[ [0,1], [1,0] ]
87
88 Specify either means or nonbogus_features so means get assigned
89 accordingly
90 """
91
92 data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
93 if (means is None) and (not nonbogus_features is None):
94 if len(nonbogus_features) > nlabels:
95 raise ValueError, "Can't assign simply a feature to a " + \
96 "class: more nonbogus_features than labels"
97 means = N.zeros((len(nonbogus_features), nfeatures))
98
99 for i in xrange(len(nonbogus_features)):
100 means[i, nonbogus_features[i]] = 1.0
101 if not means is None:
102
103 data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
104
105
106 data = 1.0/(N.max(N.abs(data))) * data
107 labels = N.concatenate([N.repeat('L%d' % i, perlabel)
108 for i in range(nlabels)])
109 chunks = N.concatenate([N.repeat(range(nchunks),
110 perlabel/nchunks) for i in range(nlabels)])
111 ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
112 ds.nonbogus_features = nonbogus_features
113 return ds
114
116 """ Create a 2d dataset with a clear multivariate signal, but no
117 univariate information.
118
119 ::
120
121 %%%%%%%%%
122 % O % X %
123 %%%%%%%%%
124 % X % O %
125 %%%%%%%%%
126 """
127
128
129 data = N.random.normal(size=(4*patterns, 2))
130
131
132 data[:2*patterns, 1] += signal2noise
133
134 data[2*patterns:4*patterns, 1] -= signal2noise
135 data[:patterns, 0] -= signal2noise
136 data[2*patterns:3*patterns, 0] -= signal2noise
137 data[patterns:2*patterns, 0] += signal2noise
138 data[3*patterns:4*patterns, 0] += signal2noise
139
140
141 regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))
142
143 return Dataset(samples=data, labels=regs, chunks=chunks)
144
145
146 -def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
147 perlabel=50, activation_probability_steps=1,
148 randomseed=None, randomvoxels=False):
149 """ NOT FINISHED """
150 raise NotImplementedError
151
152 if dataset is None and labels is None:
153 raise ValueError, \
154 "Provide at least labels or a background dataset"
155
156 if dataset is None:
157 nlabels = len(labels)
158 else:
159 nchunks = len(dataset.uniquechunks)
160
161 N.random.seed(randomseed)
162
163
164
165 if randomvoxels:
166 indexes = N.random.permutation(dataset.nfeatures)
167 else:
168 indexes = N.arange(dataset.nfeatures)
169
170 allind, maps = [], []
171 if __debug__:
172 debug('DG', "Ugly creation of the copy of background")
173
174 dtype = dataset.samples.dtype
175 if not N.issubdtype(dtype, N.float):
176 dtype = N.float
177 totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)
178
179 for l in xrange(len(labels)):
180 label = labels[l]
181 if __debug__:
182 debug('DG', "Simulating independent labels for %s" % label)
183
184
185 labelids = dataset.idsbylabels(label)
186
187
188
189 nfeatures = perlabel * activation_probability_steps
190 ind, indexes = indexes[0:nfeatures], \
191 indexes[nfeatures+1:]
192 allind += list(ind)
193
194
195
196 ds = dataset.selectFeatures(ind)
197 ds.samples[:] = 0.0
198
199
200 prob = [1.0 - x*1.0/activation_probability_steps
201 for x in xrange(activation_probability_steps)]
202
203
204 probabilities = N.repeat(prob, perlabel)
205 if __debug__:
206 debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))
207
208 for chunk in ds.uniquechunks:
209 chunkids = ds.idsbychunks(chunk)
210 ids = list(set(chunkids).intersection(set(labelids)))
211 chunkvalue = N.random.uniform()
212
213 for id_ in ids:
214 ds.samples[id_, :] = \
215 (chunkvalue <= probabilities).astype('float')
216
217 maps.append(N.array(probabilities, copy=True))
218
219 signal = ds.map2Nifti(ds.samples)
220 totalsignal[:, ind] += ds.samples
221
222
223 wfeatures = dataset.samples[:, allind]
224 meanstd = N.mean(N.std(wfeatures, 1))
225 if __debug__:
226 debug('DG', "Mean deviation is %f" % meanstd)
227
228 totalsignal *= meanstd * options.snr
229
230 dataset.samples += totalsignal
231
232 return dataset
233
238
239
241 """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)
242
243 Was originally created in order to test the correctness of the
244 implementation of kernel ARD. For full details see:
245 http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard
246
247 x_1 picked randomly in [-1.932, -0.453]
248 x_2 picked randomly in [0.534, 3.142]
249 r_1 = 2.0
250 r_2 = 1.3
251 f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
252 etc.
253
254 Expected relevances:
255 ell_1 1.804377
256 ell_2 1.963956
257 ell_3 8.884361
258 ell_4 34.417657
259 ell_5 1081.610451
260 ell_6 375.445823
261 sigma_f 2.379139
262 sigma_n 0.050835
263 """
264 intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
265 r = N.array([2.0, 1.3])
266 x = N.random.rand(size, 2)
267 x *= N.array(intervals[:, 1]-intervals[:, 0])
268 x += N.array(intervals[:, 0])
269 if __debug__:
270 for i in xrange(2):
271 debug('DG', '%d columnt Min: %g Max: %g' %
272 (i, x[:, i].min(), x[:, i].max()))
273 y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \
274 N.random.randn(size)*N.sqrt(0.0025)
275 y -= y.mean()
276 x34 = x + N.random.randn(size, 2)*0.02
277 x56 = N.random.randn(size, 2)
278 x = N.hstack([x, x34, x56])
279 return Dataset(samples=x, labels=y)
280
281
282 -def sinModulated(n_instances, n_features,
283 flat=False, noise=0.4):
284 """ Generate a (quite) complex multidimensional non-linear dataset
285
286 Used for regression testing. In the data label is a sin of a x^2 +
287 uniform noise
288 """
289 if flat:
290 data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi)
291 data.resize(n_instances, n_features)
292 else:
293 data = N.random.rand(n_instances, n_features)*N.pi
294 label = N.sin((data**2).sum(1)).round()
295 label += N.random.rand(label.size)*noise
296 return Dataset(samples=data, labels=label)
297
298 -def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2,
299 data_noise=0.4, noise=0.1):
300 """ Generates simple dataset for linear regressions
301
302 Generates chirp signal, populates n_nonbogus_features out of
303 n_features with it with different noise level and then provides
304 signal itself with additional noise as labels
305 """
306 x = N.linspace(0, 1, n_instances)
307 y = N.sin((10 * N.pi * x **2))
308
309 data = N.random.normal(size=(n_instances, n_features ))*data_noise
310 for i in xrange(n_nonbogus_features):
311 data[:, i] += y[:]
312
313 labels = y + N.random.normal(size=(n_instances,))*noise
314
315 return Dataset(samples=data, labels=labels)
316
317
318 -def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
319 """Generate a dataset from a linear function with AWGN
320 (Added White Gaussian Noise).
321
322 It can be multidimensional if 'slope' is a vector. If flat is True
323 (in 1 dimesion) generate equally spaces samples instead of random
324 ones. This is useful for the test phase.
325 """
326 dimensions = 1
327 if isinstance(slope, N.ndarray):
328 dimensions = slope.size
329
330 if flat and dimensions == 1:
331 x = N.linspace(0, 1, size)[:, N.newaxis]
332 else:
333 x = N.random.rand(size, dimensions)
334
335 y = N.dot(x, slope)[:, N.newaxis] \
336 + (N.random.randn(*(x.shape[0], 1)) * noise_std) + intercept
337
338 return Dataset(samples=x, labels=y)
339
340
341 -def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
342 """
343 """
344 x = []
345 y = []
346 labels = []
347 for fx in sfx:
348 nx = N.random.normal(size=size_per_fx)
349 ny = fx(nx) + N.random.normal(size=nx.shape, scale=noise_std)
350 x.append(nx)
351 y.append(ny)
352
353
354 labels.append(N.array(ny < dfx(nx), dtype='int'))
355
356 samples = N.array((N.hstack(x), N.hstack(y))).squeeze().T
357 labels = N.hstack(labels).squeeze().T
358
359 samples += N.array(center)
360
361 return Dataset(samples=samples, labels=labels)
362
363
366 """A straight line with some Gaussian noise.
367 """
368 x = N.linspace(start=x_min, stop=x_max, num=size)
369 noise = N.random.randn(size)*sigma
370 y = x * slope + intercept + noise
371 return Dataset(samples=x.reshape((-1, 1)), labels=y)
372