# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### #
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### #
9 """Unit tests for PyMVPA dataset handling"""

import unittest
import random
import numpy as N
from mvpa.datasets import Dataset
from mvpa.datasets.miscfx import zscore, aggregateFeatures
from mvpa.mappers.mask import MaskMapper
from mvpa.misc.exceptions import DatasetError
from mvpa.support import copy

from tests_warehouse import datasets


class DatasetTests(unittest.TestCase):

    def testAddPatterns(self):
        """Test composition of new datasets by addition of existing ones
        """
        data = Dataset(samples=range(5), labels=1, chunks=1)

        self.failUnlessEqual(
            data.uniquelabels, [1],
            msg="uniquelabels must be correctly recomputed")

        # a simple sequence must be interpreted as a single sample
        self.failUnlessEqual( data.nsamples, 1)
        # check correct sample layout (1x5)
        self.failUnless(
            (data.samples == N.array([[0, 1, 2, 3, 4]])).all() )

        # check for single label and chunk
        self.failUnless( (data.labels == N.array([1])).all() )
        self.failUnless( (data.chunks == N.array([1])).all() )

        # adding a dataset with a mismatching number of features must fail
        self.failUnlessRaises( DatasetError,
                               data.__iadd__, Dataset(samples=N.ones((2,3)),
                                                      labels=1,
                                                      chunks=1))

        # now add two real samples
        dss = datasets['uni2large'].samples
        data += Dataset(samples=dss[:2, :5], labels=2, chunks=2 )
        self.failUnlessEqual( data.nfeatures, 5 )
        self.failUnless((data.labels == N.array([1, 2, 2])).all() )
        self.failUnless((data.chunks == N.array([1, 2, 2])).all() )

        # test automatic chunk assignment
        data += Dataset(samples=dss[3:5, :5], labels=3)
        self.failUnless((data.chunks == N.array([1, 2, 2, 0, 1]) ).all())
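        # the two samples added just above without an explicit 'chunks'
        # argument were auto-assigned chunk ids 0 and 1, numbered
        # independently of the chunks already present -- hence [1, 2, 2, 0, 1]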

        # test unique class labels
        self.failUnless((data.uniquelabels == N.array([1, 2, 3]) ).all())

        # a label vector of the wrong length must fail
        self.failUnlessRaises(DatasetError,
                              Dataset,
                              samples=dss[:4, :5],
                              labels=[ 1, 2, 3 ],
                              chunks=2)

        # a chunk vector of the wrong length must fail as well
        self.failUnlessRaises(DatasetError,
                              Dataset,
                              samples=dss[:4, :5],
                              labels=[ 1, 2, 3, 4 ],
                              chunks=[ 2, 2, 2 ])


    def testFeatureSelection(self):
        """Testing feature selection: sorted/not sorted, feature groups
        """
        origdata = datasets['uni2large'].samples[:10, :20]
        data = Dataset(samples=origdata, labels=2, chunks=2 )

        # define some feature groups: 4 groups of 5 features each
        data.defineFeatureGroups(N.repeat(range(4), 5))

        unmasked = data.samples.copy()

        # default must be no mask
        self.failUnless( data.nfeatures == 20 )

        features_to_select = [3, 0, 17]
        features_to_select_copy = copy.deepcopy(features_to_select)
        features_to_select_sorted = copy.deepcopy(features_to_select)
        features_to_select_sorted.sort()

        bsel = N.array([False]*20)
        bsel[ features_to_select ] = True

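        # all of the following spellings select the same three features;
        # only selectFeatures(..., sort=False) preserves the requested
        # order, the select()-based forms return them sorted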
        for sel, issorted in \
            [(data.selectFeatures( features_to_select, sort=False), False),
             (data.selectFeatures( features_to_select, sort=True), True),
             (data.select(slice(None), features_to_select), True),
             (data.select(slice(None), N.array(features_to_select)), True),
             (data.select(slice(None), bsel), True)
            ]:
            self.failUnless(sel.nfeatures == 3)

            # check the size of the selected samples
            self.failUnless(sel.samples.shape == (10, 3))

            # check that the correct features were selected
            fts = (features_to_select, features_to_select_sorted)[int(issorted)]
            self.failUnless((unmasked[:, fts] == sel.samples).all())

            # check grouping information
            self.failUnless((sel._dsattr['featuregroups'] == [0, 0, 3]).all())

            # check for side effects on the features_to_select parameter
            self.failUnless(features_to_select == features_to_select_copy)

        # check selection by feature group id
        gsel = data.selectFeatures(groups=[2, 3])
        self.failUnless(gsel.nfeatures == 10)
        self.failUnless(set(gsel._dsattr['featuregroups']) == set([2, 3]))


    def testSampleSelection(self):
        origdata = datasets['uni2large'].samples[:100, :10].T
        data = Dataset(samples=origdata, labels=2, chunks=2 )

        self.failUnless( data.nsamples == 10 )

        # select a single sample in several equivalent ways
        for sel in [ data.selectSamples(5),
                     data.select(5),
                     data.select(slice(5, 6)),
                     ]:
            self.failUnless( sel.nsamples == 1 )
            self.failUnless( sel.nfeatures == 100 )
            self.failUnless( sel.origids == [5] )

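        # selecting the same id twice must duplicate the sample, and
        # origids must record the original index for both copies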
        for sel in [ data.selectSamples([5, 5]),
                     ]:
            self.failUnless( sel.nsamples == 2 )
            self.failUnless( (sel.samples[0] == data.samples[5]).all() )
            self.failUnless( (sel.samples[0] == sel.samples[1]).all() )
            self.failUnless( len(sel.labels) == 2 )
            self.failUnless( len(sel.chunks) == 2 )
            self.failUnless((sel.origids == [5, 5]).all())

            self.failUnless( sel.samples.shape == (2, 100) )

        # select all samples via their common label in equivalent ways
        for sel in [ data.selectSamples(data.idsbylabels(2)),
                     data.select(labels=2),
                     data.select('labels', 2),
                     data.select('labels', [2]),
                     data['labels', [2]],
                     data['labels': [2], 'labels':2],
                     data['labels': [2]],
                     ]:
            self.failUnless( sel.nsamples == data.nsamples )
            self.failUnless( N.all(sel.samples == data.samples) )

        # selection by a non-existing label must yield an empty dataset
        for sel in [ data.selectSamples(data.idsbylabels(3)),
                     data.select(labels=3),
                     data.select('labels', 3),
                     data.select('labels', [3]),
                     ]:
            self.failUnless( sel.nsamples == 0 )

        data = Dataset(samples=origdata,
                       labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9],
                       chunks=2)
        for sel in [ data.selectSamples(data.idsbylabels([2, 3])),
                     data.select('labels', [2, 3]),
                     data.select('labels', [2, 3], labels=[1, 2, 3, 4]),
                     data.select('labels', [2, 3], chunks=[1, 2, 3, 4]),
                     data['labels':[2, 3], 'chunks':[1, 2, 3, 4]],
                     data['chunks':[1, 2, 3, 4], 'labels':[2, 3]],
                     ]:
            self.failUnless(N.all(sel.origids == [ 3., 4., 5., 7.]))

        # probe unique labels
        self.failUnless( (data.uniquelabels == [2, 3, 4, 8, 9]).all() )

        # select samples by a set of labels
        sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9]))
        self.failUnlessEqual(set(sel.uniquelabels), set([3, 4, 8, 9]))
        self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())


    def testEvilSelects(self):
        """Test some obscure selections of samples via select() or __getitem__
        """
        origdata = datasets['uni2large'].samples[:100, :10].T
        data = Dataset(samples=origdata,
                       labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
                       chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6])

        # malformed getitem -- the check is enforced only in __debug__
        if __debug__:
            self.failUnlessRaises(ValueError, data.__getitem__,
                                  'labels', 'featu')

        # too many slicing arguments
        self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)
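
        # all of the following must select samples with label 3 or 2 AND
        # chunk 2 or 6: given the labels/chunks above that is exactly the
        # ids {3, 7} -- chained criteria combine conjunctively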
        for sel in [ data.select('chunks', [2, 6], labels=[3, 2],
                                 features=slice(None)),
                     data.select('all', 'all', labels=[2,3], chunks=[2, 6]),
                     data['chunks', [2, 6], 'labels', [3, 2]],
                     data[:, :, 'chunks', [2, 6], 'labels', [3, 2]],
                     # the sample slice restricts the candidate ids first
                     data[3:8, 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
                     ]:
            self.failUnless(N.all(sel.origids == [3, 7]))
            self.failUnless(sel.nfeatures == 100)
            self.failUnless(N.all(sel.samples == origdata[ [3, 7] ]))

        target = origdata[ [3, 7] ]
        target = target[:, [1, 3] ]

        # now the same with feature selection included
        for sel in [ data.select('all', [1, 3],
                                 'chunks', [2, 6], labels=[3, 2]),
                     data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
                     # repeated feature ids are collapsed and sorted as well
                     data[3:8, [1, 1, 3, 1],
                          'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
                     ]:
            self.failUnless(N.all(sel.origids == [3, 7]))
            self.failUnless(sel.nfeatures == 2)
            self.failUnless(N.all(sel.samples == target))

        # the selection must be empty if no chunk matches
        self.failUnless(data.select(chunks=[23]).nsamples == 0)

        # where() returns the ids of the matching samples
        self.failUnless(N.all(data.where(chunks=[2, 6]) == [1, 3, 7, 9]))
        self.failUnless(N.all(data.where(chunks=[2, 6], labels=[22, 3]) == [3]))

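        # with a feature spec, where() returns a (sample ids, feature ids)
        # tuple instead of a plain list of sample ids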
        idx = data.where('all', [1, 3, 10], labels=[2, 3, 4])
        self.failUnless(N.all(idx[1] == [1, 3, 10]))
        self.failUnless(N.all(idx[0] == range(2, 8)))

        # where() without any criterion is pointless and returns None
        self.failUnless(data.where() is None)
        # no matching samples gives an empty list
        self.failUnless(data.where(labels=[123]) == [])


    def testPatternMerge(self):
        data1 = Dataset(samples=N.ones((5, 5)), labels=1, chunks=1 )
        data2 = Dataset(samples=N.ones((3, 5)), labels=2, chunks=1 )

        merged = data1 + data2

        self.failUnless( merged.nfeatures == 5 )
        l12 = [1]*5 + [2]*3
        l1 = [1]*8
        self.failUnless( (merged.labels == l12).all() )
        self.failUnless( (merged.chunks == l1).all() )

        # in-place merge must yield the same result
        data1 += data2

        self.failUnless( data1.nfeatures == 5 )
        self.failUnless( (data1.labels == l12).all() )
        self.failUnless( (data1.chunks == l1).all() )


    def testLabelRandomizationAndSampling(self):
        """Test label permutation and random sampling of datasets
        """
        data = Dataset(samples=N.ones((5, 1)), labels=range(5), chunks=1 )
        data += Dataset(samples=N.ones((5, 1))+1, labels=range(5), chunks=2 )
        data += Dataset(samples=N.ones((5, 1))+2, labels=range(5), chunks=3 )
        data += Dataset(samples=N.ones((5, 1))+3, labels=range(5), chunks=4 )
        data += Dataset(samples=N.ones((5, 1))+4, labels=range(5), chunks=5 )
        self.failUnless( data.samplesperlabel == {0:5, 1:5, 2:5, 3:5, 4:5} )

        sample = data.getRandomSamples( 2 )
        self.failUnless( sample.samplesperlabel.values() == [ 2, 2, 2, 2, 2 ] )
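        # note: getRandomSamples(2) draws 2 samples per label,
        # not 2 samples total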

        self.failUnless( (data.uniquechunks == range(1, 6)).all() )

        # keep the original labels around for comparison
        origlabels = data.labels.copy()

        data.permuteLabels(True)

        self.failIf( (data.labels == origlabels).all() )

        data.permuteLabels(False)

        self.failUnless( (data.labels == origlabels).all() )

        # now try another dataset built from the same data
        data2 = Dataset(samples=data.samples,
                        labels=data.labels,
                        chunks=data.chunks )

        # labels are the same as the originals
        self.failUnless( (data2.labels == origlabels).all() )

        # permute in the new dataset
        data2.permuteLabels( True )

        # must not affect the original dataset
        self.failUnless( (data.labels == origlabels).all() )
        # but the new one must be permuted
        self.failIf( (data2.labels == origlabels).all() )


    def testAttributes(self):
        """Test adding custom attributes to a dataset
        """
        # prepare a dataset -- the custom attribute is not yet known
        ds = Dataset(samples=range(5), labels=1, chunks=1)
        self.failUnlessRaises(AttributeError, lambda x: x.blobs, ds)
        """Dataset.blobs should fail since .blobs wasn't yet registered"""

        # register a new custom sample attribute
        Dataset._registerAttribute("blobs", "_data", hasunique=True)
        ds = Dataset(samples=range(5), labels=1, chunks=1)
        self.failUnless(not ds.blobs != [ 0 ],
            msg="By default new attributes are supposed to get 0 as the value")
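
        # the attribute must match the number of samples: assigning a
        # vector of the wrong length must be rejected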
        try:
            ds.blobs = [1, 2]
            self.fail(msg="Dataset.blobs=[1,2] should fail since "
                          "there are 5 samples")
        except ValueError, e:
            pass

        # assigning a single value must work
        try:
            ds.blobs = [1]
        except Exception, e:
            self.fail(msg="We must be able to assign the attribute")


    def testZScoring(self):
        """Test z-scoring transformation
        """
        # dataset: mean=2, std=1
        samples = N.array( (0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2) ).\
                  reshape((16, 1))
        data = Dataset(samples=samples,
                       labels=range(16), chunks=[0]*16)
        self.failUnlessEqual( data.samples.mean(), 2.0 )
        self.failUnlessEqual( data.samples.std(), 1.0 )
        zscore(data, perchunk=True)

        # check z-scoring
        check = N.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                        dtype='float64').reshape(16, 1)
        self.failUnless( (data.samples == check).all() )
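
        # zscoring standardizes every chunk (here the only one) as
        #     x' = (x - mean(x)) / std(x)
        # so with mean 2.0 and std 1.0 above, e.g. the value 0 maps to -2;
        # with a single chunk perchunk=False must yield the same result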
        data = Dataset(samples=samples,
                       labels=range(16), chunks=[0]*16)
        zscore(data, perchunk=False)
        self.failUnless( (data.samples == check).all() )

        # check z-scoring against a baseline condition
        data = Dataset(samples=samples,
                       labels=[0, 2, 2, 2, 1] + [2]*11,
                       chunks=[0]*16)
        zscore(data, baselinelabels=[0, 1])
        self.failUnless((samples == data.samples+1.0).all())
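        # with baselinelabels, mean and std are estimated from the baseline
        # samples only (ids 0 and 4: values 0 and 2, i.e. mean 1, std 1),
        # so all samples simply got shifted by -1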

        # check z-scoring with externally given mean and std
        dataset = Dataset(samples=N.arange( 20 ).reshape( (4, 5) ),
                          labels=1,
                          chunks=1)
        zscore(dataset, mean=0, std=1,
               perchunk=True, pervoxel=True,
               targetdtype="float32")

437 """Test creation of new dataset by applying a mapper"""
438 mapper = MaskMapper(N.array([1, 0, 1]))
439 dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
440 labels=1,
441 chunks=1)
442 seldataset = dataset.applyMapper(featuresmapper=mapper)
443 self.failUnless( (dataset.selectFeatures([0, 2]).samples
444 == seldataset.samples).all() )
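        # the mask [1, 0, 1] forwards only features 0 and 2, hence the
        # equivalence with selectFeatures([0, 2]) above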

        # reverse() expects a vector in masked (2-feature) space, forward()
        # one in unmasked (3-feature) space; checks are active in __debug__
        if __debug__:
            self.failUnlessRaises(ValueError, mapper.reverse, [10, 20, 30])
            self.failUnlessRaises(ValueError, mapper.forward, [10, 20])


    def testIdhash(self):
        """Test that Dataset.idhash changes whenever samples,
        labels or chunks are modified
        """
        dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
                          labels=1,
                          chunks=1)
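
        # idhash acts as a fingerprint of samples and sample attributes:
        # reading them must leave it untouched, while any in-place
        # modification must change it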
        origid = dataset.idhash
        dataset.labels = [3, 1, 2, 3]
        self.failUnless(origid != dataset.idhash,
                        msg="Changing all labels should alter dataset's idhash")

        origid = dataset.idhash

        z = dataset.labels[1]
        self.failUnlessEqual(origid, dataset.idhash,
                             msg="Accessing shouldn't change idhash")
        z = dataset.chunks
        self.failUnlessEqual(origid, dataset.idhash,
                             msg="Accessing shouldn't change idhash")
        z[2] = 333
        self.failUnless(origid != dataset.idhash,
                        msg="Changing value in attribute should change idhash")

        origid = dataset.idhash
        dataset.samples[1, 1] = 1000
        self.failUnless(origid != dataset.idhash,
                        msg="Changing value in data should change idhash")

        origid = dataset.idhash
        dataset.permuteLabels(True)
        self.failUnless(origid != dataset.idhash,
                        msg="Permutation also changes idhash")

        dataset.permuteLabels(False)
        self.failUnless(origid == dataset.idhash,
                        msg="idhash should be restored after "
                            "permuteLabels(False)")


    def testLabelsMapping(self):
        """Test mapping of the labels from strings to numericals
        """
        od = {'apple':0, 'orange':1}
        samples = [[3], [2], [3]]
        labels_l = ['apple', 'orange', 'apple']

        # test broadcasting of a single label
        ds = Dataset(samples=samples, labels='orange')
        self.failUnless(N.all(ds.labels == ['orange']*3))

        # labels_map given explicitly or deduced automatically
        for ds in [Dataset(samples=samples, labels=labels_l, labels_map=od),
                   Dataset(samples=samples, labels=labels_l, labels_map=True)]:
            self.failUnless(N.all(ds.labels == [0, 1, 0]))
            self.failUnless(ds.labels_map == od)
            ds_ = ds[1]
            self.failUnless(ds_.labels_map == od,
                msg='selectSamples should preserve the full labels mapping')

        # an incomplete mapping must fail
        self.failUnlessRaises(ValueError, Dataset, samples=samples,
                              labels=labels_l, labels_map = {'apple':0})

        # with no mapping requested, labels_map stays None
        ds2 = Dataset(samples=samples, labels=labels_l)
        self.failUnlessEqual(ds2.labels_map, None)

        # a mapping for numeric labels works as well
        od3 = {1:100, 2:101, 3:100}
        ds3 = Dataset(samples=samples, labels=[1, 2, 3],
                      labels_map=od3)
        self.failUnlessEqual(ds3.labels_map, od3)
        self.failUnless(N.all(ds3.labels == [100, 101, 100]))

        ds3_ = ds3[1]
        self.failUnlessEqual(ds3_.labels_map, od3)

        ds4 = Dataset(samples=samples, labels=labels_l)

        # replacing an existing mapping
        ds = Dataset(samples=samples, labels=labels_l, labels_map=od)

        self.failUnlessRaises(ValueError, ds.setLabelsMap,
                              {'orange': 1, 'nonorange': 3})
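        # a replacement map apparently has to provide the numeric ids
        # already in use ({0, 1} here); the keys themselves may change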
        new_map = {'tasty':0, 'crappy':1}
        ds.labels_map = new_map.copy()
        self.failUnlessEqual(ds.labels_map, new_map)


    def testLabelsMappingAddDataset(self):
        """Adding datasets needs special care whenever labels mapping
        is used."""
        samples = [[3], [2], [3]]
        l1 = ['a', 'b', 'a']
        l2 = ['b', 'a', 'c']
        ds1 = Dataset(samples=samples, labels=l1,
                      labels_map={'a':1, 'b':2})
        ds2 = Dataset(samples=samples, labels=l2,
                      labels_map={'c':1, 'a':4, 'b':2})

        # an unmapped dataset for the conflict checks below
        ds0 = Dataset(samples=samples, labels=l2)

        # keep copies of the original mappings
        lm1 = ds1.labels_map.copy()
        lm2 = ds2.labels_map.copy()

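        # on merge, ds2's labels get re-encoded in terms of ds1's map:
        # 'b'->2 and 'a'->1 as in ds1, while the unseen 'c' receives a
        # fresh id (5); neither source mapping may be modified in place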
        ds3 = ds1 + ds2
        self.failUnless(N.all(ds3.labels ==
                              N.hstack((ds1.labels, [2, 1, 5]))))
        self.failUnless(ds1.labels_map == lm1)
        self.failUnless(ds2.labels_map == lm2)

        # in-place addition must relabel the same way
        ds1 += ds2
        self.failUnless(N.all(ds1.labels == ds3.labels))

        # and extend the mapping identically
        self.failUnless(N.all(ds1.labels_map == ds3.labels_map))

        # adding a mapped and an unmapped dataset must fail
        self.failUnlessRaises(ValueError, ds1.__add__, ds0)
        self.failUnlessRaises(ValueError, ds1.__iadd__, ds0)


    def testCopy(self):
        ds = datasets['uni2small']
        # deep copy the beast
        ds_ = ds.copy()

        self.failUnless(N.all(ds.samples == ds_.samples))
        self.failUnless(N.all(ds.labels == ds_.labels))
        self.failUnless(N.all(ds.chunks == ds_.chunks))

        # modifications of the copy must not affect the original
        ds_.samples[0, 0] = 1234
        self.failUnless(N.any(ds.samples != ds_.samples))
        self.failUnless(N.all(ds.labels == ds_.labels))
        self.failUnless(N.all(ds.chunks == ds_.chunks))

        ds_.labels = N.hstack(([123], ds_.labels[1:]))
        self.failUnless(N.any(ds.samples != ds_.samples))
        self.failUnless(N.any(ds.labels != ds_.labels))
        self.failUnless(N.all(ds.chunks == ds_.chunks))

        ds_.chunks = N.hstack(([1234], ds_.chunks[1:]))
        self.failUnless(N.any(ds.samples != ds_.samples))
        self.failUnless(N.any(ds.labels != ds_.labels))
        self.failUnless(N.any(ds.chunks != ds_.chunks))

        # unique values must be recomputed for the copy as well
        self.failUnless(N.any(ds.uniquelabels != ds_.uniquelabels))
        self.failUnless(N.any(ds.uniquechunks != ds_.uniquechunks))


    def testIdsonboundaries(self):
        """Test detection of transition points

        Shame on Yarik -- he didn't create unittests right away... damn me
        """
        ds = Dataset(samples=N.array(range(10), ndmin=2).T,
                     labels=[0, 0, 1, 1, 0, 0, 1, 1, 0, 0],
                     chunks=[0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
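
        # a sample id lies "on a boundary" whenever a tracked attribute
        # differs from the preceding sample (id 0 always qualifies);
        # prior/post widen the reported window around each transition and
        # revert inverts the selection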
        self.failUnless(ds.idsonboundaries() == [0, 2, 4, 5, 6, 8],
                        "We should have got ids whenever either chunk or "
                        "label changes")
        self.failUnless(ds.idsonboundaries(attributes_to_track=['chunks'])
                        == [0, 5])

        self.failUnless(ds.idsonboundaries(prior=1, post=-1,
                                           attributes_to_track=['chunks'])
                        == [4, 9])
        self.failUnless(ds.idsonboundaries(prior=2, post=-1,
                                           attributes_to_track=['chunks'])
                        == [3, 4, 8, 9])
        self.failUnless(ds.idsonboundaries(prior=2, post=-1,
                                           attributes_to_track=['chunks'],
                                           revert=True)
                        == [0, 1, 2, 5, 6, 7])
        self.failUnless(ds.idsonboundaries(prior=1, post=1,
                                           attributes_to_track=['chunks'])
                        == [0, 1, 4, 5, 6, 9])

        self.failUnless(ds.idsonboundaries(prior=2) == range(10))


def suite():
    return unittest.makeSuite(DatasetTests)


if __name__ == '__main__':
    import runner