1
2
3
4
5
6
7
8
9 """Collection of dataset splitters.
10
11 Module Description
12 ==================
13
14 Splitters are destined to split the provided dataset various ways to
15 simplify cross-validation analysis, implement boosting of the
16 estimates, or sample null-space via permutation testing.
17
18 Most of the splitters at the moment split 2-ways -- conventionally
19 first part is used for training, and 2nd part for testing by
20 `CrossValidatedTransferError` and `SplitClassifier`.
21
22 Brief Description of Available Splitters
23 ========================================
24
25 * `NoneSplitter` - just return full dataset as the desired part (training/testing)
26 * `OddEvenSplitter` - 2 splits: (odd samples,even samples) and (even, odd)
27 * `HalfSplitter` - 2 splits: (first half, second half) and (second, first)
28 * `NFoldSplitter` - splits for N-Fold cross validation.
29
30 Module Organization
31 ===================
32
33 .. packagetree::
34 :style: UML
35
36 """
37
38 __docformat__ = 'restructuredtext'
39
40 import operator
41
42 import numpy as N
43
44 import mvpa.misc.support as support
45 from mvpa.base.dochelpers import enhancedDocString
46 from mvpa.datasets.miscfx import coarsenChunks
47
48 if __debug__:
49 from mvpa.base import debug
50
52 """Base class of dataset splitters.
53
54 Each splitter should be initialized with all its necessary parameters. The
55 final splitting is done running the splitter object on a certain Dataset
56 via __call__(). This method has to be implemented like a generator, i.e. it
57 has to return every possible split with a yield() call.
58
59 Each split has to be returned as a sequence of Datasets. The properties
60 of the split dataset may vary between implementations. It is possible
61 to declare a sequence element as 'None'.
62
63 Please note, that even if there is only one Dataset returned it has to be
64 an element in a sequence and not just the Dataset object!
65 """
66
67 _STRATEGIES = ('first', 'random', 'equidistant')
68 _NPERLABEL_STR = ['equal', 'all']
69
70 - def __init__(self,
71 nperlabel='all',
72 nrunspersplit=1,
73 permute=False,
74 count=None,
75 strategy='equidistant',
76 discard_boundary=None,
77 attr='chunks',
78 reverse=False):
79 """Initialize splitter base.
80
81 :Parameters:
82 nperlabel : int or str (or list of them) or float
83 Number of dataset samples per label to be included in each
84 split. If given as a float, it must be in [0,1] range and would
85 mean the ratio of selected samples per each label.
86 Two special strings are recognized: 'all' uses all available
87 samples (default) and 'equal' uses the maximum number of samples
88 the can be provided by all of the classes. This value might be
89 provided as a sequence whos length matches the number of datasets
90 per split and indicates the configuration for the respective dataset
91 in each split.
92 nrunspersplit: int
93 Number of times samples for each split are chosen. This
94 is mostly useful if a subset of the available samples
95 is used in each split and the subset is randomly
96 selected for each run (see the `nperlabel` argument).
97 permute : bool
98 If set to `True`, the labels of each generated dataset
99 will be permuted on a per-chunk basis.
100 count : None or int
101 Desired number of splits to be output. It is limited by the
102 number of splits possible for a given splitter
103 (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
104 all splits are output (default).
105 strategy : str
106 If `count` is not None, possible strategies are possible:
107 first
108 First `count` splits are chosen
109 random
110 Random (without replacement) `count` splits are chosen
111 equidistant
112 Splits which are equidistant from each other
113 discard_boundary : None or int or sequence of int
114 If not `None`, how many samples on the boundaries between
115 parts of the split to discard in the training part.
116 If int, then discarded in all parts. If a sequence, numbers
117 to discard are given per part of the split.
118 E.g. if splitter splits only into (training, testing)
119 parts, then `discard_boundary`=(2,0) would instruct to discard
120 2 samples from training which are on the boundary with testing.
121 attr : str
122 Sample attribute used to determine splits.
123 reverse : bool
124 If True, the order of datasets in the split is reversed, e.g.
125 instead of (training, testing), (training, testing) will be spit
126 out
127 """
128
129 self.__nperlabel = None
130 self.__runspersplit = nrunspersplit
131 self.__permute = permute
132 self.__splitattr = attr
133 self._reverse = reverse
134 self.discard_boundary = discard_boundary
135
136
137
138
139 self.count = count
140 """Number (max) of splits to output on call"""
141
142 self._setStrategy(strategy)
143
144
145 self.setNPerLabel(nperlabel)
146
147
148 __doc__ = enhancedDocString('Splitter', locals())
149
158
def setNPerLabel(self, value):
    """Set the number of samples per label in the split datasets.

    'equal' sets sample size to highest possible number of samples that
    can be provided by each class. 'all' uses all available samples
    (default).

    :Raises:
      ValueError
        If `value` is a string other than the ones listed in
        `_NPERLABEL_STR`.
    """
    # Only string values are validated here; ints/floats/sequences are
    # interpreted later at split time.
    if isinstance(value, basestring):
        if not value in self._NPERLABEL_STR:
            raise ValueError("Unsupported value '%s' for nperlabel."
                             " Supported ones are %s or float or int"
                             % (value, self._NPERLABEL_STR))
    self.__nperlabel = value
172
def _getSplitConfig(self, uniqueattrs):
    """Each subclass has to implement this method. It gets a sequence with
    the unique attribute ids of a dataset and has to return a list of lists
    containing attribute ids to split into the second dataset.
    """
    raise NotImplementedError
179
180
def __call__(self, dataset):
    """Split `dataset` and generate the resulting dataset tuples.

    This method behaves like a generator: it yields one sequence of
    datasets per split (repeated `nrunspersplit` times each).
    """
    # Local bindings of the dataset class methods, hoisted out of the
    # per-split loops below.
    ds_class = dataset.__class__
    DS_permuteLabels = ds_class.permuteLabels
    try:
        DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
    except AttributeError:
        # NOTE(review): if the dataset class lacks _getNSamplesPerAttr,
        # DS_getNSamplesPerLabel stays unbound and an 'equal' or float
        # nperlabel below would raise NameError -- presumably such
        # datasets are only used with nperlabel='all'; confirm.
        pass
    DS_getRandomSamples = ds_class.getRandomSamples

    # Compute the split configurations for this dataset.
    cfgs = self.splitcfg(dataset)

    # Optionally limit the number of splits that are output.
    count, Ncfgs = self.count, len(cfgs)

    # Limiting makes sense only if count < Ncfgs; otherwise all
    # configurations are used as-is.
    if count is not None and count < Ncfgs:
        if count < 1:
            # Nothing to yield at all.
            return
        strategy = self.strategy
        if strategy == 'first':
            cfgs = cfgs[:count]
        elif strategy in ['equidistant', 'random']:
            if strategy == 'equidistant':
                # Pick `count` splits spread evenly over all of them.
                step = float(Ncfgs) / count
                assert(step >= 1.0)
                indexes = [int(round(step * i)) for i in xrange(count)]
            elif strategy == 'random':
                indexes = N.random.permutation(range(Ncfgs))[:count]
                # Doesn't matter much, but keep the selected splits in
                # their original order at least.
                indexes.sort()
            else:
                # Unreachable: strategy membership was checked above.
                raise RuntimeError("Really should not happen")
            if __debug__:
                debug("SPL", "For %s strategy selected %s splits "
                      "from %d total" % (strategy, indexes, Ncfgs))
            cfgs = [cfgs[i] for i in indexes]
        # Update to the number of actually used configurations.
        Ncfgs = len(cfgs)

    # Finally split the data.
    for isplit, split in enumerate(cfgs):

        # Determine sample sizes per label for each dataset in the
        # split: scalars/strings are replicated per part.
        if not operator.isSequenceType(self.__nperlabel) \
               or isinstance(self.__nperlabel, str):
            nperlabelsplit = [self.__nperlabel] * len(split)
        else:
            nperlabelsplit = self.__nperlabel

        # Get the split datasets.
        split_ds = self.splitDataset(dataset, split)

        # Do multiple post-processing runs for this split.
        for run in xrange(self.__runspersplit):

            # Post-process all datasets of the current split.
            finalized_datasets = []

            for ds, nperlabel in zip(split_ds, nperlabelsplit):
                # Flag the datasets of the last split so consumers can
                # detect the end of the sequence.
                if ds is not None:
                    ds._dsattr['lastsplit'] = (isplit == Ncfgs - 1)

                # Permute the labels if requested.
                if self.__permute:
                    DS_permuteLabels(ds, True, perchunk=True)

                # Select a subset of samples if requested.
                if nperlabel == 'all' or ds is None:
                    finalized_datasets.append(ds)
                else:
                    if nperlabel == 'equal':
                        # Maximum number of samples that every label can
                        # provide in this dataset.
                        npl = N.array(DS_getNSamplesPerLabel(
                            ds, attrib='labels').values()).min()
                    elif isinstance(nperlabel, float) or (
                        operator.isSequenceType(nperlabel) and
                        len(nperlabel) > 0 and
                        isinstance(nperlabel[0], float)):
                        # Take a per-label ratio of the available samples.
                        counts = N.array(DS_getNSamplesPerLabel(
                            ds, attrib='labels').values())
                        npl = (counts * nperlabel).round().astype(int)
                    else:
                        # Plain per-label count.
                        npl = nperlabel

                    # Finally sample the requested number per label.
                    finalized_datasets.append(
                        DS_getRandomSamples(ds, npl))

            if self._reverse:
                yield finalized_datasets[::-1]
            else:
                yield finalized_datasets
298
299
def splitDataset(self, dataset, specs):
    """Split a dataset by separating the samples where the configured
    sample attribute matches an element of `specs`.

    :Parameters:
      dataset : Dataset
        This is the source dataset.
      specs : sequence of sequences
        Contains ids of a sample attribute that shall be split into
        another dataset. At most one element may be `None`, meaning
        "all samples not claimed by the other parts".
    :Returns: Tuple of split datasets (`None` for empty parts).
    """
    # Boolean sample filters, one per part of the split.
    filters = []
    none_specs = 0
    cum_filter = None

    # Normalize discard_boundary: an int applies to every part; 0 means
    # no discarding at all.
    discard_boundary = self.discard_boundary
    if isinstance(discard_boundary, int):
        if discard_boundary != 0:
            discard_boundary = (discard_boundary,) * len(specs)
        else:
            discard_boundary = None

    # Attribute access instead of eval() of a constructed expression --
    # same result, without evaluating arbitrary code.
    splitattr_data = getattr(dataset, self.__splitattr)
    for spec in specs:
        if spec is None:
            filters.append(None)
            none_specs += 1
        else:
            filter_ = N.array([ i in spec \
                                for i in splitattr_data])
            filters.append(filter_)
            if cum_filter is None:
                cum_filter = filter_
            else:
                cum_filter = N.logical_and(cum_filter, filter_)

    # Only a single `None` spec is well-defined (its filter is the
    # complement of all the others).
    if none_specs > 1:
        raise ValueError("Splitter cannot handle more than one `None` "
                         "split definition.")

    for i, filter_ in enumerate(filters):
        if filter_ is None:
            filters[i] = N.logical_not(cum_filter)

        # If requested, discard samples sitting on the boundary to the
        # other parts of the split.
        if discard_boundary is not None:
            ndiscard = discard_boundary[i]
            if ndiscard != 0:
                # Sloppy O(ndiscard * nsamples) implementation: erode the
                # True-regions of the filter by `ndiscard` on each side by
                # AND-ing shifted, padded copies of it.
                f, lenf = filters[i], len(filters[i])
                f_pad = N.concatenate(([True]*ndiscard, f, [True]*ndiscard))
                for d in xrange(2*ndiscard+1):
                    f = N.logical_and(f, f_pad[d:d+lenf])
                filters[i] = f[:]

    # Materialize the parts; a part with no samples becomes None.
    split_datasets = []

    # Local binding hoisted out of the loop.
    dataset_selectSamples = dataset.selectSamples
    for filter_ in filters:
        if (filter_ == False).all():
            split_datasets.append(None)
        else:
            split_datasets.append(dataset_selectSamples(filter_))

    return split_datasets
376
377
def __str__(self):
    """String summary over the object
    """
    return \
        "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
        % (self.__nperlabel, self.__runspersplit, self.__permute)
384
385
def splitcfg(self, dataset):
    """Return the split configuration for the given dataset.

    Delegates to the subclass-provided `_getSplitConfig`, feeding it the
    unique values of the configured split attribute (e.g.
    `dataset.uniquechunks` for attr='chunks').
    """
    # Attribute access instead of eval() of a constructed expression.
    return self._getSplitConfig(getattr(dataset,
                                        'unique' + self.__splitattr))


# Read via the private attribute set by _setStrategy; assignment is
# validated by _setStrategy.
strategy = property(fget=lambda self:self.__strategy,
                    fset=_setStrategy)
393
394
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **(kwargs))

        if mode not in NoneSplitter._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]


    def __str__(self):
        """String summary over the object
        """
        return \
            "NoneSplitter / " + Splitter.__str__(self)
437
438
439
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. first
            unique attribute is odd, second is even, despite the
            corresponding values might indicate the opposite (e.g. in case
            of [2,3]).
        """
        Splitter.__init__(self, **(kwargs))

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Select the odd and even unique attribute values, either by the
        values themselves or by their position in `uniqueattrs`.
        """
        if self.__usevalues:
            return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
                    (None, uniqueattrs[(uniqueattrs % 2) == False])]
        else:
            return [(None, uniqueattrs[N.arange(len(uniqueattrs)) %2 == True]),
                    (None, uniqueattrs[N.arange(len(uniqueattrs)) %2 == False])]


    def __str__(self):
        """String summary over the object
        """
        return \
            "OddEvenSplitter / " + Splitter.__str__(self)
482
483
484
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """

    __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the two halves of the unique attribute values.

        With an odd number of values the extra one goes to the 2nd half.
        """
        # Explicit floor division: behavior-identical for int lengths, and
        # robust should this ever run under true division.
        return [(None, uniqueattrs[:len(uniqueattrs)//2]),
                (None, uniqueattrs[len(uniqueattrs)//2:])]


    def __str__(self):
        """String summary over the object
        """
        return \
            "HalfSplitter / " + Splitter.__str__(self)
512
513
514
class NGroupSplitter(Splitter):
    """Split a dataset into N-groups of the sample attribute.

    For example, NGroupSplitter(2) is the same as the HalfSplitter and
    yields two splits: first (1st half, 2nd half) and second (2nd half,
    1st half).
    """
    def __init__(self, ngroups=4, **kwargs):
        """Initialize the N-group splitter.

        :Parameters:
          ngroups : int
            Number of groups to split the attribute into.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        self.__ngroups = ngroups

    __doc__ = enhancedDocString('NGroupSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return one split per group of the unique attribute values.

        :Raises:
          ValueError
            If there are fewer unique attribute values than groups.
        """
        # Cannot split into more groups than unique attribute values.
        if len(uniqueattrs) < self.__ngroups:
            raise ValueError("Number of groups (%d) " % (self.__ngroups) +
                             "must be less than " +
                             "or equal to the number of unique attributes (%d)" %
                             (len(uniqueattrs)))

        # Group the chunks into nchunks coarse groups.
        split_ind = coarsenChunks(uniqueattrs, nchunks=self.__ngroups)
        split_ind = N.asarray(split_ind)

        # One split per group.
        split_list = [(None, uniqueattrs[split_ind==i])
                      for i in range(self.__ngroups)]
        return split_list


    def __str__(self):
        """String summary over the object
        """
        # BUG FIX: was `self.__ngroup` (no such attribute -> AttributeError);
        # the attribute set in __init__ is `__ngroups`.
        return \
            "N-%d-GroupSplitter / " % self.__ngroups + Splitter.__str__(self)
564
565
566
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provide folding splitting. Given a dataset with N chunks, with
    cvtype=1 (which is default), it would generate N splits, where
    each chunk sequentially is taken out (with replacement) for
    cross-validation. Example, if there is 4 chunks, splits for
    cvtype=1 are:

        [[1, 2, 3], [0]]
        [[0, 2, 3], [1]]
        [[0, 1, 3], [2]]
        [[0, 1, 2], [3]]

    If cvtype>1, then all possible combinations of cvtype number of
    chunks are taken out for testing, so for cvtype=2 in previous
    example:

        [[2, 3], [0, 1]]
        [[1, 3], [0, 2]]
        [[1, 2], [0, 3]]
        [[0, 3], [1, 2]]
        [[0, 2], [1, 3]]
        [[0, 1], [2, 3]]

    """

    def __init__(self,
                 cvtype = 1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return \
            "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        # One split per length-cvtype combination of unique attributes.
        return [(None, i) for i in \
                support.getUniqueLengthNCombinations(uniqueattrs,
                                                     self.__cvtype)]
627
628
629
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence of
    split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence of sequences of sample ids for each dataset that shall be
    generated in the split.

    Example:

    * Generate two splits. In the first split the *second* dataset
      contains all samples with sample attributes corresponding to
      either 0, 1 or 2. The *first* dataset of the first split contains
      all samples which are not split into the second dataset.

      The second split yields three datasets. The first with all samples
      corresponding to sample attributes 1 and 2, the second dataset
      contains only samples with attribute 3 and the last dataset
      contains the samples with attribute 5 and 6.

      CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **(kwargs))

        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return the configured split rule; `uniqueattrs` is ignored by
        design -- the rule already enumerates the attribute ids.
        """
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
675