
Source Code for Module mvpa.datasets.splitters

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Collection of dataset splitters. 
 10   
 11  Module Description 
 12  ================== 
 13   
 14  Splitters are destined to split the provided dataset varous ways to 
 15  simplify cross-validation analysis, implement boosting of the 
 16  estimates, or sample null-space via permutation testing. 
 17   
 18  Most of the splitters at the moment split 2-ways -- conventionally 
 19  first part is used for training, and 2nd part for testing by 
 20  `CrossValidatedTransferError` and `SplitClassifier`. 
 21   
 22  Brief Description of Available Splitters 
 23  ======================================== 
 24   
 25  * `NoneSplitter` - just return full dataset as the desired part (training/testing) 
 26  * `OddEvenSplitter` - 2 splits: (odd samples,even samples) and (even, odd) 
 27  * `HalfSplitter` - 2 splits: (first half, second half) and (second, first) 
 28  * `NFoldSplitter` - splits for N-Fold cross validation. 
 29   
 30  Module Organization 
 31  =================== 
 32   
 33  .. packagetree:: 
 34     :style: UML 
 35   
 36  """ 

__docformat__ = 'restructuredtext'

import operator

import numpy as N

import mvpa.misc.support as support
from mvpa.base.dochelpers import enhancedDocString
from mvpa.datasets.miscfx import coarsenChunks

if __debug__:
    from mvpa.base import debug

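To make the generator protocol described in the module docstring concrete,
here is a minimal usage sketch. It assumes the 0.4-series PyMVPA API
(`Dataset` from `mvpa.datasets.base`); the toy dataset and all variable
names are illustrative only::

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.splitters import NFoldSplitter

    # toy dataset: 24 samples, 2 features, 2 labels, 4 chunks of 6 samples
    ds = Dataset(samples=N.random.randn(24, 2),
                 labels=[0, 1] * 12,
                 chunks=N.repeat(N.arange(4), 6))

    # every splitter is a callable generator of dataset sequences;
    # NFoldSplitter (defined below) yields (training, testing) pairs
    for training_ds, testing_ds in NFoldSplitter(cvtype=1)(ds):
        print training_ds.nsamples, testing_ds.nsamples   # 18 6, four times
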
class Splitter(object):
52 """Base class of dataset splitters. 53 54 Each splitter should be initialized with all its necessary parameters. The 55 final splitting is done running the splitter object on a certain Dataset 56 via __call__(). This method has to be implemented like a generator, i.e. it 57 has to return every possible split with a yield() call. 58 59 Each split has to be returned as a sequence of Datasets. The properties 60 of the splitted dataset may vary between implementations. It is possible 61 to declare a sequence element as 'None'. 62 63 Please note, that even if there is only one Dataset returned it has to be 64 an element in a sequence and not just the Dataset object! 65 """ 66 67 _STRATEGIES = ('first', 'random', 'equidistant') 68 _NPERLABEL_STR = ['equal', 'all'] 69
    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 count=None,
                 strategy='equidistant',
                 discard_boundary=None,
                 attr='chunks',
                 reverse=False):
79 """Initialize splitter base. 80 81 :Parameters: 82 nperlabel : int or str (or list of them) or float 83 Number of dataset samples per label to be included in each 84 split. If given as a float, it must be in [0,1] range and would 85 mean the ratio of selected samples per each label. 86 Two special strings are recognized: 'all' uses all available 87 samples (default) and 'equal' uses the maximum number of samples 88 the can be provided by all of the classes. This value might be 89 provided as a sequence whos length matches the number of datasets 90 per split and indicates the configuration for the respective dataset 91 in each split. 92 nrunspersplit: int 93 Number of times samples for each split are chosen. This 94 is mostly useful if a subset of the available samples 95 is used in each split and the subset is randomly 96 selected for each run (see the `nperlabel` argument). 97 permute : bool 98 If set to `True`, the labels of each generated dataset 99 will be permuted on a per-chunk basis. 100 count : None or int 101 Desired number of splits to be output. It is limited by the 102 number of splits possible for a given splitter 103 (e.g. `OddEvenSplitter` can have only up to 2 splits). If None, 104 all splits are output (default). 105 strategy : str 106 If `count` is not None, possible strategies are possible: 107 first 108 First `count` splits are chosen 109 random 110 Random (without replacement) `count` splits are chosen 111 equidistant 112 Splits which are equidistant from each other 113 discard_boundary : None or int or sequence of int 114 If not `None`, how many samples on the boundaries between 115 parts of the split to discard in the training part. 116 If int, then discarded in all parts. If a sequence, numbers 117 to discard are given per part of the split. 118 E.g. if splitter splits only into (training, testing) 119 parts, then `discard_boundary`=(2,0) would instruct to discard 120 2 samples from training which are on the boundary with testing. 121 attr : str 122 Sample attribute used to determine splits. 123 reverse : bool 124 If True, the order of datasets in the split is reversed, e.g. 125 instead of (training, testing), (training, testing) will be spit 126 out 127 """ 128 # pylint happyness block 129 self.__nperlabel = None 130 self.__runspersplit = nrunspersplit 131 self.__permute = permute 132 self.__splitattr = attr 133 self._reverse = reverse 134 self.discard_boundary = discard_boundary 135 136 # we don't check it, thus no reason to make it private. 137 # someone might find it useful to change post creation 138 # TODO utilize such (or similar) policy through out the code 139 self.count = count 140 """Number (max) of splits to output on call""" 141 142 self._setStrategy(strategy) 143 144 # pattern sampling status vars 145 self.setNPerLabel(nperlabel)


    __doc__ = enhancedDocString('Splitter', locals())

    def _setStrategy(self, strategy):
        """Set the strategy used to select splits from all available ones.
        """
        strategy = strategy.lower()
        if not strategy in self._STRATEGIES:
            raise ValueError, "strategy is not known. Known are %s" \
                  % str(self._STRATEGIES)
        self.__strategy = strategy

    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets the sample size to the highest number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        if isinstance(value, basestring):
            if not value in self._NPERLABEL_STR:
                raise ValueError, "Unsupported value '%s' for nperlabel." \
                      " Supported ones are %s or float or int" \
                      % (value, self._NPERLABEL_STR)
        self.__nperlabel = value

    def _getSplitConfig(self, uniqueattr):
        """Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of
        lists containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError

    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """

        # local bindings to methods to gain some speedup
        ds_class = dataset.__class__
        DS_permuteLabels = ds_class.permuteLabels
        try:
            DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
        except AttributeError:
            # Some "not-real" datasets, e.g. MetaDataset, might not
            # have it
            pass
        DS_getRandomSamples = ds_class.getRandomSamples

        # for each split
        cfgs = self.splitcfg(dataset)

        # Select just some splits if desired
        count, Ncfgs = self.count, len(cfgs)

        # further makes sense only iff count < Ncfgs,
        # otherwise all strategies are equivalent
        if count is not None and count < Ncfgs:
            if count < 1:
                # we can only wish good luck
                return
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out what step is needed to
                    # accommodate the `count` number
                    step = float(Ncfgs) / count
                    assert(step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
                elif strategy == 'random':
                    indexes = N.random.permutation(range(Ncfgs))[:count]
                    # doesn't matter much but let's keep them in the
                    # original order at least
                    indexes.sort()
                else:
                    # who said that I am paranoid?
                    raise RuntimeError, "Really should not happen"
                if __debug__:
                    debug("SPL", "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, Ncfgs))
                cfgs = [cfgs[i] for i in indexes]

            # update Ncfgs
            Ncfgs = len(cfgs)

        # Finally split the data
        for isplit, split in enumerate(cfgs):

            # determine sample sizes
            if not operator.isSequenceType(self.__nperlabel) \
                   or isinstance(self.__nperlabel, str):
                nperlabelsplit = [self.__nperlabel] * len(split)
            else:
                nperlabelsplit = self.__nperlabel

            # get split datasets
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for ds, nperlabel in zip(split_ds, nperlabelsplit):
                    if ds is not None:
                        # flag whether this was the last split; this
                        # might be the best solution which would scale
                        # if we care about thread-safety etc.
                        ds._dsattr['lastsplit'] = (isplit == Ncfgs - 1)
                        # permute the labels
                        if self.__permute:
                            DS_permuteLabels(ds, True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel == 'all' or ds is None:
                        finalized_datasets.append(ds)
                    else:
                        # We need to select a subset of samples
                        # TODO: move all this logic within getRandomSamples

                        # go for the maximum possible number of samples
                        # provided by each label in this dataset
                        if nperlabel == 'equal':
                            # determine the min number of samples per class
                            npl = N.array(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values()).min()
                        elif isinstance(nperlabel, float) or (
                            operator.isSequenceType(nperlabel) and
                            len(nperlabel) > 0 and
                            isinstance(nperlabel[0], float)):
                            # determine the number of samples per class
                            # and take a ratio
                            counts = N.array(DS_getNSamplesPerLabel(
                                ds, attrib='labels').values())
                            npl = (counts * nperlabel).round().astype(int)
                        else:
                            npl = nperlabel

                        # finally select the patterns
                        finalized_datasets.append(
                            DS_getRandomSamples(ds, npl))

                if self._reverse:
                    yield finalized_datasets[::-1]
                else:
                    yield finalized_datasets

    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            This is the source dataset.
          specs : sequence of sequences
            Contains ids of a sample attribute that shall be split into
            another dataset.

        :Returns: Tuple of split datasets.
        """
        # collect the sample ids for each resulting dataset
        filters = []
        none_specs = 0
        cum_filter = None

        # Prepare discard_boundary
        discard_boundary = self.discard_boundary
        if isinstance(discard_boundary, int):
            if discard_boundary != 0:
                discard_boundary = (discard_boundary,) * len(specs)
            else:
                discard_boundary = None

        splitattr_data = eval('dataset.' + self.__splitattr)
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([ i in spec \
                                    for i in splitattr_data])
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # accumulate the union of all explicitly requested
                    # samples, so that a `None` spec receives the remainder
                    cum_filter = N.logical_or(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError, "Splitter cannot handle more than one `None` " \
                              "split definition."

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

            # If it was told to discard samples on the boundary to the
            # other parts of the split
            if discard_boundary is not None:
                ndiscard = discard_boundary[i]
                if ndiscard != 0:
                    # XXX sloppy implementation for now. It still
                    # should not be the main reason for a slow-down of
                    # the whole analysis ;)
                    f, lenf = filters[i], len(filters[i])
                    f_pad = N.concatenate(([True] * ndiscard, f,
                                           [True] * ndiscard))
                    for d in xrange(2 * ndiscard + 1):
                        f = N.logical_and(f, f_pad[d:d + lenf])
                    filters[i] = f[:]

        # split data: return None if no samples are left
        # XXX: Maybe it should simply return an empty dataset instead, but
        # keeping it this way for now, to maintain current behavior
        split_datasets = []

        # local bindings
        dataset_selectSamples = dataset.selectSamples
        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset_selectSamples(filter_))

        return split_datasets

    def __str__(self):
        """String summary of the object
        """
        return \
            "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
            % (self.__nperlabel, self.__runspersplit, self.__permute)

    def splitcfg(self, dataset):
        """Return the split configuration for a given dataset"""
        return self._getSplitConfig(eval('dataset.unique' + self.__splitattr))


    strategy = property(fget=lambda self: self.__strategy,
                        fset=_setStrategy)

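The interplay of `count` and `strategy` above is easiest to see on a
concrete case. A hedged sketch, reusing the toy `ds` with 4 chunks from
the earlier example::

    from mvpa.datasets.splitters import NFoldSplitter

    # 4 chunks would normally yield 4 splits; request only 2 of them,
    # spread evenly over the available split configurations
    splitter = NFoldSplitter(cvtype=1, count=2, strategy='equidistant')
    for training_ds, testing_ds in splitter(ds):
        # step = 4/2 = 2.0, so configurations 0 and 2 are selected
        print testing_ds.uniquechunks
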
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be `None`.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **kwargs)

        if not mode in NoneSplitter._known_modes:
            raise ValueError, "Unknown mode %s for NoneSplitter" % mode
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)

    def _getSplitConfig(self, uniqueattrs):
        """Return just one split that places the full dataset into either the
        first or the second part, depending on `mode`.
        """
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]


    def __str__(self):
        """String summary of the object
        """
        return \
            "NoneSplitter / " + Splitter.__str__(self)


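A typical use of `NoneSplitter`, suggested by the module docstring, is
sampling a null distribution via label permutation. A sketch under the
same toy-dataset assumptions as above (`permute` and `nrunspersplit` come
from the `Splitter` base class)::

    from mvpa.datasets.splitters import NoneSplitter

    splitter = NoneSplitter(permute=True, nrunspersplit=5)
    for nothing, full_ds in splitter(ds):
        # `nothing` is None; `full_ds` is the complete dataset with
        # labels permuted within each chunk -- 5 such splits are yielded
        print nothing, full_ds.nsamples
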
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """

    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True, the values of the attribute used for splitting will be
            used to determine odd and even samples. If False, odd and even
            chunks are defined by the order of attribute values, i.e. the
            first unique attribute is odd and the second is even, even though
            the corresponding values might indicate the opposite (e.g. in
            case of [2, 3]).
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)

    def _getSplitConfig(self, uniqueattrs):
        """Return two splits over the odd and even attribute values.
        """
        if self.__usevalues:
            return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
                    (None, uniqueattrs[(uniqueattrs % 2) == False])]
        else:
            return [(None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == True]),
                    (None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == False])]


    def __str__(self):
        """String summary of the object
        """
        return \
            "OddEvenSplitter / " + Splitter.__str__(self)


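The `usevalues` switch is best seen with attribute values where position
and parity disagree. A hedged sketch with chunk values [1, 2] (same
toy-dataset assumptions as above)::

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.splitters import OddEvenSplitter

    ds12 = Dataset(samples=N.random.randn(12, 2),
                   labels=[0, 1] * 6,
                   chunks=[1] * 6 + [2] * 6)

    # positional grouping: chunks are grouped by their order of appearance
    for first, second in OddEvenSplitter(usevalues=False)(ds12):
        print first.uniquechunks, second.uniquechunks

    # value-based grouping: chunk 1 is odd and chunk 2 is even, so the
    # chunks are paired differently than in the loop above
    for first, second in OddEvenSplitter(usevalues=True)(ds12):
        print first.uniquechunks, second.uniquechunks
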
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """

    def __init__(self, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)


    __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)

    def _getSplitConfig(self, uniqueattrs):
        """Return two splits, using the first and the second half of the
        unique attribute values respectively.
        """
        return [(None, uniqueattrs[:len(uniqueattrs) / 2]),
                (None, uniqueattrs[len(uniqueattrs) / 2:])]


    def __str__(self):
        """String summary of the object
        """
        return \
            "HalfSplitter / " + Splitter.__str__(self)


class NGroupSplitter(Splitter):
    """Split a dataset into N groups of the sample attribute.

    For example, NGroupSplitter(2) is the same as the HalfSplitter and
    yields two splits: first (1st half, 2nd half) and second (2nd half,
    1st half).
    """

    def __init__(self, ngroups=4, **kwargs):
        """Initialize the N-group splitter.

        :Parameters:
          ngroups : int
            Number of groups to split the attribute into.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        self.__ngroups = ngroups

    __doc__ = enhancedDocString('NGroupSplitter', locals(), Splitter)

    def _getSplitConfig(self, uniqueattrs):
        """Return one split per group, using `coarsenChunks` to assign the
        unique attribute values to groups.
        """

        # make sure there are more attributes than desired groups
        if len(uniqueattrs) < self.__ngroups:
            raise ValueError, "Number of groups (%d) " % (self.__ngroups) + \
                  "must be less than " + \
                  "or equal to the number of unique attributes (%d)" % \
                  (len(uniqueattrs))

        # use coarsenChunks to get the split indices
        split_ind = coarsenChunks(uniqueattrs, nchunks=self.__ngroups)
        split_ind = N.asarray(split_ind)

        # loop and create splits
        split_list = [(None, uniqueattrs[split_ind == i])
                      for i in range(self.__ngroups)]
        return split_list


    def __str__(self):
        """String summary of the object
        """
        return \
            "N-%d-GroupSplitter / " % self.__ngroups + Splitter.__str__(self)


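Since `NGroupSplitter(2)` is equivalent to `HalfSplitter`, one sketch
covers both. It reuses the toy `ds` with chunks 0..3 from the first
example and assumes `coarsenChunks` assigns consecutive chunks to groups::

    from mvpa.datasets.splitters import NGroupSplitter

    for split in NGroupSplitter(ngroups=2)(ds):
        # each split places one group of chunks into the second dataset,
        # e.g. chunks [0, 1] first and chunks [2, 3] second
        print split[1].uniquechunks
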
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Provides folding splitting. Given a dataset with N chunks and
    cvtype=1 (the default), it generates N splits, where each chunk in
    turn is taken out (and returned for subsequent splits) for
    cross-validation. For example, if there are 4 chunks, the splits for
    cvtype=1 are:

        [[1, 2, 3], [0]]
        [[0, 2, 3], [1]]
        [[0, 1, 3], [2]]
        [[0, 1, 2], [3]]

    If cvtype>1, then all possible combinations of cvtype chunks are
    taken out for testing, so for cvtype=2 in the previous example:

        [[2, 3], [0, 1]]
        [[1, 3], [0, 2]]
        [[1, 2], [0, 3]]
        [[0, 3], [1, 2]]
        [[0, 2], [1, 3]]
        [[0, 1], [2, 3]]

    """

    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        # pylint happiness block
        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)

    def __str__(self):
        """String summary of the object
        """
        return \
            "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Return the proper split configuration for an N-M fold split.
        """
        return [(None, i) for i in \
                support.getUniqueLengthNCombinations(uniqueattrs,
                                                     self.__cvtype)]


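A sketch of the cvtype>1 behavior documented above, again on the toy `ds`
with 4 chunks from the first example::

    from mvpa.datasets.splitters import NFoldSplitter

    # 6 splits -- one per C(4,2) pair of chunks taken out for testing
    for training_ds, testing_ds in NFoldSplitter(cvtype=2)(ds):
        print training_ds.uniquechunks, testing_ds.uniquechunks
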
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence of
    split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence of sequences of sample ids for each dataset that shall be
    generated in the split.

    Example:

    * Generate two splits. In the first split the *second* dataset
      contains all samples with sample attributes corresponding to
      either 0, 1 or 2. The *first* dataset of the first split contains
      all samples which are not split into the second dataset.

      The second split yields three datasets. The first with all samples
      corresponding to sample attributes 1 and 2, the second dataset
      contains only samples with attribute 3 and the last dataset
      contains the samples with attributes 5 and 6.

      CustomSplitter([(None, [0, 1, 2]), ([1, 2], [3], [5, 6])])
    """

    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)

        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)

    def _getSplitConfig(self, uniqueattrs):
        """Return the custom splitrule as the split configuration.
        """
        return self.__splitrule


    def __str__(self):
        """String summary of the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
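
Finally, a sketch of the splitrule from the `CustomSplitter` docstring, on
an assumed toy dataset with chunk values 0..6::

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.splitters import CustomSplitter

    ds7 = Dataset(samples=N.random.randn(14, 2),
                  labels=[0, 1] * 7,
                  chunks=N.repeat(N.arange(7), 2))

    splitter = CustomSplitter([(None, [0, 1, 2]), ([1, 2], [3], [5, 6])])
    for split in splitter(ds7):
        # first split: 2 datasets; second split: 3 datasets (a part that
        # matches no sample would be returned as None)
        print [None if d is None else d.uniquechunks for d in split]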