9 """Dataset container"""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14 import random
15 import mvpa.support.copy as copy
16 import numpy as N

from mvpa.misc.exceptions import DatasetError
from mvpa.misc.support import idhash as idhash_
from mvpa.base.dochelpers import enhancedDocString, table2string

from mvpa.base import warning

if __debug__:
    from mvpa.base import debug
33 """Helper function to validate that seq contains unique sorted values
34 """
35 if operator.isSequenceType(seq):
36 seq_unique = N.unique(seq)
37 if len(seq) != len(seq_unique):
38 warning("%s() operates only with indexes for %s without"
39 " repetitions. Repetitions were removed."
40 % (fname, item))
41 if N.any(N.sort(seq) != seq_unique):
42 warning("%s() does not guarantee the original order"
43 " of selected %ss. Use selectSamples() and "
44 " selectFeatures(sort=False) instead" % (fname, item))
45
49 """*The* Dataset.
50
51 This class provides a container to store all necessary data to
52 perform MVPA analyses. These are the data samples, as well as the
53 labels associated with the samples. Additionally, samples can be
54 grouped into chunks.
55
56 :Groups:
57 - `Creators`: `__init__`, `selectFeatures`, `selectSamples`,
58 `applyMapper`
59 - `Mutators`: `permuteLabels`
60
61 Important: labels assumed to be immutable, i.e. no one should modify
62 them externally by accessing indexed items, ie something like
63 ``dataset.labels[1] += 100`` should not be used. If a label has
64 to be modified, full copy of labels should be obtained, operated on,
65 and assigned back to the dataset, otherwise dataset.uniquelabels
66 would not work. The same applies to any other attribute which has
67 corresponding unique* access property.
68
69 """
70
71
72
73
74
75
76
77
78
79
80
81
82
83 _uniqueattributes = []
84 """Unique attributes associated with the data"""
85
86 _registeredattributes = []
87 """Registered attributes (stored in _data)"""
88
89 _requiredattributes = ['samples', 'labels']
90 """Attributes which have to be provided to __init__, or otherwise
91 no default values would be assumed and construction of the
92 instance would fail"""
93
94
95
96
97
98
99
100
101
102
    def __init__(self,
                 # for copy constructor mode
                 data=None,
                 dsattr=None,
                 # automatic dtype conversion
                 dtype=None,
                 # new dataset constructor mode
                 samples=None,
                 labels=None,
                 labels_map=None,
                 chunks=None,
                 origids=None,
                 # flags
                 check_data=True,
                 copy_samples=False,
                 copy_data=True,
                 copy_dsattr=True):
120 """Initialize dataset instance
121
122 There are basically two different way to create a dataset:
123
124 1. Create a new dataset from samples and sample attributes. In
125 this mode a two-dimensional `ndarray` has to be passed to the
126 `samples` keyword argument and the corresponding samples
127 attributes are provided via the `labels` and `chunks`
128 arguments.
129
130 2. Copy contructor mode
131 The second way is used internally to perform quick coyping
132 of datasets, e.g. when performing feature selection. In this
133 mode and the two dictionaries (`data` and `dsattr`) are
134 required. For performance reasons this mode bypasses most of
135 the sanity check performed by the previous mode, as for
136 internal operations data integrity is assumed.
137
138
139 :Parameters:
140 data : dict
141 Dictionary with an arbitrary number of entries. The value for
142 each key in the dict has to be an ndarray with the
143 same length as the number of rows in the samples array.
144 A special entry in this dictionary is 'samples', a 2d array
145 (samples x features). A shallow copy is stored in the object.
146 dsattr : dict
147 Dictionary of dataset attributes. An arbitrary number of
148 arbitrarily named and typed objects can be stored here. A
149 shallow copy of the dictionary is stored in the object.
150 dtype: type | None
151 If None -- do not change data type if samples
152 is an ndarray. Otherwise convert samples to dtype.
153
154
155 :Keywords:
156 samples : ndarray
157 2d array (samples x features)
158 labels
159 An array or scalar value defining labels for each samples.
160 Generally `labels` should be numeric, unless `labels_map`
161 is used
162 labels_map : None or bool or dict
163 Map original labels into numeric labels. If True, the
164 mapping is computed if labels are literal. If is False,
165 no mapping is computed. If dict instance -- provided
166 mapping is verified and applied. If you want to have
167 labels_map just be present given already numeric labels,
168 just assign labels_map dictionary to existing dataset
169 instance
170 chunks
171 An array or scalar value defining chunks for each sample
172
173 Each of the Keywords arguments overwrites what is/might be
174 already in the `data` container.
175
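
        Example: a minimal sketch of the first construction mode (the
        array values below are purely illustrative)::

            samples = N.random.randn(4, 2)      # 4 samples x 2 features
            ds = Dataset(samples=samples,
                         labels=[0, 0, 1, 1],
                         chunks=[0, 1, 0, 1])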
176 """

        # see if we were provided with data and/or dsattr dicts
        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        # copy the containers as requested -- otherwise chaos is possible
        # if data is modified behind the dataset's back
        if copy_data:
            # deep copy: copy values individually (cannot use copy.deepcopy,
            # since the samples dtype should be preserved)
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if not requested
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy of the container only
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr) > 0:
            # deep copy of dataset attributes
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)
        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        # bind the local containers to the instance
        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # store samples (and possibly transform/reshape/retype them)
        if not samples is None:
            if __debug__:
                if lcl_data.has_key('samples'):
                    debug('DS',
                          "`Data` dict has `samples` (%s) but there is also" \
                          " __init__ parameter `samples` which overrides " \
                          " stored in `data`" % (`lcl_data['samples'].shape`))
            lcl_data['samples'] = self._shapeSamples(samples, dtype,
                                                     copy_samples)

        if not labels is None:
            if __debug__:
                if lcl_data.has_key('labels'):
                    debug('DS',
                          "`Data` dict has `labels` (%s) but there is also" \
                          " __init__ parameter `labels` which overrides " \
                          " stored in `data`" % (`lcl_data['labels']`))
            if lcl_data.has_key('samples'):
                lcl_data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not lcl_data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        nsamples = self.nsamples

        # assign default chunks if necessary
        if chunks is not None:
            lcl_data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not lcl_data.has_key('chunks'):
            # if no chunk information is given assume that every sample
            # is its own chunk
            lcl_data['chunks'] = N.arange(nsamples)

        # handle origids
        if not origids is None:
            # simply assign if provided
            lcl_data['origids'] = origids
        elif not lcl_data.has_key('origids'):
            # otherwise construct unique ones
            lcl_data['origids'] = N.arange(len(lcl_data['labels']))
        else:
            # assume origids were transferred within the `data` dict
            pass

        # initialize any remaining registered attribute
        for attr in self._registeredattributes:
            if not lcl_data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                lcl_data[attr] = N.zeros(nsamples)

        # labels might be provided as literals -- map them if requested
        labels_ = N.asarray(lcl_data['labels'])
        labels_map_known = lcl_dsattr.has_key('labels_map')
        if labels_map is True:
            # need to compute labels_map
            if labels_.dtype.char == 'S':
                # create the mapping label -> numeric id
                ulabels = list(set(labels_))
                ulabels.sort()
                labels_map = dict([(x[1], x[0]) for x in enumerate(ulabels)])
                if __debug__:
                    debug('DS', 'Mapping for the labels computed to be %s'
                          % labels_map)
            else:
                if __debug__:
                    debug('DS', 'Mapping of labels was requested but labels '
                          'are not strings. Skipped')
                labels_map = None
        elif labels_map is False:
            labels_map = None

        if isinstance(labels_map, dict):
            if labels_map_known:
                if __debug__:
                    debug('DS',
                          "`dsattr` dict has `labels_map` (%s) but there is also" \
                          " __init__ parameter `labels_map` (%s) which overrides " \
                          " stored in `dsattr`" % (lcl_dsattr['labels_map'], labels_map))

            lcl_dsattr['labels_map'] = labels_map
            # map labels if they are strings or remapping was explicitly asked
            if labels_.dtype.char == 'S' or not labels_map_known:
                if __debug__:
                    debug('DS_', "Remapping labels using mapping %s" % labels_map)
                try:
                    lcl_data['labels'] = N.array(
                        [labels_map[x] for x in lcl_data['labels']])
                except KeyError, e:
                    raise ValueError, "Provided labels_map %s is insufficient " \
                          "to map all the labels. Mapping for label %s is " \
                          "missing" % (labels_map, e)

        elif not lcl_dsattr.has_key('labels_map'):
            lcl_dsattr['labels_map'] = labels_map
        elif __debug__:
            debug('DS_', 'Not overriding labels_map in dsattr since it has one')

        if check_data:
            self._checkData()

        # reset unique* members whenever labels or chunks were (re)assigned
        # in this constructor call
        if not labels is None or not chunks is None:
            lcl_dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)


    __doc__ = enhancedDocString('Dataset', locals())


    @property
    def idhash(self):
        """To verify if dataset is in the same state as when something else
        was done

        e.g. if a classifier was trained on the same dataset as in question
        """
        _data = self._data
        res = idhash_(_data)

        # process the keys in a deterministic order since the resultant
        # idhash would otherwise be order-dependent
        keys = _data.keys()
        keys.sort()
        for k in keys:
            res += idhash_(_data[k])
        return res


    def _resetallunique(self, force=False):
        """Set to None all unique* attributes of corresponding dictionary
        """
        _dsattr = self._dsattr

        if not force and _dsattr['__uniquereseted']:
            return

        _uniqueattributes = self._uniqueattributes

        if __debug__ and "DS_" in debug.active:
            debug("DS_", "Resetting all attributes %s for dataset %s"
                  % (_uniqueattributes,
                     self.summary(uniq=False, idhash=False,
                                  stats=False, lstats=False)))

        # set all unique* attributes to None to trigger lazy recomputation
        for k in _uniqueattributes:
            _dsattr[k] = None
        _dsattr['__uniquereseted'] = True


    def _getuniqueattr(self, attrib, dict_):
        """Provide common facility to return unique attributes

        XXX `dict_` can be simply replaced now with self._dsattr
        """
        # local bindings
        _dsattr = self._dsattr

        if not _dsattr.has_key(attrib) or _dsattr[attrib] is None:
            if __debug__ and 'DS_' in debug.active:
                debug("DS_", "Recomputing unique set for attrib %s within %s" %
                      (attrib, self.summary(uniq=False,
                                            stats=False, lstats=False)))
            # recompute; strip the 'unique' prefix to obtain the name of
            # the source attribute within dict_
            _dsattr[attrib] = N.unique(N.asanyarray(dict_[attrib[6:]]))
            assert(not _dsattr[attrib] is None)
            _dsattr['__uniquereseted'] = False

        return _dsattr[attrib]


    def _setdataattr(self, attrib, value):
        """Provide common facility to set attributes
        """
        if len(value) != self.nsamples:
            raise ValueError, \
                  "Provided %s have %d entries while there are %d samples" % \
                  (attrib, len(value), self.nsamples)
        self._data[attrib] = N.asarray(value)
        uniqueattr = "unique" + attrib

        # invalidate the cached unique values for this attribute
        _dsattr = self._dsattr
        if _dsattr.has_key(uniqueattr):
            _dsattr[uniqueattr] = None


    def _getNSamplesPerAttr(self, attrib='labels'):
        """Returns the number of samples per unique value of a sample
        attribute.
        """
        # local bindings
        _data = self._data

        # make sure the unique values are computed
        uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
                                         dict_=_data)

        # use a dictionary to cope with arbitrary attribute values
        result = dict(zip(uniqueattr, [0] * len(uniqueattr)))
        for l in _data[attrib]:
            result[l] += 1

        return result


    def _getSampleIdsByAttr(self, values, attrib="labels", sort=True):
        """Return indices of samples given a list of attribute values
462 """

        if not operator.isSequenceType(values) \
               or isinstance(values, basestring):
            values = [values]

        # collect the ids matching any of the requested values
        sel = N.array([], dtype=N.int16)
        _data = self._data
        for value in values:
            sel = N.concatenate((
                sel, N.where(_data[attrib] == value)[0]))

        if sort:
            # place samples in the right order
            sel.sort()

        return sel


    def idsonboundaries(self, prior=0, post=0,
                        attributes_to_track=['labels', 'chunks'],
                        affected_labels=None,
                        revert=False):
487 """Find samples which are on the boundaries of the blocks
488
489 Such samples might need to be removed. By default (with
490 prior=0, post=0) ids of the first samples in a 'block' are
491 reported
492
493 :Parameters:
494 prior : int
495 how many samples prior to transition sample to include
496 post : int
497 how many samples post the transition sample to include
498 attributes_to_track : list of basestring
499 which attributes to track to decide on the boundary condition
500 affected_labels : list of basestring
501 for which labels to perform selection. If None - for all
502 revert : bool
503 either to revert the meaning and provide ids of samples which are found
504 to not to be boundary samples
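
        Example: a sketch, assuming a dataset ``ds`` with a block design::

            # ids of the first sample of every block plus the one after it
            ids = ds.idsonboundaries(prior=0, post=1)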
505 """
        # local bindings
        _data = self._data
        labels = self.labels
        nsamples = self.nsamples

        lastseen = none = [None for attr in attributes_to_track]
        transitions = []

        for i in xrange(nsamples + 1):
            if i < nsamples:
                current = [_data[attr][i] for attr in attributes_to_track]
            else:
                current = none
            if lastseen != current:
                # transition point
                new_transitions = range(max(0, i - prior),
                                        min(nsamples - 1, i + post) + 1)
                if affected_labels is not None:
                    new_transitions = [t for t in new_transitions
                                       if labels[t] in affected_labels]
                transitions += new_transitions
                lastseen = current

        transitions = set(transitions)
        if revert:
            transitions = set(range(nsamples)).difference(transitions)

        # postprocess
        transitions = N.array(list(transitions))
        transitions.sort()
        return list(transitions)


    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples

        Handle all possible input value for 'samples' and transform
        them into a 2d (samples x feature) representation.
        """
        # put samples array into correct shape
        # 1d arrays or simple sequences are assumed to be a single pattern
        if (not isinstance(samples, N.ndarray)):
            # it is safe to provide dtype which defaults to None,
            # when N would choose appropriate dtype automagically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (not dtype is None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d sample " \
                  + "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples


    def _checkData(self):
        """Checks `_data` members to have the same # of samples.
        """
        nsamples = self.nsamples
        _data = self._data

        for k, v in _data.iteritems():
            if not len(v) == nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), nsamples)

        # check for unique origids
        uniques = N.unique(_data['origids'])
        uniques.sort()
        # need to copy to prevent sorting the original array
        sorted_ids = _data['origids'].copy()
        sorted_ids.sort()

        if not (uniques == sorted_ids).all():
            raise DatasetError, "Samples IDs are not unique."

        # check whether labels are literal and should better be mapped
        if N.asanyarray(_data['labels']).dtype.char == 'S':
            warning('Labels for dataset %s are literal, should be numeric. '
                    'You might like to use labels_map argument.' % self)

605 """If a sample attribute is given as a scalar expand/repeat it to a
606 length matching the number of samples in the dataset.
607 """
608 try:
609
610
611 if isinstance(attr, basestring):
612 raise TypeError
613 if len(attr) != self.nsamples:
614 raise DatasetError, \
615 "Length of sample attribute '%s' [%d]" \
616 % (attr_name, len(attr)) \
617 + " has to match the number of samples" \
618 + " [%d]." % self.nsamples
619
620 return N.array(attr)
621
622 except TypeError:
623
624
625 return N.repeat(attr, self.nsamples)
626

    @classmethod
    def _registerAttribute(cls, key, dictname="_data", abbr=None,
                           hasunique=False):
        """Register an attribute for any Dataset class.

        Creates property assigning getters/setters depending on the
        availability of corresponding _get, _set functions.
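
        Example: a sketch of the properties this creates, mirroring the
        module-level registration calls below::

            Dataset._registerAttribute("labels", "_data", abbr='L',
                                       hasunique=True)
            # now available: ds.labels, ds.L, ds.uniquelabels, ds.UL,
            # ds.samplesperlabel, ds.idsbylabels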
634 """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)
            # define get function and use corresponding
            # _get<ATTR> if such is defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use corresponding
            # _set<ATTR> if such is defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif dictname == "_data":
                setter = "lambda self, x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if abbr is not None:
                exec "%s.%s = property(fget=%s, fset=%s)" % \
                     (cls.__name__, abbr, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)
                if abbr is not None:
                    exec "%s.U%s = property(fget=%s)" % \
                         (cls.__name__, abbr, getter)

                # create samplesper<ATTR> property
                sampleskey = "samplesper%s" % key[:-1] # remove trailing 's'
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<ATTR> convenience method
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x, attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to reregister attribute `%s`. For now ' % key +
                    'such capability is not present')


    def __str__(self):
        """String summary over the object
        """
        try:
            ssummary = self.summary(uniq=True,
                                    idhash=__debug__ and ('DS_ID' in debug.active),
                                    stats=__debug__ and ('DS_STATS' in debug.active),
                                    lstats=__debug__ and ('DS_STATS' in debug.active),
                                    )
        except (AttributeError, KeyError), e:
            # __str__ might be requested before the instance is fully
            # initialized
            ssummary = str(e)
        return ssummary


    def __repr__(self):
        return "<%s>" % str(self)


    def summary(self, uniq=True, stats=True, idhash=False, lstats=True,
                maxc=30, maxl=20):
        """String summary over the object

        :Parameters:
          uniq : bool
            Include summary over data attributes which have unique values
          idhash : bool
            Include idhash value for dataset and samples
          stats : bool
            Include some basic statistics (mean, std, var) over dataset samples
          lstats : bool
            Include statistics on chunks/labels
          maxc : int
            Maximal number of chunks when providing details on labels/chunks
          maxl : int
            Maximal number of labels when providing details on labels/chunks
        """
        # local bindings
        samples = self.samples
        _data = self._data
        _dsattr = self._dsattr

        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        ssep = (' ', '\n')[lstats]
        if uniq:
            s += "%suniq:" % ssep
            for uattr in _dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=_data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if isinstance(self.labels_map, dict):
            s += ' labels_mapped'

        if stats:
            if self.nfeatures:
                s += "%sstats: mean=%g std=%g var=%g min=%g max=%g\n" % \
                     (ssep, N.mean(samples), N.std(samples),
                      N.var(samples), N.min(samples), N.max(samples))
            else:
                s += "%sstats: dataset has no features\n" % ssep

        if lstats:
            s += self.summary_labels(maxc=maxc, maxl=maxl)

        return s


    def summary_labels(self, maxc=30, maxl=20):
        """Provide summary statistics over the labels and chunks

        :Parameters:
          maxc : int
            Maximal number of chunks when providing details
          maxl : int
            Maximal number of labels when providing details
        """
        # local import to avoid circular imports
        from mvpa.datasets.miscfx import getSamplesPerChunkLabel
        spcl = getSamplesPerChunkLabel(self)

        ul = self.uniquelabels.tolist()
        uc = self.uniquechunks.tolist()
        s = ""
        if len(ul) < maxl and len(uc) < maxc:
            s += "\nCounts of labels in each chunk:"
            # first row -- labels, then one row per chunk
            table = [[' chunks\labels'] + ul]
            table += [[''] + ['---'] * len(ul)]
            for c, counts in zip(uc, spcl):
                table.append([str(c)] + counts.tolist())
            s += '\n' + table2string(table)
        else:
            s += "No details due to large number of labels or chunks. " \
                 "Increase maxc and maxl if desired"

        labels_map = self.labels_map
        if isinstance(labels_map, dict):
            s += "\nOriginal labels were mapped using following mapping:"
            s += '\n\t' + '\n\t'.join([':\t'.join(map(str, x))
                                       for x in labels_map.items()]) + '\n'

        def cl_stats(axis, u, name1, name2):
            """Compute statistics per label
            """
            stats = {'min': N.min(spcl, axis=axis),
                     'max': N.max(spcl, axis=axis),
                     'mean': N.mean(spcl, axis=axis),
                     'std': N.std(spcl, axis=axis),
                     '#%ss' % name2: N.sum(spcl > 0, axis=axis)}
            entries = [' ' + name1, 'mean', 'std', 'min', 'max', '#%ss' % name2]
            table = [entries]
            for i, l in enumerate(u):
                d = {' ' + name1: l}
                d.update(dict([(k, stats[k][i]) for k in stats.keys()]))
                table.append([('%.3g', '%s')[isinstance(d[e], basestring)]
                              % d[e] for e in entries])
            return '\nSummary per %s across %ss\n' % (name1, name2) \
                   + table2string(table)

        if len(ul) < maxl:
            s += cl_stats(0, ul, 'label', 'chunk')
        if len(uc) < maxc:
            s += cl_stats(1, uc, 'chunk', 'label')
        return s


    def __iadd__(self, other):
        """Merge the samples of one Dataset object to another (in-place).

        No dataset attributes, besides labels_map, will be merged!
        Additionally, a new set of unique `origids` will be generated.
        """
        # local bindings
        _data = self._data
        other_data = other._data

        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                  "features do not match."

        # take care about labels_map
        slm = self.labels_map
        olm = other.labels_map
        if N.logical_xor(slm is None, olm is None):
            raise ValueError, "Cannot add datasets where only one of them " \
                  "has labels map assigned. If needed -- implement it"

        # concatenate all sample attributes
        for k, v in _data.iteritems():
            if k == 'origids':
                # special case: simply regenerate unique origids
                _data[k] = N.arange(len(v) + len(other_data[k]))

            elif k == 'labels' and slm is not None:
                # combine labels_maps and remap `other`'s labels accordingly
                nlm = slm.copy()
                # figure out the next free numeric id
                nextid = N.sort(nlm.values())[-1] + 1
                olabels = other.labels
                olabels_remap = {}
                for ol, olnum in olm.iteritems():
                    if not nlm.has_key(ol):
                        # check if other's id is not in use
                        if olnum in nlm.values():
                            nextid = N.sort(nlm.values() + olm.values())[-1] + 1
                        else:
                            nextid = olnum
                        olabels_remap[olnum] = nextid
                        nlm[ol] = nextid
                        nextid += 1
                    else:
                        olabels_remap[olnum] = nlm[ol]
                olabels = [olabels_remap[x] for x in olabels]
                # finally compose new labels
                _data['labels'] = N.concatenate((v, olabels), axis=0)
                # and reassign the merged mapping
                self._dsattr['labels_map'] = nlm

                if __debug__:
                    # check if we are not dealing with degenerate mappings
                    if (len(set(slm.keys())) != len(set(slm.values()))) or \
                       (len(set(olm.keys())) != len(set(olm.values()))):
                        warning("Adding datasets where multiple labels "
                                "mapped to the same ID is not recommended. "
                                "Please check the outcome. Original mappings "
                                "were %s and %s. Resultant is %s"
                                % (slm, olm, nlm))

            else:
                _data[k] = N.concatenate((v, other_data[k]), axis=0)

        # might be more sophisticated but for now just reset -- it is safer ;)
        self._resetallunique()

        return self


    def __add__(self, other):
        """Merge the samples of two Dataset objects.

        All data of both datasets is copied, concatenated and a new Dataset is
        returned.

        NOTE: This can be a costly operation (both memory and time). If
        performance is important consider the '+=' operator.
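
        Example: a sketch, assuming two compatible datasets ``ds1`` and
        ``ds2``::

            merged = ds1 + ds2      # copies both operands
            ds1 += ds2              # cheaper in-place merge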
954 """
        # create a new object of the same type it is now
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        out += other

        return out


    def copy(self, deep=True):
        """Create a copy (clone) of the dataset, by fully copying current one

        :Keywords:
          deep : bool
            deep flag is provided to __init__ for
            copy_{samples,data,dsattr}. By default full copy is done.
        """
        # create a new object of the same type
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict); honour the `deep` flag as documented
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=deep,
                     copy_data=deep,
                     copy_dsattr=deep)

        return out


    def selectFeatures(self, ids=None, sort=True, groups=None):
        """Select a number of features from the current set.

        :Parameters:
          ids
            iterable container to select ids
          sort : bool
            whether to sort the ids. Order matters and `selectFeatures`
            assumes incremental order. If not sorted, in non-optimized
            code selectFeatures would verify the order and sort

        Returns a new Dataset object with a copy of corresponding features
        from the original samples array.

        WARNING: The order of ids determines the order of features in
        the returned dataset. This might be useful sometimes, but can
        also cause major headaches! Order is verified when
        running in non-optimized code (if __debug__)
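
        Example: a sketch, assuming a dataset ``ds``::

            ds2 = ds.selectFeatures([0, 2, 5])   # keep three features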
1011 """
        if ids is None and groups is None:
            raise ValueError, "No feature selection specified."

        # start with an empty list if no ids were specified (so just groups)
        if ids is None:
            ids = []

        if not groups is None:
            if not self._dsattr.has_key('featuregroups'):
                raise RuntimeError, \
                      "Dataset has no feature grouping information."

            for g in groups:
                ids += (self._dsattr['featuregroups'] == g).nonzero()[0].tolist()

        # sort, or at least verify the ordering in non-optimized mode
        if sort:
            ids = copy.deepcopy(ids)
            ids.sort()
        elif __debug__ and 'CHECK_DS_SORTED' in debug.active:
            from mvpa.misc.support import isSorted
            if not isSorted(ids):
                warning("IDs for selectFeatures must be provided " +
                        "in sorted order, otherwise major headache might occur")

        # shallow-copy all stuff from the current data dict
        new_data = self._data.copy()

        # assign the selected features -- data is still shared with
        # the current dataset
        new_data['samples'] = self._data['samples'][:, ids]

        # apply the selection to feature groups as well (if any)
        if self._dsattr.has_key('featuregroups'):
            new_dsattr = self._dsattr.copy()
            new_dsattr['featuregroups'] = self._dsattr['featuregroups'][ids]
        else:
            new_dsattr = self._dsattr

        # create a new object of the same type it is now
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset


    def applyMapper(self, featuresmapper=None, samplesmapper=None,
                    train=True):
        """Obtain new dataset by applying mappers over features and/or samples.

        While featuresmappers leave the sample attributes information
        unchanged, as the number of samples in the dataset is invariant,
        samplesmappers are also applied to the samples attributes themselves!

        Applying a featuresmapper will destroy any feature grouping information.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples
          train : bool
            Flag whether to train the mapper with this dataset before applying
            it.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
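
        Example: a sketch, assuming a dataset ``ds`` and an already
        constructed features mapper ``mapper``::

            ds2 = ds.applyMapper(featuresmapper=mapper)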
1089 """
        # shallow-copy all stuff from the current data dict
        new_data = self._data.copy()

        # apply mappers
        if samplesmapper:
            if train:
                if __debug__:
                    debug("DS", "Training samplesmapper %s" % `samplesmapper`)
                samplesmapper.train(self)

            if __debug__:
                debug("DS", "Applying samplesmapper %s" % `samplesmapper` +
                      " to samples of dataset `%s`" % `self`)

            # remove origids since they are not immune to samplesmappers
            if new_data.has_key('origids'):
                del(new_data['origids'])

            # map samples and all remaining sample attributes
            for k in new_data.keys():
                new_data[k] = samplesmapper.forward(self._data[k])

        # feature mapping might affect dataset attributes
        new_dsattr = self._dsattr

        if featuresmapper:
            if train:
                if __debug__:
                    debug("DS", "Training featuresmapper %s" % `featuresmapper`)
                featuresmapper.train(self)

            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

            # applying a featuresmapper destroys feature grouping information
            if self._dsattr.has_key('featuregroups'):
                new_dsattr = self._dsattr.copy()
                del(new_dsattr['featuregroups'])
            else:
                new_dsattr = self._dsattr

        # create a new object of the same type it is now
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        # sample attributes change under a samplesmapper -- recompute unique*
        if samplesmapper:
            dataset._resetallunique(force=True)

        return dataset


    def selectSamples(self, ids):
        """Choose a subset of samples defined by samples IDs.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
              list of ids and is not ordered. Clarify with Michael what is
              our intent here!
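
        Example: a sketch, assuming a dataset ``ds``::

            ds2 = ds.selectSamples([0, 2, 4])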
1165 """
        # without a sequence as index the masked samples array would
        # lose its 2d layout
        if not operator.isSequenceType(ids):
            ids = [ids]

        # select the given ids from every sample attribute
        data = {}
        for k, v in self._data.iteritems():
            data[k] = v[ids, ]

        # create a new object of the same type it is now
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False)

        dataset._resetallunique(force=True)
        return dataset


    def index(self, *args, **kwargs):
        """Universal indexer to obtain indexes of interesting samples/features.
        See .select() for more information

        :Return: tuple of (samples indexes, features indexes). Each
          item could also be None, if no selection on samples or
          features was requested (to discriminate between no selected
          items, and no selections)
        """
        s_indx = []                     # selections for samples
        f_indx = []                     # selections for features
        # popped so it is not treated as an attribute selection below
        return_dataset = kwargs.pop('return_dataset', False)
        largs = len(args)

        args = list(args)               # so we could override
        # figure out how many leading positional arguments are plain
        # samples/features selections (and not attribute/value pairs)
        largs_nonstring = 0
        for i in xrange(largs):
            l = args[i]
            if isinstance(l, basestring):
                if l.lower() == 'all':
                    # great -- select all
                    args[i] = slice(None)
                else:
                    break
            largs_nonstring += 1

        if largs_nonstring >= 1:
            s_indx.append(args[0])
            if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                _validate_indexes_uniq_sorted(args[0], 'select', 'samples')
            if largs_nonstring == 2:
                f_indx.append(args[1])
                if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                    _validate_indexes_uniq_sorted(args[1], 'select', 'features')
            elif largs_nonstring > 2:
                raise ValueError, "Only two positional arguments are allowed" \
                      ". 1st for samples, 2nd for features"

        # the remaining positional arguments must encode selections like
        # ('labels', [1, 2, 3]), thus they must come in pairs
        if (largs - largs_nonstring) % 2 != 0:
            raise ValueError, "Positional selections must come in pairs:" \
                  " e.g. ('labels', [1,2,3])"

        for i in xrange(largs_nonstring, largs, 2):
            k, v = args[i:i + 2]
            kwargs[k] = v

        # process keyword selections
        data_ = self._data
        for k, v in kwargs.iteritems():
            if k == 'samples':
                s_indx.append(v)
            elif k == 'features':
                f_indx.append(v)
            elif data_.has_key(k):
                # selection by the value of a sample attribute
                if __debug__:
                    if not N.any([isinstance(v, cls) for cls in
                                  [list, tuple, slice, int]]):
                        raise ValueError, "Trying to specify selection for %s " \
                              "based on unsupported '%s'" % (k, v)
                s_indx.append(self._getSampleIdsByAttr(v, attrib=k, sort=False))
            else:
                raise ValueError, 'Keyword "%s" is not known, thus ' \
                      'select() failed' % k

        def combine_indexes(indx, nelements):
            """Helper function: intersect selections given in indx

            :Parameters:
              indx : list of lists or slices
                selections of elements
              nelements : int
                number of elements total for deriving indexes from slices
            """
            indx_sel = None                 # pure list of ids for selection
            for s in indx:
                if isinstance(s, slice) or \
                   isinstance(s, N.ndarray) and s.dtype == bool:
                    # convert slices and boolean masks into indexes
                    all_indexes = N.arange(nelements)
                    s = all_indexes[s]
                elif not operator.isSequenceType(s):
                    s = [s]

                if indx_sel is None:
                    indx_sel = set(s)
                else:
                    # intersect with the previous selections
                    indx_sel = indx_sel.intersection(s)

            # if we got a set -- convert
            if isinstance(indx_sel, set):
                indx_sel = list(indx_sel)

            # sort for the sake of sanity
            indx_sel.sort()

            return indx_sel

        # select samples
        if len(s_indx) == 1 and isinstance(s_indx[0], slice) \
               and s_indx[0] == slice(None):
            # so no actual selection was requested among samples
            s_indx = s_indx[0]
        else:
            # we need to do the selection with combined indexes
            if len(s_indx) == 0:
                s_indx = None
            else:
                s_indx = combine_indexes(s_indx, self.nsamples)

        # select features
        if len(f_indx):
            f_indx = combine_indexes(f_indx, self.nfeatures)
        else:
            f_indx = None

        return s_indx, f_indx


    def select(self, *args, **kwargs):
        """Universal selector

        WARNING: if you need to select duplicate samples
        (e.g. samples=[5,5]), or the order of selected samples or features
        matters and must stay unsorted (e.g. samples=[3,2,1]),
        please use selectFeatures or selectSamples functions directly

        Examples:
          Mimic plain selectSamples::

            dataset.select([1,2,3])
            dataset[[1,2,3]]

          Mimic plain selectFeatures::

            dataset.select(slice(None), [1,2,3])
            dataset.select('all', [1,2,3])
            dataset[:, [1,2,3]]

          Mixed (select features and samples)::

            dataset.select([1,2,3], [1, 2])
            dataset[[1,2,3], [1, 2]]

          Select samples matching some attributes::

            dataset.select(labels=[1,2], chunks=[2,4])
            dataset.select('labels', [1,2], 'chunks', [2,4])
            dataset['labels', [1,2], 'chunks', [2,4]]

          Mixed -- out of first 100 samples, select only those with
          labels 1 or 2 and belonging to chunks 2 or 4, and select
          features 2 and 3::

            dataset.select(slice(0,100), [2,3], labels=[1,2], chunks=[2,4])
            dataset[:100, [2,3], 'labels', [1,2], 'chunks', [2,4]]

        """
        s_indx, f_indx = self.index(*args, **kwargs)

        # select samples
        if s_indx == slice(None):
            # so no actual selection -- just keep this dataset
            if __debug__:
                debug('DS', 'in select() not selecting samples')
            ds = self
        else:
            # else do the selection
            if __debug__:
                debug('DS', 'in select() selecting samples given selections'
                      + str(s_indx))
            ds = self.selectSamples(s_indx)

        # select features
        if f_indx is not None:
            if __debug__:
                debug('DS', 'in select() selecting features given selections'
                      + str(f_indx))
            ds = ds.selectFeatures(f_indx)

        return ds



    def where(self, *args, **kwargs):
        """Obtain indexes of interesting samples/features. See select()
        for more information

        XXX somewhat obsoletes idsby...
        """
        s_indx, f_indx = self.index(*args, **kwargs)
        if s_indx is not None and f_indx is not None:
            return s_indx, f_indx
        elif s_indx is not None:
            return s_indx
        else:
            return f_indx


    def __getitem__(self, *args):
        """Convenience dataset parts selection

        See select for more information
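
        Example: a sketch, assuming a dataset ``ds``::

            ds['labels', [1, 2]]        # samples with labels 1 or 2
            ds[:10, [0, 1]]             # first 10 samples, two features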
1423 """
        # uniformize: e.g. ds[1, 2] arrives as a single tuple argument
        if len(args) == 1 and isinstance(args[0], tuple):
            args = args[0]

        # expand string-keyed slices like ['chunks':[2,3]] into
        # ('chunks', [2,3]) pairs understood by select()
        args_, args = args, ()
        for a in args_:
            if isinstance(a, slice) and \
               isinstance(a.start, basestring):
                if a.stop is None or a.step is not None:
                    raise ValueError, \
                          "Selection must look like ['chunks':[2,3]]"
                args += (a.start, a.stop)
            else:
                args += (a,)
        return self.select(*args)


    def permuteLabels(self, status, perchunk=True, assure_permute=False):
        """Permute the labels.

        TODO: rename status into something closer in semantics.

        :Parameters:
          status : bool
            Calling this method with status set to True, the labels are
            permuted among all samples. If 'status' is False the
            original labels are restored.
          perchunk : bool
            If True permutation is limited to samples sharing the same
            chunk value. Therefore only the association of a certain
            sample with a label is permuted while keeping the absolute
            number of occurrences of each label value within a certain
            chunk constant.
          assure_permute : bool
            If True, assures that labels are permuted, i.e. any one is
            different from the original one
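
        Example: a sketch, assuming a dataset ``ds``::

            ds.permuteLabels(True)      # permute (e.g. for a null distribution)
            ds.permuteLabels(False)     # restore the original labels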
1461 """
        # local bindings
        _data = self._data

        if len(self.uniquelabels) < 2:
            raise RuntimeError, \
                  "Call to permuteLabels is bogus since there is an " \
                  "insufficient number of labels: %s" % self.uniquelabels

        if not status:
            # restore originals
            if _data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                                    'permuteLabels() has never been ' \
                                    'called with status == True.'
            self.labels = _data['origlabels']
            _data.pop('origlabels')
        else:
            # store the original labels, but only if not yet done, otherwise
            # multiple calls with status == True would destroy them
            if not _data.has_key('origlabels') \
                   or _data['origlabels'] is None:
                # bind old labels to origlabels
                _data['origlabels'] = _data['labels']
                # copy labels
                _data['labels'] = copy.copy(_data['labels'])

            labels = _data['labels']
            # now scramble
            if perchunk:
                for o in self.uniquechunks:
                    labels[self.chunks == o] = \
                        N.random.permutation(labels[self.chunks == o])
            else:
                labels = N.random.permutation(labels)

            self.labels = labels

            if assure_permute:
                if not (_data['labels'] != _data['origlabels']).any():
                    if not (assure_permute is True):
                        if assure_permute == 1:
                            raise RuntimeError, \
                                  "Cannot assure permutation of labels %s for " \
                                  "some reason with chunks %s and while " \
                                  "perchunk=%s . Should not happen" % \
                                  (self.labels, self.chunks, perchunk)
                    else:
                        assure_permute = 11 # make 10 attempts
                    if __debug__:
                        debug("DS",
                              "Recalling permute to assure different labels")
                    self.permuteLabels(status, perchunk=perchunk,
                                       assure_permute=assure_permute - 1)


    def randomSamples(self, nperlabel):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of samples is
        randomly chosen from the group of samples sharing a unique label
        value (total number of selected samples:
        nperlabel x len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the number of
        samples that shall be selected from the samples with the corresponding
        label.

        The method returns a Dataset object containing the selected
        samples.
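
        Example: a sketch, assuming a dataset ``ds``::

            ds2 = ds.randomSamples(10)     # 10 random samples per label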
1530 """
        # if an integer is given take this value for all classes
        if isinstance(nperlabel, int):
            nperlabel = [nperlabel for i in self.uniquelabels]

        sample = []
        # for each available class
        labels = self.labels
        for i, r in enumerate(self.uniquelabels):
            # get a random selection of pattern ids for this class
            sample += random.sample((labels == r).nonzero()[0],
                                    nperlabel[i])

        return self.selectSamples(sample)


1554 """Currently available number of patterns.
1555 """
1556 return self._data['samples'].shape[0]
1557
1558
1560 """Number of features per pattern.
1561 """
1562 return self._data['samples'].shape[1]
1563
1564
1566 """Stored labels map (if any)
1567 """
1568 return self._dsattr.get('labels_map', None)
1569
1570
1572 """Set labels map.
1573
1574 Checks for the validity of the mapping -- values should cover
1575 all existing labels in the dataset
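
        Example: a sketch, assuming a dataset ``ds`` with numeric labels
        0 and 1::

            ds.labels_map = {'rest': 0, 'task': 1}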
1576 """
        values = set(lm.values())
        labels = set(self.uniquelabels)
        if not values.issuperset(labels):
            raise ValueError, \
                  "Provided mapping %s has some existing labels (out of %s) " \
                  "missing from mapping" % (list(values), list(labels))
        self._dsattr['labels_map'] = lm

1587 """Set the data type of the samples array.
1588 """
1589
1590 _data = self._data
1591
1592 if _data['samples'].dtype != dtype:
1593 _data['samples'] = _data['samples'].astype(dtype)
1594
1595
1597 """Assign `definition` to featuregroups
1598
1599 XXX Feature-groups was not finished to be useful
1600 """
1601 if not len(definition) == self.nfeatures:
1602 raise ValueError, \
1603 "Length of feature group definition %i " \
1604 "does not match the number of features %i " \
1605 % (len(definition), self.nfeatures)
1606
1607 self._dsattr['featuregroups'] = N.array(definition)


    def convertFeatureIds2FeatureMask(self, ids):
        """Returns a boolean mask with all features in `ids` selected.

        :Parameters:
          ids : list or 1d array
            To be selected features ids.

        :Returns:
          ndarray : dtype='bool'
            All selected features are set to True; False otherwise.
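
        Example: a sketch, assuming a dataset ``ds``::

            mask = ds.convertFeatureIds2FeatureMask([0, 2])
            ids = ds.convertFeatureMask2FeatureIds(mask)   # back to [0, 2]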
1620 """
        fmask = N.repeat(False, self.nfeatures)
        fmask[ids] = True

        return fmask

1628 """Returns feature ids corresponding to non-zero elements in the mask.
1629
1630 :Parameters:
1631 mask: 1d ndarray
1632 Feature mask.
1633
1634 :Returns:
1635 ndarray: integer
1636 Ids of non-zero (non-False) mask elements.
1637 """
1638 return mask.nonzero()[0]
1639
1640
    @staticmethod
    def _checkCopyConstructorArgs(**kwargs):
        """Common sanity check for Dataset copy constructor calls."""
        # figure out where the samples were provided
        samples = None
        if kwargs.has_key('samples'):
            samples = kwargs['samples']
        if samples is None and kwargs.has_key('data') \
           and kwargs['data'].has_key('samples'):
            # samples were given within the data dictionary
            samples = kwargs['data']['samples']
        if samples is None:
            raise DatasetError, \
                  "`samples` must be provided to copy constructor call."

        if not len(samples.shape) == 2:
            raise DatasetError, \
                  "samples must be in 2D shape in copy constructor call."


    # convenience class properties
    nsamples = property(fget=getNSamples)
    nfeatures = property(fget=getNFeatures)
    labels_map = property(fget=getLabelsMap, fset=setLabelsMap)
1664
1666 """Decorator to easily bind functions to a Dataset class
1667 """
    if __debug__:
        debug("DS_", "Binding function %s to Dataset class" % func.func_name)

    # bind the function to the Dataset class
    setattr(Dataset, func.func_name, func)

    # return the original function untouched
    return func

# register the basic sample attributes with convenience abbreviations
Dataset._registerAttribute("samples", "_data", abbr='S', hasunique=False)
Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
Dataset._registerAttribute("chunks", "_data", abbr='C', hasunique=True)

# samples' origids
Dataset._registerAttribute("origids", "_data", abbr='I', hasunique=False)


from mvpa.misc.state import ClassWithCollections, Collection
from mvpa.misc.attributes import SampleAttribute, FeatureAttribute, \
     DatasetAttribute


class _Dataset(ClassWithCollections):
    """The successor of Dataset.
    """
    # placeholders for the attribute collections
    sa = None
    fa = None
    dsa = None

    # the sample data container
    samples = None

    def __init__(self, samples, sa=None, fa=None, dsa=None):
        """
        This is the generic internal constructor. Its main task is to allow
        for a maximum level of customization during dataset construction,
        including fast copy construction.

        Parameters
        ----------
        samples : ndarray
          Data samples.
        sa : Collection
          Samples attributes collection.
        fa : Collection
          Features attributes collection.
        dsa : Collection
          Dataset attributes collection.
        """
        ClassWithCollections.__init__(self)

        # bind the samples
        self.samples = samples

        # copy attributes from the source collections (if any were given)
        # into the corresponding target collections of this instance
        for scol, tcol in ((sa, self.sa),
                           (fa, self.fa),
                           (dsa, self.dsa)):
            # make sure there is a target collection to populate
            if tcol is None:
                tcol = Collection(owner=self)

            # transfer the attributes
            if not scol is None:
                for name, attr in scol.items.iteritems():
                    tcol.add(copy.copy(attr))


    @classmethod
    def from_basic(klass, samples, labels=None, chunks=None):
        """Create a dataset from a samples matrix and basic sample attributes.

        Parameters
        ----------
        samples : ndarray
          The two-dimensional samples matrix.
        labels : ndarray
        chunks : ndarray

        Returns
        -------
        A new dataset instance (of class `klass`) with `labels` and
        `chunks` wrapped into its samples attributes collection.

        Examples
        --------
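        A minimal sketch (array shapes and values are illustrative)::

          samples = N.random.randn(3, 2)
          ds = _Dataset.from_basic(samples,
                                   labels=N.array([0, 1, 1]),
                                   chunks=N.array([0, 0, 1]))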
1814 """
        # compile the samples attributes collection
        labels_ = SampleAttribute(name='labels')
        labels_.value = labels
        chunks_ = SampleAttribute(name='chunks')
        chunks_.value = chunks

        sa = Collection(items={'labels': labels_, 'chunks': chunks_})

        # feature and dataset attributes are not needed for this mode
        return klass(samples, sa=sa)


    def get_nsamples(self):
        """Currently available number of patterns.
        """
        return self.samples.shape[0]


    def get_nfeatures(self):
        """Number of features per pattern.
        """
        return self.samples.shape[1]