Package mvpa :: Package misc :: Package io :: Module base
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.misc.io.base

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Some little helper for reading (and writing) common formats from and to 
 10  disk.""" 
 11   
 12  __docformat__ = 'restructuredtext' 
 13   
 14  import numpy as N 
 15  import mvpa.support.copy as copy 
 16  from mvpa.base.dochelpers import enhancedDocString 
 17  from re import sub as re_sub 
 18  from mvpa.base import warning 
 19   
 20  from mvpa.misc.support import Event 
 21   
 22  if __debug__: 
 23      from mvpa.base import debug 
 24   
 25   
class DataReader(object):
    """Base class for data readers.

    Subclasses are expected to fill two attributes:

    `self._data`: ndarray
      The data array, with the samples-separating dimension along the
      first axis.
    `self._props`: dict
      Any other meaningful information about the data.

    Two accessor methods (and matching read-only properties) expose this
    information to callers.
    """
    def __init__(self):
        """Initialize with an empty property dict and no data."""
        self._props = {}
        self._data = None

    def getPropsAsDict(self):
        """Return the dictionary holding the data properties."""
        return self._props

    def getData(self):
        """Return the data array (or None if nothing was read yet)."""
        return self._data

    data = property(getData, doc="Data array")
    props = property(getPropsAsDict, doc="Property dict")
61 62 63
class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """
    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        :Parameters:
          source : basestring or dict
            If given as a string, all data is read from that file and
            additional keyword arguments can be used to customize the
            read procedure. If a dictionary is passed, its items are
            copied into the new instance (note: values are assigned by
            reference, not deep-copied).
          header : bool or list of basestring
            Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique
            column names will be generated (see class docs). If
            `header` is a python list, its content is used as column
            header names and its length has to match the number of
            columns in the file.
          sep : basestring or None
            Separator string. The actual meaning depends on the output
            format (see class docs).
          headersep : basestring or None
            Separator string used in the header. The actual meaning
            depends on the output format (see class docs).
          dtype : type or list(types)
            Desired datatype(s). The datatype per column can be specified
            by passing a list of types.
          skiplines : int
            Number of lines to skip at the beginning of the file.
        """
        # init base class
        dict.__init__(self)

        # initialize with default
        self._header_order = None

        if isinstance(source, str):
            self._fromFile(source, header=header, sep=sep, headersep=headersep,
                           dtype=dtype, skiplines=skiplines)

        elif isinstance(source, dict):
            # copy items by reference -- no deepcopy is performed
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unkown source for ColumnData [%s]' \
                  % `type(source)`

        # generate missing properties for each item in the header:
        # every column key becomes a read-only class attribute (with
        # spaces/brackets/quotes sanitized out of the name)
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._getAttrib('%s')" % (k)
                # Sanitize the key, substitute ' []' with '_'
                k_ = re_sub('[[\] ]', '_', k)
                # replace multiple _s
                k_ = re_sub('__+', '_', k_)
                # remove quotes
                k_ = re_sub('["\']', '', k_)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData key %s"
                          % (k_, k))
                # make sure to import class directly into local namespace
                # otherwise following does not work for classes defined
                # elsewhere
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                # NOTE(review): this mutates the CLASS, so the property is
                # shared by all instances of the same (sub)class -- keys
                # present in one instance leak into siblings; confirm this
                # is intended before relying on it
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k_, getter)
                # TODO!!! Check if it is safe actually here to rely on value of
                #         k in lambda. May be it is treated as continuation and
                #         some local space would override it????
                #setattr(self.__class__,
                #        k,
                #        property(fget=lambda x: x._getAttrib("%s" % k)))
                # it seems to be error-prone due to continuation...


    __doc__ = enhancedDocString('ColumnData', locals())


    def _getAttrib(self, key):
        """Return corresponding value if given key is known to current instance

        Is used for automatically added properties to the class.

        :Raises:
          ValueError:
            If `key` is not known to given instance

        :Returns:
          Value if `key` is known
        """
        if self.has_key(key):
            return self[key]
        else:
            raise ValueError, "Instance %s has no data about %s" \
                  % (`self`, `key`)


    def __str__(self):
        # short human-readable summary: class name, dimensions, column names
        s = self.__class__.__name__
        if len(self.keys())>0:
            s += " %d rows, %d columns [" % \
                 (self.getNRows(), self.getNColumns())
            s += reduce(lambda x, y: x+" %s" % y, self.keys())
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.

        All columns must have the same number of entries.
        """
        length = None
        for k in self.keys():
            if length == None:
                length = len(self[k])
            else:
                if not len(self[k]) == length:
                    raise ValueError, "Data integrity lost. Columns do not " \
                                      "have equal length."


    def _fromFile(self, filename, header, sep, headersep,
                  dtype, skiplines):
        """Loads column data from file -- clears object first.

        See the constructor docstring for the meaning of the arguments.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None

        [ file_.readline() for x in range(skiplines) ]
        """Simply skip some lines"""
        # make column names, either take header or generate
        if header == True:
            # read first line and split by 'sep'
            hdr = file_.readline().split(headersep)
            # remove bogus empty header titles
            hdr = filter(lambda x:len(x.strip()), hdr)
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            # no header in the file: generate '0', '1', ... keys from the
            # column count of the first line
            hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
            # reset file to not miss the first line
            file_.seek(0)
            [ file_.readline() for x in range(skiplines) ]


        # string in lists: one per column
        tbl = [ [] for i in xrange(len(hdr)) ]

        # do per column dtypes
        if not isinstance(dtype, list):
            dtype = [dtype] * len(hdr)

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                # dtype None means: keep the raw string
                if not dtype[i] is None:
                    try:
                        v = dtype[i](v)
                    except ValueError:
                        # best-effort conversion: warn and keep the string
                        warning("Can't convert %s to desired datatype %s." %
                                (`v`, `dtype`) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]


    def __iadd__(self, other):
        """Merge column data.

        `other` must provide a list for every column key already present
        in this instance; the lists are appended column-wise.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%s].' % `k`
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%s].' \
                                  % `type(v)`
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self


    def selectSamples(self, selection):
        """Return new ColumnData with selected samples

        `selection` is a sequence of row indices; rows are picked in the
        given order (duplicates allowed).
        """
        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data


    def getNColumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())


    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        :Parameters:
          filename : basestring
            Target filename
          header : bool
            If `True` a column header is written, using the column
            keys. If `False` no header is written.
          header_order : None or list of basestring
            If it is a list of strings, they will be used instead
            of simply asking for the dictionary keys. However
            these strings must match the dictionary keys in number
            and identity. This argument type can be used to
            determine the order of the columns in the output file.
            The default value is `None`. In this case the columns
            will be in an arbitrary order.
          sep : basestring
            String that is written as a separator between two data columns.
        """
        # XXX do the try: except: dance
        file_ = open(filename, 'w')

        # write header
        if header_order == None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use stored order + newly added keys at the last columns
                col_hdr = self._header_order + \
                          list(set(self.keys()).difference(
                                                set(self._header_order)))
        else:
            if not len(header_order) == self.getNColumns():
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%s]' % `k`
            col_hdr = header_order

        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.getNRows()):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()


    def getNRows(self):
        """Returns the number of rows.
        """
        # no data no rows (after Bob Marley)
        if not len(self.keys()):
            return 0
        # otherwise first key is as good as any other
        else:
            return len(self[self.keys()[0]])

    ncolumns = property(fget=getNColumns)
    nrows = property(fget=getNRows)
373 374 375
class SampleAttributes(ColumnData):
    """Read and write PyMVPA sample attribute definitions from and to text
    files.
    """
    def __init__(self, source, literallabels=False, header=None):
        """Read PyMVPA sample attributes from disk.

        :Parameters:
          source: basestring
            Filename of an attribute file
          literallabels: bool
            If True, labels are given as literal strings (column dtypes
            become [str, float] instead of all-float)
          header: None or bool or list of str
            If None, ['labels', 'chunks'] is assumed. Otherwise the same
            behavior as of `ColumnData`
        """
        if literallabels:
            dtypes = [str, float]
        else:
            dtypes = float

        if header is None:
            header = ['labels', 'chunks']
        ColumnData.__init__(self, source,
                            header=header,
                            sep=None, dtype=dtypes)


    def tofile(self, filename):
        """Write sample attributes to a text file.

        Columns are written without a header, in 'labels chunks' order.
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=['labels', 'chunks'],
                          sep=' ')


    def getNSamples(self):
        """Returns the number of samples in the file.
        """
        return self.getNRows()


    def toEvents(self, **kwargs):
        """Convert into a list of `Event` instances.

        Each change in the label or chunks value is taken as a new event onset.
        The length of an event is determined by the number of identical
        consecutive label-chunk combinations. Since the attributes list has no
        sense of absolute timing, both `onset` and `duration` are determined and
        stored in #samples units.

        :Parameters:
          kwargs
            Any keyword argument provided would be replicated, through all
            the entries.
        """
        events = []
        prev_onset = 0
        old_comb = None
        duration = 1
        # over all samples
        for r in xrange(self.nrows):
            # the label-chunk combination
            comb = (self.labels[r], self.chunks[r])

            # check if things changed
            if not comb == old_comb:
                # did we ever had an event
                if not old_comb is None:
                    # close the previous run and record it as one event
                    events.append(
                        Event(onset=prev_onset, duration=duration,
                              label=old_comb[0], chunk=old_comb[1], **kwargs))
                # reset duration for next event
                duration = 1
                # store the current samples as onset for the next event
                prev_onset = r

                # update the reference combination
                old_comb = comb
            else:
                # current event is lasting
                duration += 1

        # push the last event in the pipeline
        # (the loop only emits an event when the NEXT combination differs,
        # so the final run must be flushed here)
        if not old_comb is None:
            events.append(
                Event(onset=prev_onset, duration=duration,
                      label=old_comb[0], chunk=old_comb[1], **kwargs))

        return events


    nsamples = property(fget=getNSamples)
470 471
class SensorLocations(ColumnData):
    """Base class for sensor location readers.

    Subclasses should expose x, y, z coordinates via the `pos_x`, `pos_y`,
    and `pos_z` attributes.

    Axes follow this convention:

      x-axis: left -> right
      y-axis: anterior -> posterior
      z-axis: superior -> inferior
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the ColumnData constructor."""
        ColumnData.__init__(self, *args, **kwargs)


    def locations(self):
        """Return the sensor locations as an array.

        :Returns:
          (nchannels x 3) array with coordinates in (x, y, z)
        """
        coords = (self.pos_x, self.pos_y, self.pos_z)
        return N.array(coords).T
497 498 499
class XAVRSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    The file is expected to have 5 columns:

      1. sensor name
      2. some useless integer
      3. position on x-axis
      4. position on y-axis
      5. position on z-axis
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        col_names = ['names', 'some_number', 'pos_x', 'pos_y', 'pos_z']
        col_types = [str, int, float, float, float]
        SensorLocations.__init__(self, source, header=col_names,
                                 sep=None, dtype=col_types)
521 522
class TuebingenMEGSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    The file is expected to have 7 columns:

      1: sensor name
      2: position on y-axis
      3: position on x-axis
      4: position on z-axis
      5-7: same as 2-4, but for some outer surface thingie.

    Note that x and y seem to be swapped, ie. y as defined by SensorLocations
    conventions seems to be first axis and followed by x.

    Only inner surface coordinates are reported by `locations()`.
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        col_names = ['names', 'pos_y', 'pos_x', 'pos_z',
                     'pos_y2', 'pos_x2', 'pos_z2']
        col_types = [str] + [float] * 6
        SensorLocations.__init__(self, source, header=col_names,
                                 sep=None, dtype=col_types)
550 551
def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert design matrix into a list of labels

    Given a design, assign a single label to any given sample.

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each key will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign for samples where none of the EVs was
        given a value (i.e. none satisfies `func`)
      func : functor
        Function which decides whether a value should be considered

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label

    :Raises:
      ValueError: if more than one EV is active in a single row
    """
    # doing it simple naive way but it should be of better control if
    # we decide to process columndata with non-numeric entries etc
    keys = list(columndata.keys())
    labels = []
    for row in range(columndata.nrows):
        entries = [columndata[key][row] for key in keys]
        # which EVs are active in this row
        # (list comprehension instead of filter(): keeps len() working
        # under Python 3 where filter() returns an iterator)
        selected = [(key, value) for key, value in zip(keys, entries)
                    if func(value)]
        nselected = len(selected)

        if nselected > 1:
            # more than a single active EV -- cannot decide on a label
            raise ValueError(
                "Row #%i with items %s has multiple entries "
                "meeting the criterion. Cannot decide on the label"
                % (row, entries))
        elif nselected == 1:
            label = selected[0][0]
        else:
            label = baseline_label
        labels.append(label)
    return labels
__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }

def labels2chunks(labels, method="alllabels", ignore_labels=None):
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base chunking on
      method : basestring
        codename for method to use. Known methods are listed in
        `__known_chunking_methods` (currently only ``alllabels``).
      ignore_labels : list of basestring
        depends on the method. If method ``alllabels``, then don't
        seek for such labels in chunks. E.g. some 'reject' samples

    :rtype: list

    :Raises:
      ValueError: if `method` is not a known chunking method
    """
    # NOTE: the original applied '%' to the docstring at definition time,
    # which turned it into a discarded expression and left __doc__ empty;
    # it is a plain literal now.
    chunks = []
    if ignore_labels is None:
        ignore_labels = []
    # the set of labels a chunk must contain before a new chunk may start
    alllabels = set(labels).difference(set(ignore_labels))
    if method == 'alllabels':
        seenlabels = set()
        lastlabel = None
        chunk = 0
        for label in labels:
            if label != lastlabel:
                # a new run of identical labels starts here
                if seenlabels == alllabels:
                    # current chunk saw every required label -- open a new one
                    chunk += 1
                    seenlabels = set()
                lastlabel = label
                if label not in ignore_labels:
                    seenlabels.add(label)
            chunks.append(chunk)
        chunks = N.array(chunks)
        # fix up a bit the trailer: if the last chunk is incomplete,
        # merge it into the previous chunk
        if seenlabels != alllabels:
            chunks[chunks == chunk] = chunk - 1
        chunks = list(chunks)
    else:
        errmsg = "Unknown method to derive chunks is requested. Known are:\n"
        # 'known_method' instead of 'method' -- the original shadowed the
        # `method` parameter here
        for known_method, descr in __known_chunking_methods.items():
            errmsg += " %s : %s\n" % (known_method, descr)
        raise ValueError(errmsg)
    return chunks