1
2
3
4
5
6
7
8
9 """Some little helper for reading (and writing) common formats from and to
10 disk."""
11
12 __docformat__ = 'restructuredtext'
13
14 import numpy as N
15 import mvpa.support.copy as copy
16 from mvpa.base.dochelpers import enhancedDocString
17 from re import sub as re_sub
18 from mvpa.base import warning
19
20 from mvpa.misc.support import Event
21
22 if __debug__:
23 from mvpa.base import debug
24
25
27 """Base class for data readers.
28
29 Every subclass has to put all information into to variable:
30
31 `self._data`: ndarray
32 The data array has to have the samples separating dimension along the
33 first axis.
34 `self._props`: dict
35 All other meaningful information has to be stored in a dictionary.
36
37 This class provides two methods (and associated properties) to retrieve
38 this information.
39 """
41 """Cheap init.
42 """
43 self._props = {}
44 self._data = None
45
46
48 """Return the dictionary with the data properties.
49 """
50 return self._props
51
52
54 """Return the data array.
55 """
56 return self._data
57
58
59 data = property(fget=getData, doc="Data array")
60 props = property(fget=getPropsAsDict, doc="Property dict")
61
62
63
65 """Read data that is stored in columns of text files.
66
67 All read data is available via a dictionary-like interface. If
68 column headers are available, the column names serve as dictionary keys.
69 If no header exists an artificial key is generated: str(number_of_column).
70
71 Splitting of text file lines is performed by the standard split() function
72 (which gets passed the `sep` argument as separator string) and each
73 element is converted into the desired datatype.
74
75 Because data is read into a dictionary no two columns can have the same
76 name in the header! Each column is stored as a list in the dictionary.
77 """
78 - def __init__(self, source, header=True, sep=None, headersep=None,
79 dtype=float, skiplines=0):
80 """Read data from file into a dictionary.
81
82 :Parameters:
83 source : basestring or dict
84 If values is given as a string all data is read from the
85 file and additonal keyword arguments can be sued to
86 customize the read procedure. If a dictionary is passed
87 a deepcopy is performed.
88 header : bool or list of basestring
89 Indicates whether the column names should be read from the
90 first line (`header=True`). If `header=False` unique
91 column names will be generated (see class docs). If
92 `header` is a python list, it's content is used as column
93 header names and its length has to match the number of
94 columns in the file.
95 sep : basestring or None
96 Separator string. The actual meaning depends on the output
97 format (see class docs).
98 headersep : basestring or None
99 Separator string used in the header. The actual meaning
100 depends on the output format (see class docs).
101 dtype : type or list(types)
102 Desired datatype(s). Datatype per column get be specified by
103 passing a list of types.
104 skiplines : int
105 Number of lines to skip at the beginning of the file.
106 """
107
108 dict.__init__(self)
109
110
111 self._header_order = None
112
113 if isinstance(source, str):
114 self._fromFile(source, header=header, sep=sep, headersep=headersep,
115 dtype=dtype, skiplines=skiplines)
116
117 elif isinstance(source, dict):
118 for k, v in source.iteritems():
119 self[k] = v
120
121 self._check()
122
123 else:
124 raise ValueError, 'Unkown source for ColumnData [%s]' \
125 % `type(source)`
126
127
128 classdict = self.__class__.__dict__
129 for k in self.keys():
130 if not classdict.has_key(k):
131 getter = "lambda self: self._getAttrib('%s')" % (k)
132
133 k_ = re_sub('[[\] ]', '_', k)
134
135 k_ = re_sub('__+', '_', k_)
136
137 k_ = re_sub('["\']', '', k_)
138 if __debug__:
139 debug("IOH", "Registering property %s for ColumnData key %s"
140 % (k_, k))
141
142
143
144 exec 'from %s import %s' % (self.__module__,
145 self.__class__.__name__)
146 exec "%s.%s = property(fget=%s)" % \
147 (self.__class__.__name__, k_, getter)
148
149
150
151
152
153
154
155
156
157 __doc__ = enhancedDocString('ColumnData', locals())
158
159
161 """Return corresponding value if given key is known to current instance
162
163 Is used for automatically added properties to the class.
164
165 :Raises:
166 ValueError:
167 If `key` is not known to given instance
168
169 :Returns:
170 Value if `key` is known
171 """
172 if self.has_key(key):
173 return self[key]
174 else:
175 raise ValueError, "Instance %s has no data about %s" \
176 % (`self`, `key`)
177
178
180 s = self.__class__.__name__
181 if len(self.keys())>0:
182 s += " %d rows, %d columns [" % \
183 (self.getNRows(), self.getNColumns())
184 s += reduce(lambda x, y: x+" %s" % y, self.keys())
185 s += "]"
186 return s
187
189 """Performs some checks for data integrity.
190 """
191 length = None
192 for k in self.keys():
193 if length == None:
194 length = len(self[k])
195 else:
196 if not len(self[k]) == length:
197 raise ValueError, "Data integrity lost. Columns do not " \
198 "have equal length."
199
200
201 - def _fromFile(self, filename, header, sep, headersep,
202 dtype, skiplines):
203 """Loads column data from file -- clears object first.
204 """
205
206 self.clear()
207
208 file_ = open(filename, 'r')
209
210 self._header_order = None
211
212 [ file_.readline() for x in range(skiplines) ]
213 """Simply skip some lines"""
214
215 if header == True:
216
217 hdr = file_.readline().split(headersep)
218
219 hdr = filter(lambda x:len(x.strip()), hdr)
220 self._header_order = hdr
221 elif isinstance(header, list):
222 hdr = header
223 else:
224 hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
225
226 file_.seek(0)
227 [ file_.readline() for x in range(skiplines) ]
228
229
230
231 tbl = [ [] for i in xrange(len(hdr)) ]
232
233
234 if not isinstance(dtype, list):
235 dtype = [dtype] * len(hdr)
236
237
238 for line in file_:
239
240 line = line.strip()
241
242 if not line or line.startswith('#'):
243 continue
244 l = line.split(sep)
245
246 if not len(l) == len(hdr):
247 raise RuntimeError, \
248 "Number of entries in line [%i] does not match number " \
249 "of columns in header [%i]." % (len(l), len(hdr))
250
251 for i, v in enumerate(l):
252 if not dtype[i] is None:
253 try:
254 v = dtype[i](v)
255 except ValueError:
256 warning("Can't convert %s to desired datatype %s." %
257 (`v`, `dtype`) + " Leaving original type")
258 tbl[i].append(v)
259
260
261 if not len(tbl) == len(hdr):
262 raise RuntimeError, "Number of columns read from file does not " \
263 "match the number of header entries."
264
265
266 for i, v in enumerate(hdr):
267 self[v] = tbl[i]
268
269
271 """Merge column data.
272 """
273
274 for k, v in other.iteritems():
275 if not self.has_key(k):
276 raise ValueError, 'Unknown key [%s].' % `k`
277 if not isinstance(v, list):
278 raise ValueError, 'Can only merge list data, but got [%s].' \
279 % `type(v)`
280
281
282 self[k] += v
283
284
285 self._check()
286
287 return self
288
289
291 """Return new ColumnData with selected samples"""
292
293 data = copy.deepcopy(self)
294 for k, v in data.iteritems():
295 data[k] = [v[x] for x in selection]
296
297 data._check()
298 return data
299
300
302 """Returns the number of columns.
303 """
304 return len(self.keys())
305
306
307 - def tofile(self, filename, header=True, header_order=None, sep=' '):
308 """Write column data to a text file.
309
310 :Parameters:
311 filename : basestring
312 Target filename
313 header : bool
314 If `True` a column header is written, using the column
315 keys. If `False` no header is written.
316 header_order : None or list of basestring
317 If it is a list of strings, they will be used instead
318 of simply asking for the dictionary keys. However
319 these strings must match the dictionary keys in number
320 and identity. This argument type can be used to
321 determine the order of the columns in the output file.
322 The default value is `None`. In this case the columns
323 will be in an arbitrary order.
324 sep : basestring
325 String that is written as a separator between to data columns.
326 """
327
328 file_ = open(filename, 'w')
329
330
331 if header_order == None:
332 if self._header_order is None:
333 col_hdr = self.keys()
334 else:
335
336 col_hdr = self._header_order + \
337 list(set(self.keys()).difference(
338 set(self._header_order)))
339 else:
340 if not len(header_order) == self.getNColumns():
341 raise ValueError, 'Header list does not match number of ' \
342 'columns.'
343 for k in header_order:
344 if not self.has_key(k):
345 raise ValueError, 'Unknown key [%s]' % `k`
346 col_hdr = header_order
347
348 if header == True:
349 file_.write(sep.join(col_hdr) + '\n')
350
351
352 for r in xrange(self.getNRows()):
353
354 l = [str(self[k][r]) for k in col_hdr]
355
356 file_.write(sep.join(l) + '\n')
357
358 file_.close()
359
360
362 """Returns the number of rows.
363 """
364
365 if not len(self.keys()):
366 return 0
367
368 else:
369 return len(self[self.keys()[0]])
370
371 ncolumns = property(fget=getNColumns)
372 nrows = property(fget=getNRows)
373
374
375
377 """Read and write PyMVPA sample attribute definitions from and to text
378 files.
379 """
380 - def __init__(self, source, literallabels=False, header=None):
381 """Read PyMVPA sample attributes from disk.
382
383 :Parameters:
384 source: basestring
385 Filename of an atrribute file
386 literallabels: bool
387 Either labels are given as literal strings
388 header: None or bool or list of str
389 If None, ['labels', 'chunks'] is assumed. Otherwise the same
390 behavior as of `ColumnData`
391 """
392 if literallabels:
393 dtypes = [str, float]
394 else:
395 dtypes = float
396
397 if header is None:
398 header = ['labels', 'chunks']
399 ColumnData.__init__(self, source,
400 header=header,
401 sep=None, dtype=dtypes)
402
403
405 """Write sample attributes to a text file.
406 """
407 ColumnData.tofile(self, filename,
408 header=False,
409 header_order=['labels', 'chunks'],
410 sep=' ')
411
412
414 """Returns the number of samples in the file.
415 """
416 return self.getNRows()
417
418
420 """Convert into a list of `Event` instances.
421
422 Each change in the label or chunks value is taken as a new event onset.
423 The length of an event is determined by the number of identical
424 consecutive label-chunk combinations. Since the attributes list has no
425 sense of absolute timing, both `onset` and `duration` are determined and
426 stored in #samples units.
427
428 :Parameters:
429 kwargs
430 Any keyword arugment provided would be replicated, through all
431 the entries.
432 """
433 events = []
434 prev_onset = 0
435 old_comb = None
436 duration = 1
437
438 for r in xrange(self.nrows):
439
440 comb = (self.labels[r], self.chunks[r])
441
442
443 if not comb == old_comb:
444
445 if not old_comb is None:
446 events.append(
447 Event(onset=prev_onset, duration=duration,
448 label=old_comb[0], chunk=old_comb[1], **kwargs))
449
450 duration = 1
451
452 prev_onset = r
453
454
455 old_comb = comb
456 else:
457
458 duration += 1
459
460
461 if not old_comb is None:
462 events.append(
463 Event(onset=prev_onset, duration=duration,
464 label=old_comb[0], chunk=old_comb[1], **kwargs))
465
466 return events
467
468
469 nsamples = property(fget=getNSamples)
470
471
473 """Base class for sensor location readers.
474
475 Each subclass should provide x, y, z coordinates via the `pos_x`, `pos_y`,
475 and `pos_z` attributes.
477
478 Axes should follow the following convention:
479
480 x-axis: left -> right
481 y-axis: anterior -> posterior
482 z-axis: superior -> inferior
483 """
488
489
491 """Get the sensor locations as an array.
492
493 :Returns:
494 (nchannels x 3) array with coordinates in (x, y, z)
495 """
496 return N.array((self.pos_x, self.pos_y, self.pos_z)).T
497
498
499
501 """Read sensor location definitions from a specific text file format.
502
503 File layout is assumed to be 5 columns:
504
505 1. sensor name
506 2. some useless integer
507 3. position on x-axis
508 4. position on y-axis
509 5. position on z-axis
510 """
512 """Read sensor locations from file.
513
514 :Parameter:
515 source : filename of an attribute file
516 """
517 SensorLocations.__init__(
518 self, source,
519 header=['names', 'some_number', 'pos_x', 'pos_y', 'pos_z'],
520 sep=None, dtype=[str, int, float, float, float])
521
522
524 """Read sensor location definitions from a specific text file format.
525
526 File layout is assumed to be 7 columns:
527
528 1: sensor name
529 2: position on y-axis
530 3: position on x-axis
531 4: position on z-axis
532 5-7: same as 2-4, but for some outer surface thingie.
533
534 Note that x and y seem to be swapped, ie. y as defined by SensorLocations
535 conventions seems to be first axis and followed by x.
536
537 Only inner surface coordinates are reported by `locations()`.
538 """
540 """Read sensor locations from file.
541
542 :Parameter:
543 source : filename of an attribute file
544 """
545 SensorLocations.__init__(
546 self, source,
547 header=['names', 'pos_y', 'pos_x', 'pos_z',
548 'pos_y2', 'pos_x2', 'pos_z2'],
549 sep=None, dtype=[str, float, float, float, float, float, float])
550
551
def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert design matrix into a list of labels

    Given a design, assign a single label to any given sample

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each known will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign for samples where none of EVs was given a value
      func : functor
        Function which decides either a value should be considered

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label

    :Raises:
      ValueError
        If a sample meets the criterion for more than one EV.
    """
    keys = columndata.keys()
    labels = []
    for row in range(columndata.nrows):
        entries = [columndata[key][row] for key in keys]
        # which EVs meet the criterion for this sample; a list
        # comprehension (unlike filter()) also has a len() on python 3
        selected = [(key, value) for key, value in zip(keys, entries)
                    if func(value)]
        nselected = len(selected)

        if nselected > 1:
            # ambiguous -- refuse to guess
            raise ValueError(
                "Row #%i with items %s has multiple entries "
                "meeting the criterion. Cannot decide on the label"
                % (row, entries))
        elif nselected == 1:
            label = selected[0][0]
        else:
            label = baseline_label
        labels.append(label)
    return labels
595
596
__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }

def labels2chunks(labels, method="alllabels", ignore_labels=None):
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base chunking on
      method : basestring
        codename for method to use. Known methods are listed in
        `__known_chunking_methods`.
      ignore_labels : list of basestring
        depends on the method. If method ``alllabels``, then don't
        seek for such labels in chunks. E.g. some 'reject' samples

    :rtype: list
    """
    # NOTE: formerly the doc was a "%"-formatted string expression, which
    # left __doc__ empty and was re-evaluated on every call
    chunks = []
    if ignore_labels is None:
        ignore_labels = []
    alllabels = set(labels).difference(set(ignore_labels))
    if method == 'alllabels':
        seenlabels = set()
        lastlabel = None
        chunk = 0
        for label in labels:
            if label != lastlabel:
                # all labels were seen -- open a new chunk
                if seenlabels == alllabels:
                    chunk += 1
                    seenlabels = set()
                lastlabel = label
            if not label in ignore_labels:
                seenlabels.add(label)
            chunks.append(chunk)
        chunks = N.array(chunks)
        # merge an incomplete trailing chunk into the previous one
        if seenlabels != alllabels:
            chunks[chunks == chunk] = chunk - 1
        chunks = list(chunks)
    else:
        errmsg = "Unknown method to derive chunks is requested. Known are:\n"
        # do not shadow the `method` argument with the loop variable
        for known, descr in __known_chunking_methods.items():
            errmsg += "  %s : %s\n" % (known, descr)
        raise ValueError(errmsg)
    return chunks
644