mvpa.datasets.meta

25 """Dataset container 26 27 The class is useful to combine several Datasets with different origin and 28 type and bind them together. Such a combined dataset can then by used to 29 e.g. pass it to a classifier. 30 31 MetaDataset does not permanently duplicate data stored in the dataset it 32 contains. The combined samples matrix is build on demand and samples 33 attribute access is redirected to the first dataset in the container. 34 35 Currently operations other than samples or feature selection are not fully 36 supported, e.g. passing a MetaDataset to detrend() will initially result in 37 a detrended MetaDataset, but the combined and detrended samples matrix will 38 be lost after the next call to selectSamples() or selectFeatures(), which 39 freshly pulls samples from all datasets in the container. """ 40 41 # This class is intentionally _not_ implemented as a subclass of Dataset. 42 # IMHO Dataset contains to much logic unecessary logic. 43 # XXX implement MappedMetaDataset along with a MetaMapper that simply calls 44 # the mappers in the datasets in the container; or maybe just add flag to 45 # MetaDataset to behave like a MappedDataset

46 - def __init__(self, datasets):

47 """Initialize dataset instance 48 49 :Parameters: 50 datasets : list 51 """ 52 # XXX Maybe add checks that all datasets have identical samples 53 # attributes 54 self.__datasets = datasets 55 56 # contains the combine samples matrix for caching 57 self.__samples = None

58 59

60 - def rebuildSamples(self):

61 """Update the combined samples matrix from all underlying datasets. 62 """ 63 # note, that hstack will do a copy of _all_ data 64 self.__samples = N.hstack([ds.samples for ds in self.__datasets])

65 66

67 - def __getattr__(self, name):

68 """Implemented to redirect access to underlying datasets. 69 """ 70 if name == 'samples': 71 # do something to combine (and cache) samples arrays 72 if self.__samples is None: 73 self.rebuildSamples() 74 return self.__samples 75 76 else: 77 # redirect all other to first dataset 78 # ??? maybe limit to some specific supported ones 79 return self.__datasets[0].__getattribute__(name)

80 81

82 - def selectFeatures(self, ids, sort=True):

83 """Do feature selection on all underlying datasets at once. 84 """ 85 # determine which features belong to what dataset 86 # and call its selectFeatures() accordingly 87 ids = N.asanyarray(ids) 88 result = [] 89 fsum = 0 90 for ds in self.__datasets: 91 # bool which meta feature ids belongs to this dataset 92 selector = N.logical_and(ids < fsum + ds.nfeatures, ids >= fsum) 93 # make feature ids relative to this dataset 94 selected = ids[selector] - fsum 95 # do feature selection on underlying dataset 96 # XXX not sure if we should keep empty datasets? (probably) 97 result.append(ds.selectFeatures(selected)) 98 fsum += ds.nfeatures 99 100 return MetaDataset(result)

101 102

103 - def applyMapper(self, *args, **kwargs):

104 """Apply a mapper on all underlying datasets. 105 """ 106 return MetaDataset([ds.applyMapper(*args, **kwargs) \ 107 for ds in self.__datasets])

108 109

110 - def selectSamples(self, *args, **kwargs):

111 """Select samples from all underlying datasets at once. 112 """ 113 return MetaDataset([ds.selectSamples(*args, **kwargs) \ 114 for ds in self.__datasets])

115 116

117 - def permuteLabels(self, *args, **kwargs):

118 """Toggle label permutation. 119 """ 120 # permute on first 121 self.__datasets[0].permuteLabels(*args, **kwargs) 122 123 # and apply to all others 124 for ds in self.__datasets[1:]: 125 ds.samples[:] = self.__datasets[0].samples

126 127

128 - def getRandomSamples(self, nperlabel):

129 """Return a MetaDataset with a random subset of samples. 130 """ 131 # if interger is given take this value for all classes 132 if isinstance(nperlabel, int): 133 nperlabel = [ nperlabel for i in self.__datasets[0].uniquelabels ] 134 135 sample = [] 136 # for each available class 137 for i, r in enumerate(self.__datasets[0].uniquelabels): 138 # get the list of pattern ids for this class 139 sample += \ 140 random.sample((self.__datasets[0].labels == r).nonzero()[0], 141 nperlabel[i] ) 142 143 return MetaDataset([ds.selectSamples(sample) \ 144 for ds in self.__datasets])

145 146

147 - def getNSamples( self ):

148 """Currently available number of samples. 149 """ 150 return self.__datasets[0].nsamples

151 152

153 - def getNFeatures( self ):

154 """Number of features per sample. 155 """ 156 return N.sum([ds.nfeatures for ds in self.__datasets])

157 158

159 - def setSamplesDType(self, dtype):

160 """Set the data type of the samples array. 161 """ 162 # reset samples 163 self.__samples = None 164 165 for ds in self.__datasets: 166 if ds.samples.dtype != dtype: 167 ds.samples = ds.samples.astype(dtype)

168 169

170 - def mapReverse(self, val):

171 """Perform reverse mapping 172 173 :Return: 174 List of results per each used mapper and the corresponding part of 175 the provided `val`. 176 """ 177 # assure array and transpose for easy slicing 178 # i.e. transpose of 1D does nothing, but of 2D puts features 179 # along first dimension 180 val = N.asanyarray(val).T 181 182 # do we have multiple or just one 183 mflag = len(val.shape) > 1 184 185 result = [] 186 fsum = 0 187 for ds in self.__datasets: 188 # calculate upper border 189 fsum_new = fsum + ds.nfeatures 190 191 # now map back if mapper is present, otherwise just store 192 # need to pass transposed!! 193 if isinstance(ds, MappedDataset): 194 result.append(ds.mapReverse(val[fsum:fsum_new].T)) 195 else: 196 result.append(val[fsum:fsum_new].T) 197 198 fsum = fsum_new 199 200 return result

201 202 203 # read-only class properties 204 nsamples = property(fget=getNSamples) 205 nfeatures = property(fget=getNFeatures) 206 datasets = property(fget=lambda self: self.__datasets)

Source Code for Module mvpa.datasets.meta