Package mvpa :: Package misc :: Module stats
[hide private]
[frames | no frames]

Source Code for Module mvpa.misc.stats

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Little statistics helper""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  from mvpa.base import externals 
 14   
 15  if externals.exists('scipy', raiseException=True): 
 16      import scipy.stats as stats 
 17   
 18  import numpy as N 
 19  import copy 
 20   
def chisquare(obs, exp=None):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    If no expected frequencies are supplied, the total N is assumed to be
    equally distributed across all cells.

    :Parameters:
      obs : array-like
        Observed frequencies (contingency table of any dimensionality).
      exp : array-like or None
        Expected frequency for each cell of `obs`. If None, the sum of
        `obs` is spread evenly over all cells.

    :Returns:
      Tuple of the chisquare statistic and the associated p-value (upper
      tail), computed with `number of cells - 1` degrees of freedom.
    """
    obs = N.array(obs)

    # get total number of observations
    nobs = N.sum(obs)

    # if no expected values are supplied assume equal distribution
    # (identity test: `exp == None` would be evaluated element-wise for
    # an array and cannot be used as a truth value)
    if exp is None:
        exp = N.ones(obs.shape) * nobs / N.prod(obs.shape)

    # make sure to have floating point data; asarray also accepts plain
    # sequences, which have no astype() method
    exp = N.asarray(exp, dtype=float)

    # compute chisquare value
    chisq = N.sum((obs - exp) ** 2 / exp)

    # return chisq and probability (upper tail); the survival function of
    # the chi-square distribution is what stats.chisqprob used to compute
    # before it was removed from scipy
    return chisq, stats.chi2.sf(chisq, N.prod(obs.shape) - 1)
47 48
class DSMatrix(object):
    """DSMatrix allows for the creation of dissimilarity matrices using
    arbitrary distance metrics.

    The full (square) matrix is computed once in the constructor; the
    upper-triangle and vector representations are derived lazily from it
    and cached.
    """

    # names of the supported distance metrics (each one is a string)
    _KNOWN_METRICS = ('euclidean', 'spearman', 'pearson', 'confusion')

    def __init__(self, data_vectors, metric='spearman'):
        """Initialize DSMatrix

        :Parameters:
          data_vectors : ndarray
             m x n collection of vectors, where m is the number of exemplars
             and n is the number of features per exemplar
          metric : string
             Distance metric to use (e.g., 'euclidean', 'spearman', 'pearson',
             'confusion')

        :Raises:
          ValueError
             If `metric` is not one of the supported names, or if a
             correlation metric ('spearman', 'pearson') is requested for
             exemplars that consist of a single feature.
        """
        # an unknown metric used to yield a silent all-zero matrix
        if metric not in self._KNOWN_METRICS:
            raise ValueError("Unknown metric %r; expected one of %s"
                             % (metric, ', '.join(self._KNOWN_METRICS)))

        # init members
        self.full_matrix = []
        self.u_triangle = None
        self.vector_form = None

        # this one we know straight away, so set it
        self.metric = metric

        # accept any array-like (e.g. nested lists) and strip array
        # subclasses so that row indexing below behaves uniformly
        data_vectors = N.asarray(data_vectors)

        # size of dataset (checking if we're dealing with a column vector
        # only, i.e. exactly one value per exemplar)
        num_exem = N.shape(data_vectors)[0]
        flag_1d = (num_exem == N.size(data_vectors))
        if flag_1d:
            num_features = 1
            # one scalar per exemplar
            rows = data_vectors.reshape(num_exem)
        else:
            num_features = N.shape(data_vectors)[1]
            rows = data_vectors

        # a correlation between two single values is undefined
        if flag_1d and num_exem > 0 and metric in ('spearman', 'pearson'):
            raise ValueError("Metric %r needs more than one feature per "
                             "exemplar" % (metric,))

        # pick the function computing the dissimilarity of two exemplars
        if metric == 'euclidean':
            def dissimilarity(a, b):
                return N.linalg.norm(a - b)
        elif metric == 'spearman':
            def dissimilarity(a, b):
                return 1 - stats.spearmanr(a, b)[0]
        elif metric == 'pearson':
            def dissimilarity(a, b):
                return 1 - stats.pearsonr(a, b)[0]
        elif flag_1d:
            # 'confusion' on scalars: 0 if both values are equal, else 1
            def dissimilarity(a, b):
                return 1 - int(a == b)
        else:
            # 'confusion' on vectors: number of matching positions divided
            # by num_features and rounded down, subtracted from 1
            def dissimilarity(a, b):
                return 1 - int(N.floor(N.sum(a == b) / float(num_features)))

        # generate output (dissimilarity) matrix
        # (N.asmatrix instead of N.mat, which newer numpy no longer has)
        dsmatrix = N.asmatrix(N.zeros((num_exem, num_exem)))

        # down rows
        for i in range(num_exem):
            # across columns
            for j in range(num_exem):
                dsmatrix[i, j] = dissimilarity(rows[i], rows[j])

        self.full_matrix = dsmatrix

    def getTriangle(self):
        """Return the full matrix with all elements below the main
        diagonal set to zero (computed on first use, then cached).
        """
        # if we need to create the u_triangle representation, do so
        if self.u_triangle is None:
            self.u_triangle = N.triu(self.full_matrix)

        return self.u_triangle

    def getVectorForm(self):
        """Return the upper triangle of the full matrix (main diagonal
        included) as a 1-d array in row-major order.

        The result is computed on first use and cached.
        """
        if self.vector_form is not None:
            return self.vector_form

        full = N.asarray(self.getFullMatrix())

        # select the upper triangle by position rather than by value, so
        # that zero and NaN entries are kept exactly as they are
        row_idx, col_idx = N.indices(full.shape)
        self.vector_form = full[row_idx <= col_idx]

        return self.vector_form

    # XXX is there any reason to have these get* methods
    #     instead of plain access to full_matrix and method?
    def getFullMatrix(self):
        """Return the square dissimilarity matrix built by the constructor.
        """
        return self.full_matrix

    def getMetric(self):
        """Return the name of the metric passed to the constructor.
        """
        return self.metric
186