-
Notifications
You must be signed in to change notification settings - Fork 1
/
weightedStats.m
154 lines (131 loc) · 5.41 KB
/
weightedStats.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
function [weightedMean,weightedStdOfMean,weightedStdOfSample] = weightedStats(data, weightsOrSigma,sw)
%wStats calculates weighted statistics (mean, stdev of mean) for a list of inputs with corresponding weights or {std}
%
%SYNOPSIS [weightedMean,weightedStd] = weightedStats(data, weightsOrSigma,sw)
%
%INPUT data: vector of input values.
%
% weightsOrSigma: weights or standardDeviations of the input data
% sw (opt): switch, either 'w' or {'s'}
%
% -> if either data or weights are matrices, the computation is done
% column-wise
%
%OUTPUT weightedMean: mean of the data weighted with weights (rowVector)
% weightedStdOfMean: sigma1 = sqrt[sum_i{(yi-mw)^2/sigmai^2}/((n-1)*sum_i{1/sigmai^2})]
% which is the general weighted std OF THE MEAN (not the sample, i.e. it is divided by sqrt(n))
% weightedStdOfSample = weightedStdOfMean * sqrt(n)
%
% CAUTION: according to www-gsi-vms.gsi.de/eb/people/wolle/buch/error.ps, if
% the uncertainity of the data points is much larger than the
% difference between them, sigma1 underestimates the "true" sigma.
% Hence, sigma2 = sqrt[1/sum_i{1/sigmai^2}] should be used. In
% general, the true sigma is to be max(sigma1,sigma2)
%
%reference: Taschenbuch der Mathematik, p. 815
%
%c: 06/03 jonas
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%test input: count input arguments
if nargin < 2 || isempty(weightsOrSigma) || isempty(data)
error('not enough or empty input arguments!')
end
weights = weightsOrSigma;
%test input: data size
sDat = size(data);
sIS = size(weights);
if any(sDat ~= sIS)
%one is a matrix and the other a vector, or bad user input
if sDat(1) ~= sIS(1)
if sDat(1) == sIS(2) && sDat(2) == sIS(1)
%bad user input: badly arranged vectors: make col-vectors
if sDat(1) == 1
data = data';
else
weights = weights';
end
else
%bad user input: fatal
error('bad input data size: if you want to specify a vector and a matrix for input, use a column-vector!')
end
else
%one's a vector, the other a matrix
if sDat(2) == 1
%make data a matrix
data = data*ones(1,sIS(2));
elseif sIS(2) == 1
%make weights a matrix
weights = weights*ones(1,sDat(2));
else
%bad input
error('bad input data size: specify either two matrices of equal size or a matrix and a vector or two vectors')
end
end
else
if sDat(1) == 1
%make both col vectors
data = data';
weights = weights';
end
end
%get # of data points
numRows = size(data,1);
numCols = size(data,2);
%calculate weights if necessary
if nargin == 2 || ~(strcmp(sw,'w') || strcmp(sw,'s'))
sw = 's';
end
if strcmp(sw,'s')
%w = 1/sigma^2
if any(weights == 0)
warning('WEIGHTEDSTATS:SigmaIsZero','At least one sigma == 0; set to eps');
weights = max(weights,eps);
end
%assign weight 1 to the measurement with smallest error
weights = (repmat(min(weights,[],1),numRows,1)./weights).^2;
end
%make sure the weights are positive
weights = abs(weights);
%calc weightedMean : each dataPoint is multiplied by the corresponding weight, the sum is divided
%by the sum of the weights
sumWeights = nansum(weights,1);
weightedMean = nansum(weights.*data,1)./sumWeights;
%---calc weightedStd---
squareDiffs = (data-repmat(weightedMean,numRows,1)).^2;
weightedSSQ = nansum(squareDiffs.*weights,1);
switch sw
case 'w'
%weighted mean : each squared difference from mean is weighted and divided by
%the number of non-zero weights http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf
%get divisor (nnz is not defined for matrices)
for i=numCols:-1:1
% set NaN-weights to 0
nanWeights = isnan(weights(:,i));
weights(nanWeights,i) = 0;
nnzw = nnz(weights(:,i));
divisor(1,i) = (nnzw-1)/nnzw*sumWeights(i);
end
%assign output
sigma = sqrt(weightedSSQ./divisor);
weightedStdOfSample = sigma;
weightedStdOfMean = sigma/sqrt(nnzw);
case 's'
%calculate sigma1 = sqrt[sum_i{(yi-mw)^2/sigmai^2}/((n-1)*sum_i{1/sigmai^2})]
%which is the general weighted std OF THE MEAN (not the sample, i.e. it is divided by sqrt(n))
%
%CAUTION: according to www-gsi-vms.gsi.de/eb/people/wolle/buch/error.ps, if
%the uncertainity of the data points is much larger than the
%difference between them, sigma1 underestimates the "true" sigma.
%Hence, sigma2 = sqrt[1/sum_i{1/sigmai^2}] should be used. In
%general, the true sigma is to be max(sigma1,sigma2)
%sigma1. Correct number of observations
numRows = sum(~isnan(data) & ~isnan(weights),1);
divisor = (numRows-1).*sumWeights;
sigma1 = sqrt(weightedSSQ./divisor);
%sigma2
%sigma2 = sqrt(1/sumWeights);
%assign output
%weightedStdOfMean = max(sigma1,sigma2);
weightedStdOfMean = sigma1;
weightedStdOfSample = sigma1.*sqrt(numRows);
end