function [b,bint,r,rint,stats] = regress(y,X,alpha)
%REGRESS Multiple linear regression using least squares.
% B = REGRESS(Y,X) returns the vector B of regression coefficients in the
% linear model Y = X*B. X is an n-by-p design matrix, with rows
% corresponding to observations and columns to predictor variables. Y is
% an n-by-1 vector of response observations.
%
% [B,BINT] = REGRESS(Y,X) returns a matrix BINT of 95% confidence
% intervals for B.
%
% [B,BINT,R] = REGRESS(Y,X) returns a vector R of residuals.
%
% [B,BINT,R,RINT] = REGRESS(Y,X) returns a matrix RINT of intervals that
% can be used to diagnose outliers. If RINT(i,:) does not contain zero,
% then the i-th residual is larger than would be expected at the 5%
% significance level. This is evidence that the i-th observation is an
% outlier.
%
% [B,BINT,R,RINT,STATS] = REGRESS(Y,X) returns a vector STATS containing, in
% the following order, the R-square statistic, the F statistic and p value
% for the full model, and an estimate of the error variance.
%
% [...] = REGRESS(Y,X,ALPHA) uses a 100*(1-ALPHA)% confidence level to
% compute BINT, and a (100*ALPHA)% significance level to compute RINT.
%
% X should include a column of ones so that the model contains a constant
% term. The F statistic and p value are computed under the assumption
% that the model contains a constant term, and they are not correct for
% models without a constant. The R-square value is one minus the ratio of
% the error sum of squares to the total sum of squares. This value can
% be negative for models without a constant, which indicates that the
% model is not appropriate for the data.
%
% If columns of X are linearly dependent, REGRESS sets the maximum
% possible number of elements of B to zero to obtain a "basic solution",
% and returns zeros in elements of BINT corresponding to the zero
% elements of B.
%
% REGRESS treats NaNs in X or Y as missing values, and removes them.
%
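% Example (an illustrative sketch; the data below are made up for
% demonstration and are not part of this function):
%    x1 = rand(20,1); x2 = rand(20,1);
%    y = 2 + 3*x1 - 1.5*x2 + 0.1*randn(20,1);
%    X = [ones(20,1) x1 x2];   % include a column of ones for the intercept
%    [b,bint] = regress(y,X);  % b should be close to [2; 3; -1.5]
%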
% See also LSCOV, POLYFIT, REGSTATS, ROBUSTFIT, STEPWISE.

% References:
%    [1] Chatterjee, S. and A.S. Hadi (1986) "Influential Observations,
%        High Leverage Points, and Outliers in Linear Regression",
%        Statistical Science 1(3):379-416.
%    [2] Draper, N. and H. Smith (1981) Applied Regression Analysis, 2nd
%        ed., Wiley.

% Copyright 1993-2005 The MathWorks, Inc.
% $Revision: 2.13.2.7 $  $Date: 2005/11/18 14:29:01 $

if nargin < 2
    error('stats:regress:TooFewInputs', ...
          'REGRESS requires at least two input arguments.');
elseif nargin == 2
    alpha = 0.05;
end

% Check that matrix (X) and left hand side (y) have compatible dimensions
[n,ncolX] = size(X);
if ~isvector(y)
    error('stats:regress:InvalidData', 'Y must be a vector.');
elseif numel(y) ~= n
    error('stats:regress:InvalidData', ...
          'The number of rows in Y must equal the number of rows in X.');
end

% Remove missing values, if any
wasnan = (isnan(y) | any(isnan(X),2));
havenans = any(wasnan);
if havenans
    y(wasnan) = [];
    X(wasnan,:) = [];
    n = length(y);
end

% Use the rank-revealing QR to remove dependent columns of X.
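% (With three outputs and the economy-size flag, QR permutes the columns of
% X so that abs(diag(R)) is decreasing; diagonal entries that fall below the
% scaled machine-precision tolerance mark numerically dependent columns.)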
[Q,R,perm] = qr(X,0);
p = sum(abs(diag(R)) > max(n,ncolX)*eps(R(1)));
if p < ncolX
    warning('stats:regress:RankDefDesignMat', ...
            'X is rank deficient to within machine precision.');
    R = R(1:p,1:p);
    Q = Q(:,1:p);
    perm = perm(1:p);
end

% Compute the LS coefficients, filling in zeros in elements corresponding
% to the dependent columns of X that were dropped above.
b = zeros(ncolX,1);
b(perm) = R \ (Q'*y);

if nargout >= 2
    % Find a confidence interval for each component of B:
    % Draper and Smith, equation 2.6.15, page 94.
    RI = R\eye(p);
    nu = max(0,n-p);            % Residual degrees of freedom
    yhat = X*b;                 % Predicted responses at each data point
    r = y-yhat;                 % Residuals
    normr = norm(r);
    if nu ~= 0
        rmse = normr/sqrt(nu);  % Root mean square error
        tval = tinv((1-alpha/2),nu);
    else
        rmse = NaN;
        tval = 0;
    end
    s2 = rmse^2;                % Estimator of error variance
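    % (Cov(b) = s2*inv(X'*X); with the pivoted QR factorization
    % X(:,perm) = Q*R this is s2*(RI*RI'), so the standard errors below are
    % the row norms of RI scaled by rmse.)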
    se = zeros(ncolX,1);
    se(perm,:) = rmse*sqrt(sum(abs(RI).^2,2));
    bint = [b-tval*se, b+tval*se];

    % Find the standard errors of the residuals.
    % Get the diagonal elements of the "Hat" matrix.
    % Calculate the variance estimate obtained by removing each case (i.e.
    % sigmai); see Chatterjee and Hadi p. 380, equation 14.
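    % (For the economy-size QR, the hat matrix is H = Q*Q', so its diagonal,
    % the leverage of each observation, is sum(abs(Q).^2,2).)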
    if nargout >= 4
        hatdiag = sum(abs(Q).^2,2);
        ok = ((1-hatdiag) > sqrt(eps(class(hatdiag))));
        hatdiag(~ok) = 1;
        if nu > 1
            denom = (nu-1) .* (1-hatdiag);
            sigmai = zeros(length(denom),1);
            sigmai(ok) = sqrt(max(0, (nu*s2/(nu-1)) - (r(ok).^2 ./ denom(ok))));
            ser = sqrt(1-hatdiag) .* sigmai;
            ser(~ok) = Inf;
        elseif nu == 1
            ser = sqrt(1-hatdiag) .* rmse;
            ser(~ok) = Inf;
        else % nu == 0
            ser = rmse*ones(length(y),1); % rmse is NaN here, so ser is all NaN
        end

        % Create confidence intervals for residuals.
        rint = [(r-tval*ser) (r+tval*ser)];
    end

    % Calculate R-squared and the other statistics.
    if nargout == 5
        % There are several ways to compute R^2, all equivalent for a
        % linear model where X includes a constant term, but not equivalent
        % otherwise.  R^2 can be negative for models without an intercept.
        % This indicates that the model is inappropriate.
        SSE = normr.^2;              % Error sum of squares
        RSS = norm(yhat-mean(y))^2;  % Regression sum of squares
        TSS = norm(y-mean(y))^2;     % Total sum of squares
        r2 = 1 - SSE/TSS;            % R-square statistic
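        % (F compares the regression mean square RSS/(p-1) to the error mean
        % square s2 = SSE/nu; under the hypothesis that all non-constant
        % coefficients are zero it has an F(p-1,nu) distribution.)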
        if p > 1
            F = (RSS/(p-1))/s2;      % F statistic for regression
        else
            F = NaN;
        end
        prob = 1 - fcdf(F,p-1,nu);   % Significance probability for regression
        stats = [r2 F prob s2];

        % All of the above requires a constant term in the model. Do we have one?
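        % (The test below asks whether ones(n,1) lies in the column space of
        % X: if some combination b0 of the retained columns reproduces a
        % column of ones, the model has an implied constant term even without
        % an explicit column of ones.)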
        if ~any(all(X==1,1))
            % Apparently not, but look for an implied constant.
            b0 = R\(Q'*ones(n,1));
            if (sum(abs(1-X(:,perm)*b0)) > n*sqrt(eps(class(X))))
                warning('stats:regress:NoConst', ...
                        ['R-square and the F statistic are not well-defined' ...
                         ' unless X has a column of ones.\nType "help' ...
                         ' regress" for more information.']);
            end
        end
    end

    % Restore NaN so inputs and outputs conform
    if havenans
        if nargout >= 3
            tmp = repmat(NaN,length(wasnan),1);
            tmp(~wasnan) = r;
            r = tmp;
            if nargout >= 4
                tmp = repmat(NaN,length(wasnan),2);
                tmp(~wasnan,:) = rint;
                rint = tmp;
            end
        end
    end

end % nargout >= 2