diff --git a/Project.toml b/Project.toml index 7ff9cfd3..e227b578 100644 --- a/Project.toml +++ b/Project.toml @@ -3,12 +3,15 @@ uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a" version = "1.8.1" [deps] +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/src/GLM.jl b/src/GLM.jl index 0e64f56a..196f5639 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -120,6 +120,7 @@ module GLM include("glmfit.jl") include("ftest.jl") include("negbinfit.jl") + include("influence.jl") include("deprecated.jl") end # module diff --git a/src/influence.jl b/src/influence.jl new file mode 100644 index 00000000..57f0477b --- /dev/null +++ b/src/influence.jl @@ -0,0 +1,93 @@ +##### +##### Measures of influence +##### + +StatsBase.leverage(model::LinPredModel) = _leverage(model, model.pp) + +function _leverage(_, pred::DensePredQR) + Q = pred.qr.Q + r = linpred_rank(pred) + y = diagm(size(Q, 1), r, trues(r)) + Z = Q * y + return vec(sum(abs2, Z; dims=1)) +end + +function _leverage(model, pred::DensePredChol{<:Any,<:Cholesky}) + X = weightedmodelmatrix(model) + choldiv!(pred.chol, X) + return vec(sum(abs2, X; dims=2)) +end + +function _leverage(model, pred::DensePredChol{<:Any,<:CholeskyPivoted}) + X = weightedmodelmatrix(model) + C = pred.chol + if any(x -> isapprox(x, zero(x)), diag(C.L)) + Q = qr!(X).Q + r = rank(C) + y = diagm(size(Q, 1), r, trues(r)) + X = Q * y + else + choldiv!(C, X) + end + return vec(sum(abs2, X; dims=2)) +end + +function _leverage(model, pred::SparsePredChol) + X = weightedmodelmatrix(model) + Z = pred.chol.L \ X' # can't be done in-place for SuiteSparse factorizations + return vec(sum(abs2, Z; dims=1)) +end + +# Overwrite `X` with the solution `Z` to `L*Z = Xᵀ`, where `L` is the lower triangular +# factor from the Cholesky factorization of `XᵀX`. +choldiv!(C::Cholesky, X) = ldiv!(C.L, X') +choldiv!(C::CholeskyPivoted, X) = ldiv!(C.L, view(X, :, invperm(C.p))') + +# `X` for unweighted models, `√W*X` for weighted, where `W` is a diagonal matrix +# of the prior weights. A copy of `X` is made so that it can be mutated downstream +# without affecting the underlying model object. +function weightedmodelmatrix(model::LinearModel) + X = copy(modelmatrix(model)) + priorwt = model.rr.wts + if !isempty(priorwt) + X .*= sqrt.(priorwt) + end + return X +end + +# `√W*X`, where `W` is a diagonal matrix of the working weights from the final IRLS +# iteration. This handles GLMs with and without prior weights, as the prior weights +# simply become part of the working weights for IRLS. No explicit copy of `X` needs +# to be made since we're always doing a multiplication, unlike for the method above. +weightedmodelmatrix(model::GeneralizedLinearModel) = + modelmatrix(model) .* sqrt.(model.rr.wrkwt) + +@noinline function _checkrankdeficient(model) + if linpred_rank(model.pp) < size(modelmatrix(model), 2) + throw(ArgumentError("models with collinear terms are not currently supported")) + end + return nothing +end + +function StatsBase.cooksdistance(model::GeneralizedLinearModel) + _checkrankdeficient(model) + y = response(model) + ŷ = fitted(model) + k = dof(model) - hasintercept(model) + φ̂ = dispersion(model) + h = leverage(model) + D = model.rr.d + return @. (y - ŷ)^2 / glmvar(D, ŷ) * (h / (1 - h)^2) / (φ̂ * (k + 1)) +end + +function StatsBase.cooksdistance(model::LinearModel) + _checkrankdeficient(model) + u = residuals(model) + #if !isempty(model.rr.wts) + # u .*= sqrt.(model.rr.wts) + #end + mse = dispersion(model, true) + k = dof(model) - 1 + h = leverage(model) + return @. u^2 * (h / (1 - h)^2) / (k * mse) +end diff --git a/src/lm.jl b/src/lm.jl index dc7dd487..0e3197b4 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -325,29 +325,3 @@ function confint(obj::LinearModel; level::Real=0.95) hcat(coef(obj),coef(obj)) + stderror(obj) * quantile(TDist(dof_residual(obj)), (1. - level)/2.) * [1. -1.] end - -""" - cooksdistance(obj::LinearModel) - -Compute [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) -for each observation in linear model `obj`, giving an estimate of the influence -of each data point. -Currently only implemented for linear models without weights. -""" -function StatsBase.cooksdistance(obj::LinearModel) - u = residuals(obj) - mse = dispersion(obj,true) - k = dof(obj)-1 - d_res = dof_residual(obj) - X = modelmatrix(obj) - XtX = crossmodelmatrix(obj) - k == size(X,2) || throw(ArgumentError("Models with collinear terms are not currently supported.")) - wts = obj.rr.wts - if isempty(wts) - hii = diag(X * inv(XtX) * X') - else - throw(ArgumentError("Weighted models are not currently supported.")) - end - D = @. u^2 * (hii / (1 - hii)^2) / (k*mse) - return D -end diff --git a/test/runtests.jl b/test/runtests.jl index 78436748..dca969f8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -41,6 +41,12 @@ linreg(x::AbstractVecOrMat, y::AbstractVector) = qr!(simplemm(x)) \ y @test isapprox(aic(lm1), -36.409684288095946) @test isapprox(aicc(lm1), -24.409684288095946) @test isapprox(bic(lm1), -37.03440588041178) + # Comparator values for `leverage` and `cooksdistance` are from R's `hatvalues` and + # `cooks.distance`, respectively + @test leverage(lm1) ≈ [0.5918367347, 0.2816326531, 0.1673469388, 0.1836734694, + 0.2489795918, 0.5265306122] atol=1e-10 + @test cooksdistance(lm1) ≈ [1.0705381016, 0.0038594655, 0.0123926845, 0.0940006684, + 0.1666111600, 2.1649894169] atol=1e-10 lm2 = fit(LinearModel, hcat(ones(6), 10form.Carb), form.OptDen, true) @test isa(lm2.pp.chol, CholeskyPivoted) @test lm2.pp.chol.piv == [2, 1] @@ -100,7 +106,13 @@ end @test isapprox(loglikelihood(lm_model), -4353.946729075838) @test isapprox(loglikelihood(glm_model), -4353.946729075838) @test isapprox(nullloglikelihood(lm_model), -4984.892139711452) - @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + @test isapprox(mean(residuals(lm_model)), -5.412966629787718) + @test leverage(lm_model) ≈ leverage(glm_model) + # Values computed from R + @test leverage(lm_model)[1:10] ≈ [0.0031577628, 0.0050424090, 0.0044471988, 0.0084538844, + 0.0088088189, 0.0014421478, 0.0031762479, 0.0042745349, + 0.0073786323, 0.0126819467] atol=1e-8 + @test_broken cooksdistance(lm_model) ≈ cooksdistance(glm_model) end @testset "rankdeficient" begin @@ -154,6 +166,7 @@ end [Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf]) + @test leverage(model) ≈ [1, 1, 1] model = lm(@formula(y ~ 0 + x), df) ct = coeftable(model) @@ -165,6 +178,8 @@ end [Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf]) + @test leverage(model) ≈ [1, 1, 1] + @test all(isnan, cooksdistance(model)) model = glm(@formula(y ~ x), df, Normal(), IdentityLink()) ct = coeftable(model) @@ -176,6 +191,7 @@ end [Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf]) + @test leverage(model) ≈ [1, 1, 1] model = glm(@formula(y ~ 0 + x), df, Normal(), IdentityLink()) ct = coeftable(model) @@ -187,6 +203,8 @@ end [Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf Inf 0.0 1.0 -Inf Inf]) + @test leverage(model) ≈ [1, 1, 1] + @test all(isnan, cooksdistance(model)) # Saturated and rank-deficient model df = DataFrame(x1=["a", "b", "c"], x2=["a", "b", "c"], y=[1, 2, 3]) @@ -203,6 +221,15 @@ end Inf 0.0 1.0 -Inf Inf NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN]) + if model isa LinearModel + # This is a very loose tolerance; the difference may just be due to the + # difference in how models are fit between this package and R, in particular + # the use of Cholesky vs. QR factorization, but idk + @test leverage(model) ≈ [1, 1, 1] atol=1e-4 + else + # Currently these are like [1.05, 1.49, 1.11] + @test_broken leverage(model) ≈ [1, 1, 1] atol=1e-4 + end end end