Implementation of dropcollinear feature in GeneralizedLinearModel (…

…#488)
JuliaStats · Oct 21, 2022 · 1459737 · 1459737
1 parent e2e9c85
commit 1459737
Show file tree

Hide file tree

Showing 5 changed files with 200 additions and 19 deletions.
diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -99,7 +99,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
    3 │     2      1
 
 julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink())
-StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
+StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
 
 Y ~ 1 + X
 
@@ -140,7 +140,7 @@ julia> quine = dataset("MASS", "quine")
                      131 rows omitted
 
 julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink())
-StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
+StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
 
 Days ~ 1 + Eth + Sex + Age + Lrn
 
@@ -158,7 +158,7 @@ Lrn: SL       0.296768     0.185934   1.60    0.1105  -0.0676559   0.661191
 ────────────────────────────────────────────────────────────────────────────
 
 julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink())
-StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
+StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
 
 Days ~ 1 + Eth + Sex + Age + Lrn
 
@@ -364,7 +364,7 @@ julia> dobson = DataFrame(Counts    = [18.,17,15,20,10,21,25,13,13],
    9 │    13.0  3        3
 
 julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson())
-StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
+StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
 
 Counts ~ 1 + Outcome + Treatment
 
@@ -421,7 +421,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ
 0.40935
 
 julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model
-StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
+StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
 
 Volume ~ 1 + Height + Girth
 

diff --git a/src/glmfit.jl b/src/glmfit.jl
@@ -282,7 +282,8 @@ function nulldeviance(m::GeneralizedLinearModel)
     else
         X = fill(1.0, length(y), hasint ? 1 : 0)
         nullm = fit(GeneralizedLinearModel,
-                    X, y, d, r.link, wts=wts, offset=offset,
+                    X, y, d, r.link; wts=wts, offset=offset,
+                    dropcollinear=isa(m.pp.chol, CholeskyPivoted),
                     maxiter=m.maxiter, minstepfac=m.minstepfac,
                     atol=m.atol, rtol=m.rtol)
         dev = deviance(nullm)
@@ -336,15 +337,19 @@ function nullloglikelihood(m::GeneralizedLinearModel)
     else
         X = fill(1.0, length(y), hasint ? 1 : 0)
         nullm = fit(GeneralizedLinearModel,
-                    X, y, d, r.link, wts=wts, offset=offset,
+                    X, y, d, r.link; wts=wts, offset=offset,
+                    dropcollinear=isa(m.pp.chol, CholeskyPivoted),
                     maxiter=m.maxiter, minstepfac=m.minstepfac,
                     atol=m.atol, rtol=m.rtol)
         ll = loglikelihood(nullm)
     end
     return ll
 end
 
-dof(x::GeneralizedLinearModel) = dispersion_parameter(x.rr.d) ? length(coef(x)) + 1 : length(coef(x))
+function dof(x::GeneralizedLinearModel)
+    modelrank = linpred_rank(x.pp)
+    dispersion_parameter(x.rr.d) ? modelrank + 1 : modelrank
+end
 
 function _fit!(m::AbstractGLM, verbose::Bool, maxiter::Integer, minstepfac::Real,
                atol::Real, rtol::Real, start)
@@ -521,6 +526,12 @@ const FIT_GLM_DOC = """
     for a list of built-in links).
 
     # Keyword Arguments
+    - `dropcollinear::Bool=true`: Controls whether or not `lm` accepts a model matrix which
+      is less-than-full rank.
+      If `true` (the default) the coefficient for redundant linearly dependent columns is
+      `0.0` and all associated statistics are set to `NaN`.
+      Typically from a set of linearly-dependent columns the last ones are identified as redundant
+      (however, the exact selection of columns identified as redundant is not guaranteed).
     - `dofit::Bool=true`: Determines whether model will be fit
     - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
       Such weights are equivalent to repeating each observation a number of times equal
@@ -557,6 +568,7 @@ function fit(::Type{M},
     y::AbstractVector{<:Real},
     d::UnivariateDistribution,
     l::Link = canonicallink(d);
+    dropcollinear::Bool = true,
     dofit::Bool = true,
     wts::AbstractVector{<:Real}      = similar(y, 0),
     offset::AbstractVector{<:Real}   = similar(y, 0),
@@ -568,7 +580,7 @@ function fit(::Type{M},
     end
 
     rr = GlmResp(y, d, l, offset, wts)
-    res = M(rr, cholpred(X), false)
+    res = M(rr, cholpred(X, dropcollinear), false)
     return dofit ? fit!(res; fitargs...) : res
 end
 

diff --git a/src/linpred.jl b/src/linpred.jl
@@ -157,11 +157,34 @@ function delbeta!(p::DensePredChol{T,<:Cholesky}, r::Vector{T}, wt::Vector{T}) w
 end
 
 function delbeta!(p::DensePredChol{T,<:CholeskyPivoted}, r::Vector{T}, wt::Vector{T}) where T<:BlasReal
-    cf = cholfactors(p.chol)
-    piv = p.chol.p
-    cf .= mul!(p.scratchm2, adjoint(LinearAlgebra.mul!(p.scratchm1, Diagonal(wt), p.X)), p.X)[piv, piv]
-    cholesky!(Hermitian(cf, Symbol(p.chol.uplo)))
-    ldiv!(p.chol, mul!(p.delbeta, transpose(p.scratchm1), r))
+    piv = p.chol.p # inverse vector
+    delbeta = p.delbeta
+    # p.scratchm1 = WX
+    mul!(p.scratchm1, Diagonal(wt), p.X)
+    # p.scratchm2 = X'WX
+    mul!(p.scratchm2, adjoint(p.scratchm1), p.X)
+    # delbeta = X'Wr
+    mul!(delbeta, transpose(p.scratchm1), r)
+    # calculate delbeta = (X'WX)\X'Wr
+    rnk = rank(p.chol)
+    if rnk == length(delbeta)
+        cf = cholfactors(p.chol)
+        cf .= p.scratchm2[piv, piv]
+        cholesky!(Hermitian(cf, Symbol(p.chol.uplo)))
+        ldiv!(p.chol, delbeta)
+    else
+        permute!(delbeta, piv)
+        for k=(rnk+1):length(delbeta)
+            delbeta[k] = -zero(T)
+        end
+        # shift full rank column to 1:rank
+        cf = cholfactors(p.chol)
+        cf .= p.scratchm2[piv, piv]
+        cholesky!(Hermitian(view(cf, 1:rnk, 1:rnk), Symbol(p.chol.uplo)))
+        ldiv!(Cholesky(view(cf, 1:rnk, 1:rnk), Symbol(p.chol.uplo), p.chol.info),
+              view(delbeta, 1:rnk))
+        invpermute!(delbeta, piv)
+    end
     p
 end
 
@@ -264,3 +287,6 @@ coef(obj::LinPredModel) = coef(obj.pp)
 dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1
 
 hasintercept(m::LinPredModel) = any(i -> all(==(1), view(m.pp.X , :, i)), 1:size(m.pp.X, 2))
+
+linpred_rank(x::LinPred) = length(x.beta0)
+linpred_rank(x::DensePredChol{<:Any, <:CholeskyPivoted}) = x.chol.rank
diff --git a/src/lm.jl b/src/lm.jl
@@ -157,9 +157,7 @@ $FIT_LM_DOC
 lm(X, y, allowrankdeficient_dep::Union{Bool,Nothing}=nothing; kwargs...) =
     fit(LinearModel, X, y, allowrankdeficient_dep; kwargs...)
 
-dof(x::LinearModel) = length(coef(x)) + 1
-
-dof(obj::LinearModel{<:LmResp,<:DensePredChol{<:Real,<:CholeskyPivoted}}) = obj.pp.chol.rank + 1
+dof(x::LinearModel) = linpred_rank(x.pp) + 1
 
 """
     deviance(obj::LinearModel)

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -767,8 +767,8 @@ end
 
     # Poisson with categorical predictors, weights and offset
     nointglm3 = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 0 + Prewt + Treat), anorexia,
-                    Poisson(), LogLink(), offset=log.(anorexia.Prewt),
-                    wts=repeat(1:4, outer=18), rtol=1e-8)
+                    Poisson(), LogLink(); offset=log.(anorexia.Prewt),
+                    wts=repeat(1:4, outer=18), rtol=1e-8, dropcollinear=false)
     @test !hasintercept(nointglm3.model)
     @test GLM.cancancel(nointglm3.model.rr)
     test_show(nointglm3)
@@ -1446,3 +1446,148 @@ end
         @test predict(mdl1) ≈ predict(mdl2)
     end
 end
+
+@testset "dropcollinear with GLMs" begin
+    data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1], 
+                     x3=[4.2, 4.6, 8.4, 6.2, 4.2], y=[14, 14, 24, 20, 11])
+
+    @testset "Check normal with identity link against equivalent linear model" begin
+        mdl1 = lm(@formula(y ~ x1 + x2 + x3), data; dropcollinear=true)
+        mdl2 = glm(@formula(y ~ x1 + x2 + x3), data, Normal(), IdentityLink();
+                   dropcollinear=true)
+
+        @test coef(mdl1) ≈ coef(mdl2)
+        @test stderror(mdl1)[1:3] ≈ stderror(mdl2)[1:3]
+        @test isnan(stderror(mdl1)[4])
+        @test dof(mdl1) == dof(mdl2)
+        @test dof_residual(mdl1) == dof_residual(mdl2)
+        @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true)
+        @test deviance(mdl1) ≈ deviance(mdl2)
+        @test loglikelihood(mdl1) ≈ loglikelihood(mdl2)
+        @test aic(mdl1) ≈ aic(mdl2)
+        @test predict(mdl1) ≈ predict(mdl2)
+    end
+    @testset "Check against equivalent linear model when dropcollinear = false" begin
+        mdl1 = lm(@formula(y ~ x1 + x2), data; dropcollinear=false)
+        mdl2 = glm(@formula(y ~ x1 + x2), data, Normal(), IdentityLink();
+                   dropcollinear=false)
+
+        @test coef(mdl1) ≈ coef(mdl2)
+        @test stderror(mdl1) ≈ stderror(mdl2)
+        @test dof(mdl1) == dof(mdl2)
+        @test dof_residual(mdl1) == dof_residual(mdl2)
+        @test GLM.dispersion(mdl1.model, true) ≈ GLM.dispersion(mdl2.model,true)
+        @test deviance(mdl1) ≈ deviance(mdl2)
+        @test loglikelihood(mdl1) ≈ loglikelihood(mdl2)
+        @test aic(mdl1) ≈ aic(mdl2)
+        @test predict(mdl1) ≈ predict(mdl2)
+    end
+
+    @testset "Check normal with identity link against outputs from R" begin
+        mdl = glm(@formula(y ~ x1 + x2 + x3), data, Normal(), IdentityLink();
+                   dropcollinear=true)
+        @test coef(mdl) ≈ [1.350439882697950, 1.740469208211143, 1.171554252199414, 0.0]
+        @test stderror(mdl)[1:3] ≈ [0.58371400875263, 0.10681694901238, 0.08531532203251]
+        @test dof(mdl) == 4
+        @test dof_residual(mdl) == 2
+        @test GLM.dispersion(mdl.model, true) ≈ 0.1341642228738996
+        @test deviance(mdl) ≈ 0.2683284457477991
+        @test loglikelihood(mdl) ≈ 0.2177608775670037
+        @test aic(mdl) ≈ 7.564478244866
+        @test predict(mdl) ≈ [14.17008797653959, 13.56744868035191, 24.04398826979472,
+                              19.99413489736071, 11.22434017595308]
+    end
+
+    num_rows = 100
+    dfrm = DataFrame()
+    dfrm.x1 = randn(StableRNG(123), num_rows)
+    dfrm.x2 = randn(StableRNG(1234), num_rows)
+    dfrm.x3 = 2*dfrm.x1 + 3*dfrm.x2
+    dfrm.y = Int.(randn(StableRNG(12345), num_rows) .> 0)
+
+    @testset "Test Logistic Regression Outputs from R" begin
+        mdl = glm(@formula(y ~ x1 + x2 + x3), dfrm, Binomial(), LogitLink();
+                  dropcollinear=true)
+        @test coef(mdl) ≈ [-0.1402582892604246, 0.1362176272953289, 0, -0.1134751362230204] atol = 1.0E-6
+        stderr = stderror(mdl)
+        @test isnan(stderr[3]) == true
+        @test vcat(stderr[1:2], stderr[4])  ≈ [0.20652049856206, 0.25292632684716, 0.07496476901643] atol = 1.0E-4
+        @test deviance(mdl) ≈ 135.68506068159
+        @test loglikelihood(mdl) ≈ -67.8425303407948
+        @test dof(mdl) == 3
+        @test dof_residual(mdl) == 98
+        @test aic(mdl) ≈ 141.68506068159
+        @test GLM.dispersion(mdl.model, true) ≈ 1
+        @test predict(mdl)[1:3] ≈ [0.4241893070433117, 0.3754516361306202, 0.6327877688720133] atol = 1.0E-6
+        @test confint(mdl)[1:2,1:2] ≈ [-0.5493329715011036 0.26350316142056085;
+                                       -0.3582545657827583 0.64313795309765587] atol = 1.0E-1
+    end
+
+    @testset "`rankdeficient` test case of lm in glm" begin
+        rng = StableRNG(1234321)
+        # an example of rank deficiency caused by a missing cell in a table
+        dfrm = DataFrame([categorical(repeat(string.('A':'D'), inner = 6)),
+                          categorical(repeat(string.('a':'c'), inner = 2, outer = 4))],
+                          [:G, :H])
+        f = @formula(0 ~ 1 + G*H)
+        X = ModelMatrix(ModelFrame(f, dfrm)).m
+        y = X * (1:size(X, 2)) + 0.1 * randn(rng, size(X, 1))
+        inds = deleteat!(collect(1:length(y)), 7:8)
+        m1 = fit(GeneralizedLinearModel, X, y, Normal())
+        @test isapprox(deviance(m1), 0.12160301538297297)
+        Xmissingcell = X[inds, :]
+        ymissingcell = y[inds]
+        @test_throws PosDefException m2 = glm(Xmissingcell, ymissingcell, Normal();
+            dropcollinear=false)
+        m2p = glm(Xmissingcell, ymissingcell, Normal(); dropcollinear=true)
+        @test isa(m2p.pp.chol, CholeskyPivoted)
+        @test rank(m2p.pp.chol) == 11
+        @test isapprox(deviance(m2p), 0.1215758392280204)
+        @test isapprox(coef(m2p), [0.9772643585228885, 8.903341608496437, 3.027347397503281,
+            3.9661379199401257, 5.079410103608552, 6.1944618141188625, 0.0, 7.930328728005131,
+            8.879994918604757, 2.986388408421915, 10.84972230524356, 11.844809275711485])
+        @test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:])
+
+        m2p_dep_pos = glm(Xmissingcell, ymissingcell, Normal())
+        @test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " *
+                    "argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true)
+        @test isa(m2p_dep_pos.pp.chol, CholeskyPivoted)
+        @test rank(m2p_dep_pos.pp.chol) == rank(m2p.pp.chol)
+        @test isapprox(deviance(m2p_dep_pos), deviance(m2p))
+        @test isapprox(coef(m2p_dep_pos), coef(m2p))
+    end
+
+    @testset "`rankdeficient` test in GLM with Gamma distribution" begin
+        rng = StableRNG(1234321)
+        # an example of rank deficiency caused by a missing cell in a table
+        dfrm = DataFrame([categorical(repeat(string.('A':'D'), inner = 6)),
+                          categorical(repeat(string.('a':'c'), inner = 2, outer = 4))],
+                          [:G, :H])
+        f = @formula(0 ~ 1 + G*H)
+        X = ModelMatrix(ModelFrame(f, dfrm)).m
+        y = X * (1:size(X, 2)) + 0.1 * randn(rng, size(X, 1))
+        inds = deleteat!(collect(1:length(y)), 7:8)
+        m1 = fit(GeneralizedLinearModel, X, y, Gamma())
+        @test isapprox(deviance(m1), 0.0407069934950098)
+        Xmissingcell = X[inds, :]
+        ymissingcell = y[inds]
+        @test_throws PosDefException glm(Xmissingcell, ymissingcell, Gamma(); dropcollinear=false)
+        m2p = glm(Xmissingcell, ymissingcell, Gamma(); dropcollinear=true)
+        @test isa(m2p.pp.chol, CholeskyPivoted)
+        @test rank(m2p.pp.chol) == 11
+        @test isapprox(deviance(m2p), 0.04070377141288433)
+        @test isapprox(coef(m2p), [ 1.0232644374837732, -0.0982622592717195, -0.7735523403010212,
+            -0.820974608805111, -0.8581573302333557, -0.8838279927663583, 0.0, 0.667219148331652,
+            0.7087696966674913, 0.011287703617517712, 0.6816245514668273, 0.7250492032072612])
+        @test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:])
+
+        m2p_dep_pos = fit(GeneralizedLinearModel, Xmissingcell, ymissingcell, Gamma())
+        @test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " *
+                    "argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true)
+        @test isa(m2p_dep_pos.pp.chol, CholeskyPivoted)
+        @test rank(m2p_dep_pos.pp.chol) == rank(m2p.pp.chol)
+        @test isapprox(deviance(m2p_dep_pos), deviance(m2p))
+        @test isapprox(coef(m2p_dep_pos), coef(m2p))
+    end
+end
+