Skip to content

Commit

Permalink
Implementation of dropcollinear feature in GeneralizedLinearModel (
Browse files Browse the repository at this point in the history
  • Loading branch information
mousum-github authored Oct 21, 2022
1 parent e2e9c85 commit 1459737
Show file tree
Hide file tree
Showing 5 changed files with 200 additions and 19 deletions.
10 changes: 5 additions & 5 deletions docs/src/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
3 │ 2 1
julia> probit = glm(@formula(Y ~ X), data, Binomial(), ProbitLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
Y ~ 1 + X
Expand Down Expand Up @@ -140,7 +140,7 @@ julia> quine = dataset("MASS", "quine")
131 rows omitted
julia> nbrmodel = glm(@formula(Days ~ Eth+Sex+Age+Lrn), quine, NegativeBinomial(2.0), LogLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
Days ~ 1 + Eth + Sex + Age + Lrn
Expand All @@ -158,7 +158,7 @@ Lrn: SL 0.296768 0.185934 1.60 0.1105 -0.0676559 0.661191
────────────────────────────────────────────────────────────────────────────
julia> nbrmodel = negbin(@formula(Days ~ Eth+Sex+Age+Lrn), quine, LogLink())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, NegativeBinomial{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
Days ~ 1 + Eth + Sex + Age + Lrn
Expand Down Expand Up @@ -364,7 +364,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,21,25,13,13],
9 │ 13.0 3 3
julia> gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ Outcome + Treatment), dobson, Poisson())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Poisson{Float64}, LogLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
Counts ~ 1 + Outcome + Treatment
Expand Down Expand Up @@ -421,7 +421,7 @@ julia> round(optimal_bic.minimizer, digits = 5) # Optimal λ
0.40935
julia> glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(optimal_bic.minimizer)) # Best model
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, PowerLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
Volume ~ 1 + Height + Girth
Expand Down
20 changes: 16 additions & 4 deletions src/glmfit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ function nulldeviance(m::GeneralizedLinearModel)
else
X = fill(1.0, length(y), hasint ? 1 : 0)
nullm = fit(GeneralizedLinearModel,
X, y, d, r.link, wts=wts, offset=offset,
X, y, d, r.link; wts=wts, offset=offset,
dropcollinear=isa(m.pp.chol, CholeskyPivoted),
maxiter=m.maxiter, minstepfac=m.minstepfac,
atol=m.atol, rtol=m.rtol)
dev = deviance(nullm)
Expand Down Expand Up @@ -336,15 +337,19 @@ function nullloglikelihood(m::GeneralizedLinearModel)
else
X = fill(1.0, length(y), hasint ? 1 : 0)
nullm = fit(GeneralizedLinearModel,
X, y, d, r.link, wts=wts, offset=offset,
X, y, d, r.link; wts=wts, offset=offset,
dropcollinear=isa(m.pp.chol, CholeskyPivoted),
maxiter=m.maxiter, minstepfac=m.minstepfac,
atol=m.atol, rtol=m.rtol)
ll = loglikelihood(nullm)
end
return ll
end

dof(x::GeneralizedLinearModel) = dispersion_parameter(x.rr.d) ? length(coef(x)) + 1 : length(coef(x))
function dof(x::GeneralizedLinearModel)
modelrank = linpred_rank(x.pp)
dispersion_parameter(x.rr.d) ? modelrank + 1 : modelrank
end

function _fit!(m::AbstractGLM, verbose::Bool, maxiter::Integer, minstepfac::Real,
atol::Real, rtol::Real, start)
Expand Down Expand Up @@ -521,6 +526,12 @@ const FIT_GLM_DOC = """
for a list of built-in links).
# Keyword Arguments
- `dropcollinear::Bool=true`: Controls whether or not `lm` accepts a model matrix which
is less-than-full rank.
If `true` (the default) the coefficient for redundant linearly dependent columns is
`0.0` and all associated statistics are set to `NaN`.
Typically from a set of linearly-dependent columns the last ones are identified as redundant
(however, the exact selection of columns identified as redundant is not guaranteed).
- `dofit::Bool=true`: Determines whether model will be fit
- `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
Such weights are equivalent to repeating each observation a number of times equal
Expand Down Expand Up @@ -557,6 +568,7 @@ function fit(::Type{M},
y::AbstractVector{<:Real},
d::UnivariateDistribution,
l::Link = canonicallink(d);
dropcollinear::Bool = true,
dofit::Bool = true,
wts::AbstractVector{<:Real} = similar(y, 0),
offset::AbstractVector{<:Real} = similar(y, 0),
Expand All @@ -568,7 +580,7 @@ function fit(::Type{M},
end

rr = GlmResp(y, d, l, offset, wts)
res = M(rr, cholpred(X), false)
res = M(rr, cholpred(X, dropcollinear), false)
return dofit ? fit!(res; fitargs...) : res
end

Expand Down
36 changes: 31 additions & 5 deletions src/linpred.jl
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,34 @@ function delbeta!(p::DensePredChol{T,<:Cholesky}, r::Vector{T}, wt::Vector{T}) w
end

function delbeta!(p::DensePredChol{T,<:CholeskyPivoted}, r::Vector{T}, wt::Vector{T}) where T<:BlasReal
cf = cholfactors(p.chol)
piv = p.chol.p
cf .= mul!(p.scratchm2, adjoint(LinearAlgebra.mul!(p.scratchm1, Diagonal(wt), p.X)), p.X)[piv, piv]
cholesky!(Hermitian(cf, Symbol(p.chol.uplo)))
ldiv!(p.chol, mul!(p.delbeta, transpose(p.scratchm1), r))
piv = p.chol.p # inverse vector
delbeta = p.delbeta
# p.scratchm1 = WX
mul!(p.scratchm1, Diagonal(wt), p.X)
# p.scratchm2 = X'WX
mul!(p.scratchm2, adjoint(p.scratchm1), p.X)
# delbeta = X'Wr
mul!(delbeta, transpose(p.scratchm1), r)
# calculate delbeta = (X'WX)\X'Wr
rnk = rank(p.chol)
if rnk == length(delbeta)
cf = cholfactors(p.chol)
cf .= p.scratchm2[piv, piv]
cholesky!(Hermitian(cf, Symbol(p.chol.uplo)))
ldiv!(p.chol, delbeta)
else
permute!(delbeta, piv)
for k=(rnk+1):length(delbeta)
delbeta[k] = -zero(T)
end
# shift full rank column to 1:rank
cf = cholfactors(p.chol)
cf .= p.scratchm2[piv, piv]
cholesky!(Hermitian(view(cf, 1:rnk, 1:rnk), Symbol(p.chol.uplo)))
ldiv!(Cholesky(view(cf, 1:rnk, 1:rnk), Symbol(p.chol.uplo), p.chol.info),
view(delbeta, 1:rnk))
invpermute!(delbeta, piv)
end
p
end

Expand Down Expand Up @@ -264,3 +287,6 @@ coef(obj::LinPredModel) = coef(obj.pp)
dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1

hasintercept(m::LinPredModel) = any(i -> all(==(1), view(m.pp.X , :, i)), 1:size(m.pp.X, 2))

linpred_rank(x::LinPred) = length(x.beta0)
linpred_rank(x::DensePredChol{<:Any, <:CholeskyPivoted}) = x.chol.rank
4 changes: 1 addition & 3 deletions src/lm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,7 @@ $FIT_LM_DOC
lm(X, y, allowrankdeficient_dep::Union{Bool,Nothing}=nothing; kwargs...) =
fit(LinearModel, X, y, allowrankdeficient_dep; kwargs...)

dof(x::LinearModel) = length(coef(x)) + 1

dof(obj::LinearModel{<:LmResp,<:DensePredChol{<:Real,<:CholeskyPivoted}}) = obj.pp.chol.rank + 1
dof(x::LinearModel) = linpred_rank(x.pp) + 1

"""
deviance(obj::LinearModel)
Expand Down
149 changes: 147 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -767,8 +767,8 @@ end

# Poisson with categorical predictors, weights and offset
nointglm3 = fit(GeneralizedLinearModel, @formula(round(Postwt) ~ 0 + Prewt + Treat), anorexia,
Poisson(), LogLink(), offset=log.(anorexia.Prewt),
wts=repeat(1:4, outer=18), rtol=1e-8)
Poisson(), LogLink(); offset=log.(anorexia.Prewt),
wts=repeat(1:4, outer=18), rtol=1e-8, dropcollinear=false)
@test !hasintercept(nointglm3.model)
@test GLM.cancancel(nointglm3.model.rr)
test_show(nointglm3)
Expand Down Expand Up @@ -1446,3 +1446,148 @@ end
@test predict(mdl1) predict(mdl2)
end
end

@testset "dropcollinear with GLMs" begin
data = DataFrame(x1=[4, 5, 9, 6, 5], x2=[5, 3, 6, 7, 1],
x3=[4.2, 4.6, 8.4, 6.2, 4.2], y=[14, 14, 24, 20, 11])

@testset "Check normal with identity link against equivalent linear model" begin
mdl1 = lm(@formula(y ~ x1 + x2 + x3), data; dropcollinear=true)
mdl2 = glm(@formula(y ~ x1 + x2 + x3), data, Normal(), IdentityLink();
dropcollinear=true)

@test coef(mdl1) coef(mdl2)
@test stderror(mdl1)[1:3] stderror(mdl2)[1:3]
@test isnan(stderror(mdl1)[4])
@test dof(mdl1) == dof(mdl2)
@test dof_residual(mdl1) == dof_residual(mdl2)
@test GLM.dispersion(mdl1.model, true) GLM.dispersion(mdl2.model,true)
@test deviance(mdl1) deviance(mdl2)
@test loglikelihood(mdl1) loglikelihood(mdl2)
@test aic(mdl1) aic(mdl2)
@test predict(mdl1) predict(mdl2)
end
@testset "Check against equivalent linear model when dropcollinear = false" begin
mdl1 = lm(@formula(y ~ x1 + x2), data; dropcollinear=false)
mdl2 = glm(@formula(y ~ x1 + x2), data, Normal(), IdentityLink();
dropcollinear=false)

@test coef(mdl1) coef(mdl2)
@test stderror(mdl1) stderror(mdl2)
@test dof(mdl1) == dof(mdl2)
@test dof_residual(mdl1) == dof_residual(mdl2)
@test GLM.dispersion(mdl1.model, true) GLM.dispersion(mdl2.model,true)
@test deviance(mdl1) deviance(mdl2)
@test loglikelihood(mdl1) loglikelihood(mdl2)
@test aic(mdl1) aic(mdl2)
@test predict(mdl1) predict(mdl2)
end

@testset "Check normal with identity link against outputs from R" begin
mdl = glm(@formula(y ~ x1 + x2 + x3), data, Normal(), IdentityLink();
dropcollinear=true)
@test coef(mdl) [1.350439882697950, 1.740469208211143, 1.171554252199414, 0.0]
@test stderror(mdl)[1:3] [0.58371400875263, 0.10681694901238, 0.08531532203251]
@test dof(mdl) == 4
@test dof_residual(mdl) == 2
@test GLM.dispersion(mdl.model, true) 0.1341642228738996
@test deviance(mdl) 0.2683284457477991
@test loglikelihood(mdl) 0.2177608775670037
@test aic(mdl) 7.564478244866
@test predict(mdl) [14.17008797653959, 13.56744868035191, 24.04398826979472,
19.99413489736071, 11.22434017595308]
end

num_rows = 100
dfrm = DataFrame()
dfrm.x1 = randn(StableRNG(123), num_rows)
dfrm.x2 = randn(StableRNG(1234), num_rows)
dfrm.x3 = 2*dfrm.x1 + 3*dfrm.x2
dfrm.y = Int.(randn(StableRNG(12345), num_rows) .> 0)

@testset "Test Logistic Regression Outputs from R" begin
mdl = glm(@formula(y ~ x1 + x2 + x3), dfrm, Binomial(), LogitLink();
dropcollinear=true)
@test coef(mdl) [-0.1402582892604246, 0.1362176272953289, 0, -0.1134751362230204] atol = 1.0E-6
stderr = stderror(mdl)
@test isnan(stderr[3]) == true
@test vcat(stderr[1:2], stderr[4]) [0.20652049856206, 0.25292632684716, 0.07496476901643] atol = 1.0E-4
@test deviance(mdl) 135.68506068159
@test loglikelihood(mdl) -67.8425303407948
@test dof(mdl) == 3
@test dof_residual(mdl) == 98
@test aic(mdl) 141.68506068159
@test GLM.dispersion(mdl.model, true) 1
@test predict(mdl)[1:3] [0.4241893070433117, 0.3754516361306202, 0.6327877688720133] atol = 1.0E-6
@test confint(mdl)[1:2,1:2] [-0.5493329715011036 0.26350316142056085;
-0.3582545657827583 0.64313795309765587] atol = 1.0E-1
end

@testset "`rankdeficient` test case of lm in glm" begin
rng = StableRNG(1234321)
# an example of rank deficiency caused by a missing cell in a table
dfrm = DataFrame([categorical(repeat(string.('A':'D'), inner = 6)),
categorical(repeat(string.('a':'c'), inner = 2, outer = 4))],
[:G, :H])
f = @formula(0 ~ 1 + G*H)
X = ModelMatrix(ModelFrame(f, dfrm)).m
y = X * (1:size(X, 2)) + 0.1 * randn(rng, size(X, 1))
inds = deleteat!(collect(1:length(y)), 7:8)
m1 = fit(GeneralizedLinearModel, X, y, Normal())
@test isapprox(deviance(m1), 0.12160301538297297)
Xmissingcell = X[inds, :]
ymissingcell = y[inds]
@test_throws PosDefException m2 = glm(Xmissingcell, ymissingcell, Normal();
dropcollinear=false)
m2p = glm(Xmissingcell, ymissingcell, Normal(); dropcollinear=true)
@test isa(m2p.pp.chol, CholeskyPivoted)
@test rank(m2p.pp.chol) == 11
@test isapprox(deviance(m2p), 0.1215758392280204)
@test isapprox(coef(m2p), [0.9772643585228885, 8.903341608496437, 3.027347397503281,
3.9661379199401257, 5.079410103608552, 6.1944618141188625, 0.0, 7.930328728005131,
8.879994918604757, 2.986388408421915, 10.84972230524356, 11.844809275711485])
@test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:])

m2p_dep_pos = glm(Xmissingcell, ymissingcell, Normal())
@test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " *
"argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true)
@test isa(m2p_dep_pos.pp.chol, CholeskyPivoted)
@test rank(m2p_dep_pos.pp.chol) == rank(m2p.pp.chol)
@test isapprox(deviance(m2p_dep_pos), deviance(m2p))
@test isapprox(coef(m2p_dep_pos), coef(m2p))
end

@testset "`rankdeficient` test in GLM with Gamma distribution" begin
rng = StableRNG(1234321)
# an example of rank deficiency caused by a missing cell in a table
dfrm = DataFrame([categorical(repeat(string.('A':'D'), inner = 6)),
categorical(repeat(string.('a':'c'), inner = 2, outer = 4))],
[:G, :H])
f = @formula(0 ~ 1 + G*H)
X = ModelMatrix(ModelFrame(f, dfrm)).m
y = X * (1:size(X, 2)) + 0.1 * randn(rng, size(X, 1))
inds = deleteat!(collect(1:length(y)), 7:8)
m1 = fit(GeneralizedLinearModel, X, y, Gamma())
@test isapprox(deviance(m1), 0.0407069934950098)
Xmissingcell = X[inds, :]
ymissingcell = y[inds]
@test_throws PosDefException glm(Xmissingcell, ymissingcell, Gamma(); dropcollinear=false)
m2p = glm(Xmissingcell, ymissingcell, Gamma(); dropcollinear=true)
@test isa(m2p.pp.chol, CholeskyPivoted)
@test rank(m2p.pp.chol) == 11
@test isapprox(deviance(m2p), 0.04070377141288433)
@test isapprox(coef(m2p), [ 1.0232644374837732, -0.0982622592717195, -0.7735523403010212,
-0.820974608805111, -0.8581573302333557, -0.8838279927663583, 0.0, 0.667219148331652,
0.7087696966674913, 0.011287703617517712, 0.6816245514668273, 0.7250492032072612])
@test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:])

m2p_dep_pos = fit(GeneralizedLinearModel, Xmissingcell, ymissingcell, Gamma())
@test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " *
"argument `dropcollinear` instead. Proceeding with positional argument value: true") fit(LinearModel, Xmissingcell, ymissingcell, true)
@test isa(m2p_dep_pos.pp.chol, CholeskyPivoted)
@test rank(m2p_dep_pos.pp.chol) == rank(m2p.pp.chol)
@test isapprox(deviance(m2p_dep_pos), deviance(m2p))
@test isapprox(coef(m2p_dep_pos), coef(m2p))
end
end

0 comments on commit 1459737

Please sign in to comment.