From 8c8d6e711176faf2cc525c58c285b17bd0690195 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 00:25:43 +0200 Subject: [PATCH 1/7] missing values for continuous don't lead to categorical --- src/schema.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/schema.jl b/src/schema.jl index d05b0ad7..73daa748 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -198,6 +198,8 @@ concrete_term(t::Term, x, hint::AbstractTerm) = hint concrete_term(t, d, hint) = t concrete_term(t::Term, xs::AbstractVector{<:Number}, ::Nothing) = concrete_term(t, xs, ContinuousTerm) +# and for missing values +concrete_term(t::Term, xs::AbstractVector{Union{Missing,T}} where T<:Number, ::Nothing) = concrete_term(t, xs, ContinuousTerm) function concrete_term(t::Term, xs::AbstractVector, ::Type{ContinuousTerm}) μ, σ2 = StatsBase.mean_and_var(xs) min, max = extrema(xs) From 17ca7cbb1878efca943d97772770a8141c04bcc5 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:33:30 +0200 Subject: [PATCH 2/7] local copy() that handles Missing but otherwise defaults to Base.copy() --- src/terms.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/terms.jl b/src/terms.jl index f9a89c17..0934c669 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -506,6 +506,13 @@ lazy_modelcols(x, d) = modelcols(x, d) +# this is weird, but using import Base: copy leads to exporting type piracy +# for non missing values, the compiler should hopefully optimize down the extra +# layer of indirection +function copy end +copy(x::Any) = Base.copy(x) +copy(m::Missing) = deepcopy(m) + modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :] From a59659d2dce33e1c61384147c476002e868d58c0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:36:29 +0200 Subject: [PATCH 3/7] don't bother with deepcopy for missing, just return the singleton --- src/terms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index 0934c669..50e4acc4 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -511,7 +511,7 @@ lazy_modelcols(x, d) = modelcols(x, d) # layer of indirection function copy end copy(x::Any) = Base.copy(x) -copy(m::Missing) = deepcopy(m) +copy(m::Missing) = m modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) From c43ae8c4c11df367df19b1b1c95f62f2eaab4634 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:49:53 +0200 Subject: [PATCH 4/7] tests for continuous terms with missing values --- test/terms.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/terms.jl b/test/terms.jl index b58900bc..a94b9e8f 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -31,6 +31,17 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = @test t0.min == 1.0 @test t0.max == 3.0 + vals0m = [3, missing, 1] + t0m = concrete_term(t, vals0m) + @test string(t0m) == "aaa" + @test mimestring(t0m) == "aaa(continuous)" + # compute all these values to make sure the behavior of terms matches + # the behavior of other relevant packages + @test isequal(t0m.mean, mean(vals0m)) + @test isequal(t0m.var, var(vals0m)) + @test isequal(t0m.min, min(vals0m...)) + @test isqual(t0m.max, max(vals0m...)) + t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} @test string(t1) == "aaa" From 51c5093bd249754f558001fb362a81afbd1ef917 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 10:57:13 +0200 Subject: [PATCH 5/7] typo --- test/terms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/terms.jl b/test/terms.jl index a94b9e8f..698a134c 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -40,7 +40,7 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = @test isequal(t0m.mean, mean(vals0m)) @test isequal(t0m.var, var(vals0m)) @test isequal(t0m.min, min(vals0m...)) - @test isqual(t0m.max, max(vals0m...)) + @test isequal(t0m.max, max(vals0m...)) t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} From 5cdeb25e52fa7a83ed6ffe9e81fbb57eac9beb85 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 25 Sep 2019 15:47:30 +0200 Subject: [PATCH 6/7] rework getindex for contrast matrices to handle missing values --- src/contrasts.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/contrasts.jl b/src/contrasts.jl index f4f03af8..960a8437 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -229,8 +229,13 @@ function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Intege levels[not_base] end -Base.getindex(contrasts::ContrastsMatrix, rowinds, colinds) = - getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) +function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} + # allow rows to be missing + rows = get.(Ref(contrasts.invindex), rowinds, missing) + # create a row of nothing but missings for missing values + mrow = reduce(vcat, [missing for c in getindex(contrasts.matrix, 1, colinds)]) + vcat([r === missing ? mrow : getindex(contrasts.matrix, r, colinds) for r in rows]) +end # Making a contrast type T only requires that there be a method for # contrasts_matrix(T, baseind, n) and optionally termnames(T, levels, baseind) From 967aee65b0cd8f89c69cd34ed5fa54ffd5ae5fb0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sat, 10 Jun 2023 13:12:08 -0500 Subject: [PATCH 7/7] omit missing data after schema application --- src/modelframe.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/modelframe.jl b/src/modelframe.jl index 13c59f16..cdebcf4b 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -78,11 +78,11 @@ function ModelFrame(f::FormulaTerm, data::ColumnTable; throw(ArgumentError(msg)) end - data, _ = missing_omit(data, f) - sch = schema(f, data, contrasts) f = apply_schema(f, sch, M) - + + data, _ = missing_omit(data, f) + ModelFrame(f, sch, data, model) end