From d01bd20745455cffa37704a1e9a21bbd50e1cea7 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Thu, 14 Dec 2017 19:03:00 +0100 Subject: [PATCH 1/2] join: try harder to preserve non-missingness - left and right joins preserve non-missingness of the left and right frame columns, respectively - non-missingness of the on-columns is preserved (if non-missing in both frames) --- src/abstractdataframe/join.jl | 24 +++++++++++++++++---- test/join.jl | 39 ++++++++++++++++------------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 913dd68110..effcd3e4ec 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -85,20 +85,23 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, @assert nrow == length(all_orig_right_ixs) + loil ncleft = ncol(joiner.dfl) cols = Vector{Any}(ncleft + ncol(dfr_noon)) - _similar = kind == :inner ? similar : similar_missing + # inner and left joins preserve non-missingness of the left frame + _similar_left = kind == :inner || kind == :left ? similar : similar_missing for (i, col) in enumerate(columns(joiner.dfl)) - cols[i] = _similar(col, nrow) + cols[i] = _similar_left(col, nrow) copy!(cols[i], view(col, all_orig_left_ixs)) end + # inner and right joins preserve non-missingness of the right frame + _similar_right = kind == :inner || kind == :right ? 
similar : similar_missing for (i, col) in enumerate(columns(dfr_noon)) - cols[i+ncleft] = _similar(col, nrow) + cols[i+ncleft] = _similar_right(col, nrow) copy!(cols[i+ncleft], view(col, all_orig_right_ixs)) permute!(cols[i+ncleft], right_perm) end res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon))) if length(rightonly_ixs.join) > 0 - # some left rows are missings, so the values of the "on" columns + # some left rows are missing, so the values of the "on" columns # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.left_on) # fix the result of the rightjoin by taking the nonmissing values from the right table @@ -106,6 +109,19 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig)) end end + if kind ∈ (:right, :outer) && !isempty(rightonly_ixs.join) + # At this point on-columns of the result allow missing values, because + # right-only rows were filled with missing values when processing joiner.dfl. + # However, when the right on-column (plus the left one for the outer join) + # does not allow missing values, the result should also disallow them. 
+ for (on_col_ix, on_col) in enumerate(joiner.left_on) + LT = eltype(joiner.dfl_on[on_col_ix]) + RT = eltype(joiner.dfr_on[on_col_ix]) + if !(RT >: Missing) && (kind == :right || !(LT >: Missing)) + res[on_col] = disallowmissing(res[on_col]) + end + end + end return res end diff --git a/test/join.jl b/test/join.jl index 59412d762d..0b6ccb6bde 100644 --- a/test/join.jl +++ b/test/join.jl @@ -186,17 +186,17 @@ module TestJoin fid = [1, 3, 5], fid_1 = [1, 3, missing]) @test typeof.(l(on).columns) == - [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)] + [Vector{Int}, Vector{Float64}, Vector{Union{Float64, Missing}}] @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, missing, missing, missing], fid_1 = [1, 3, 0, 2, 4]) @test typeof.(r(on).columns) == - [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)] + [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Float64}] @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4]) @test typeof.(o(on).columns) == - [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)] + [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}] on = :fid @test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) @@ -204,28 +204,25 @@ module TestJoin @test l(on) ≅ DataFrame(id = [1, 3, 5], fid = [1, 3, 5], id_1 = [1, 3, missing]) - @test typeof.(l(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)] + @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Union{Int, Missing}}] @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing], fid = [1, 3, 0, 2, 4], id_1 = [1, 3, 0, 2, 4]) - @test typeof.(r(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)] + @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Int}] @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, 
missing, 0, 2, 4]) - @test typeof.(o(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)] + @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Union{Int, Missing}}] on = [:id, :fid] @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid]) @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}] @test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5]) - @test typeof.(l(on).columns) == [Vector{Union{Int, Missing}}, - Vector{Union{Float64, Missing}}] + @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}] @test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4]) - @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, - Vector{Union{Float64, Missing}}] + @test typeof.(r(on).columns) == [Vector{Int}, Vector{Float64}] @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4]) - @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, - Vector{Union{Float64, Missing}}] + @test typeof.(o(on).columns) == [Vector{Int}, Vector{Float64}] end @testset "all joins with CategoricalArrays" begin @@ -276,17 +273,17 @@ module TestJoin fid = [1, 3, 5], fid_1 = [1, 3, missing]) @test all(isa.(l(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)])) + [CategoricalVector{T} for T in (Int,Float64,Union{Float64, Missing})])) @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, missing, missing, missing], fid_1 = [1, 3, 0, 2, 4]) @test all(isa.(r(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)])) + [CategoricalVector{T} for T in (Int,Union{Float64, Missing},Float64)])) @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, missing, missing, missing], fid_1 = [1, 3, missing, 0, 2, 4]) @test all(isa.(o(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)])) + [CategoricalVector{T} for T in (Int,Union{Float64,Missing},Union{Float64, Missing})])) on = :fid @test i(on) == 
DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1]) @@ -296,17 +293,17 @@ module TestJoin fid = [1, 3, 5], id_1 = [1, 3, missing]) @test all(isa.(l(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)])) + [CategoricalVector{T} for T in (Int, Float64, Union{Int, Missing})])) @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing], fid = [1, 3, 0, 2, 4], id_1 = [1, 3, 0, 2, 4]) @test all(isa.(r(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)])) + [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Int)])) @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing], fid = [1, 3, 5, 0, 2, 4], id_1 = [1, 3, missing, 0, 2, 4]) @test all(isa.(o(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)])) + [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Union{Int, Missing})])) on = [:id, :fid] @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid]) @@ -315,15 +312,15 @@ module TestJoin @test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5]) @test all(isa.(l(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)])) + [CategoricalVector{T} for T in (Int, Float64)])) @test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4]) @test all(isa.(r(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)])) + [CategoricalVector{T} for T in (Int, Float64)])) @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4]) @test all(isa.(o(on).columns, - [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)])) + [CategoricalVector{T} for T in (Int, Float64)])) end @testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin From c6bc53b675630fc1b1a0d9d76c7c11a6cdde1b85 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Thu, 14 Dec 2017 17:21:53 +0100 Subject: [PATCH 2/2] add join() test for frames with Any column --- test/join.jl | 34 
++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/test/join.jl b/test/join.jl index 0b6ccb6bde..e454950037 100644 --- a/test/join.jl +++ b/test/join.jl @@ -462,4 +462,38 @@ module TestJoin @test join(left, right, on = [:id => :ID, :sid => :SID], kind=:anti) == DataFrame(id = 1:2, sid = string.(1:2)) end + + @testset "join with a column of type Any" begin + l = DataFrame(a=Any[1:7;], b=[1:7;]) + r = DataFrame(a=Any[3:10;], b=[3:10;]) + + # join by :a and :b (Any is the on-column) + @test join(l, r, on=[:a, :b], kind=:inner) ≅ DataFrame(a=Any[3:7;], b=3:7) + @test eltypes(join(l, r, on=[:a, :b], kind=:inner)) == [Any, Int] + + @test join(l, r, on=[:a, :b], kind=:left) ≅ DataFrame(a=Any[1:7;], b=1:7) + @test eltypes(join(l, r, on=[:a, :b], kind=:left)) == [Any, Int] + + @test join(l, r, on=[:a, :b], kind=:right) ≅ DataFrame(a=Any[3:10;], b=3:10) + @test eltypes(join(l, r, on=[:a, :b], kind=:right)) == [Any, Int] + + @test join(l, r, on=[:a, :b], kind=:outer) ≅ DataFrame(a=Any[1:10;], b=1:10) + @test eltypes(join(l, r, on=[:a, :b], kind=:outer)) == [Any, Int] + + # join by :b (Any is not on-column) + @test join(l, r, on=:b, kind=:inner) ≅ DataFrame(a=Any[3:7;], b=3:7, a_1=Any[3:7;]) + @test eltypes(join(l, r, on=:b, kind=:inner)) == [Any, Int, Any] + + @test join(l, r, on=:b, kind=:left) ≅ + DataFrame(a=Any[1:7;], b=1:7, a_1=[fill(missing, 2); 3:7;]) + @test eltypes(join(l, r, on=:b, kind=:left)) == [Any, Int, Any] + + @test join(l, r, on=:b, kind=:right) ≅ + DataFrame(a=[3:7; fill(missing, 3)], b=3:10, a_1=Any[3:10;]) + @test eltypes(join(l, r, on=:b, kind=:right)) == [Any, Int, Any] + + @test join(l, r, on=:b, kind=:outer) ≅ + DataFrame(a=[1:7; fill(missing, 3)], b=1:10, a_1=[fill(missing, 2); 3:10;]) + @test eltypes(join(l, r, on=:b, kind=:outer)) == [Any, Int, Any] + end end