Skip to content

Commit

Permalink
join: try harder to preserve non-missingness
Browse files Browse the repository at this point in the history
  • Loading branch information
alyst committed Dec 13, 2017
1 parent 9500505 commit 76044d5
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 31 deletions.
38 changes: 32 additions & 6 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,27 +85,53 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
@assert nrow == length(all_orig_right_ixs) + loil
ncleft = ncol(joiner.dfl)
cols = Vector{Any}(ncleft + ncol(dfr_noon))
_similar = kind == :inner ? similar : similar_missing
# inner and left joins preserve non-missingness of the left frame
# it is also preserved if all right rows have left matches
_similar_left = kind == :inner || kind == :left || length(rightonly_ixs.join) == 0 ? similar : similar_missing
for (i, col) in enumerate(columns(joiner.dfl))
cols[i] = _similar(col, nrow)
copy!(cols[i], view(col, all_orig_left_ixs))
on_col_ix = findfirst(joiner.left_on, names(joiner.dfl)[i])
if on_col_ix > 0 && kind == :right
# if right join, construct the on-column
# using the right frame to preserve missingness and cat.levels
rcol = joiner.dfr_on[on_col_ix]
cols[i] = similar(rcol, nrow)
copy!(cols[i], view(rcol, all_orig_right_ixs))
permute!(cols[i], right_perm)
else
cols[i] = _similar_left(col, nrow)
copy!(cols[i], view(col, all_orig_left_ixs))
end
end
# inner and right joins preserve non-missingness of the right frame
# it is also preserved if all left rows have right matches
_similar_right = kind == :inner || kind == :right || length(leftonly_ixs.join) == 0 ? similar : similar_missing
for (i, col) in enumerate(columns(dfr_noon))
cols[i+ncleft] = _similar(col, nrow)
cols[i+ncleft] = _similar_right(col, nrow)
copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))

if length(rightonly_ixs.join) > 0
if length(rightonly_ixs.join) > 0 && kind != :right
# some left rows are missing, so the values of the "on" columns
# need to be taken from the right
# need to be taken from the right (unless already done when processing the left frame)
for (on_col_ix, on_col) in enumerate(joiner.left_on)
# fix the result of the rightjoin by taking the nonmissing values from the right table
offset = nrow - length(rightonly_ixs.orig) + 1
copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
end
end
if kind == :outer && !isempty(rightonly_ixs.join)
# some non-missing on-columns may have become missing
# when constructing res, because there are both left-only and right-only rows
for (on_col_ix, on_col) in enumerate(joiner.left_on)
LT = eltype(joiner.dfl_on[on_col_ix])
RT = eltype(joiner.dfr_on[on_col_ix])
if Missings.T(LT) === LT && Missings.T(RT) === RT
res[on_col] = disallowmissing(res[on_col])
end
end
end
return res
end

Expand Down
47 changes: 22 additions & 25 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,46 +186,43 @@ module TestJoin
fid = [1, 3, 5],
fid_1 = [1, 3, missing])
@test typeof.(l(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Float64}, Vector{Union{Float64, Missing}}]
@test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, missing, missing, missing],
fid_1 = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Float64}]
@test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, missing, missing, missing],
fid_1 = [1, 3, missing, 0, 2, 4])
@test typeof.(o(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]

on = :fid
@test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
@test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}]
@test l(on) ≅ DataFrame(id = [1, 3, 5],
fid = [1, 3, 5],
id_1 = [1, 3, missing])
@test typeof.(l(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Union{Int, Missing}}]
@test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
fid = [1, 3, 0, 2, 4],
id_1 = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Int}]
@test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
fid = [1, 3, 5, 0, 2, 4],
id_1 = [1, 3, missing, 0, 2, 4])
@test typeof.(o(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Union{Int, Missing}}]

on = [:id, :fid]
@test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
@test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}]
@test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5])
@test typeof.(l(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}]
@test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(r(on).columns) == [Vector{Int}, Vector{Float64}]
@test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
@test typeof.(o(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(o(on).columns) == [Vector{Int}, Vector{Float64}]
end

@testset "all joins with CategoricalArrays" begin
Expand Down Expand Up @@ -276,17 +273,17 @@ module TestJoin
fid = [1, 3, 5],
fid_1 = [1, 3, missing])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Float64,Union{Float64, Missing})]))
@test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, missing, missing, missing],
fid_1 = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Union{Float64, Missing},Float64)]))
@test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, missing, missing, missing],
fid_1 = [1, 3, missing, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Union{Float64,Missing},Union{Float64, Missing})]))

on = :fid
@test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
Expand All @@ -296,17 +293,17 @@ module TestJoin
fid = [1, 3, 5],
id_1 = [1, 3, missing])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Int, Float64, Union{Int, Missing})]))
@test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
fid = [1, 3, 0, 2, 4],
id_1 = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Int)]))
@test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
fid = [1, 3, 5, 0, 2, 4],
id_1 = [1, 3, missing, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Union{Int, Missing})]))

on = [:id, :fid]
@test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
Expand All @@ -315,15 +312,15 @@ module TestJoin
@test l(on) == DataFrame(id = [1, 3, 5],
fid = [1, 3, 5])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
@test r(on) == DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
@test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
end

@testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
Expand All @@ -346,8 +343,8 @@ module TestJoin
@test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
@test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
@test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "d", "c"]
@test levels(join(A, B, on=:b, kind=:right)[:b]) == ["a", "b", "c"]
@test levels(join(B, A, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "d", "c"]
@test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
@test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]
Expand All @@ -364,8 +361,8 @@ module TestJoin
@test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
@test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
@test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "c", "d"]
@test levels(join(A, B, on=:b, kind=:right)[:b]) == ["a", "b", "c"]
@test levels(join(B, A, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
@test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
@test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "c", "d"]
@test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]
Expand Down

0 comments on commit 76044d5

Please sign in to comment.