Skip to content

Commit

Permalink
Merge pull request #1316 from alyst/join_coltypes
Browse files Browse the repository at this point in the history
Preserve non-missingness during non-inner joins
  • Loading branch information
nalimilan authored Dec 18, 2017
2 parents 740978e + c6bc53b commit 6a1fa20
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 25 deletions.
24 changes: 20 additions & 4 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,27 +85,43 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
@assert nrow == length(all_orig_right_ixs) + loil
ncleft = ncol(joiner.dfl)
cols = Vector{Any}(ncleft + ncol(dfr_noon))
_similar = kind == :inner ? similar : similar_missing
# inner and left joins preserve non-missingness of the left frame
_similar_left = kind == :inner || kind == :left ? similar : similar_missing
for (i, col) in enumerate(columns(joiner.dfl))
cols[i] = _similar(col, nrow)
cols[i] = _similar_left(col, nrow)
copy!(cols[i], view(col, all_orig_left_ixs))
end
# inner and right joins preserve non-missingness of the right frame
_similar_right = kind == :inner || kind == :right ? similar : similar_missing
for (i, col) in enumerate(columns(dfr_noon))
cols[i+ncleft] = _similar(col, nrow)
cols[i+ncleft] = _similar_right(col, nrow)
copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))

if length(rightonly_ixs.join) > 0
# some left rows are missings, so the values of the "on" columns
# some left rows are missing, so the values of the "on" columns
# need to be taken from the right
for (on_col_ix, on_col) in enumerate(joiner.left_on)
# fix the result of the rightjoin by taking the nonmissing values from the right table
offset = nrow - length(rightonly_ixs.orig) + 1
copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
end
end
if kind ∈ (:right, :outer) && !isempty(rightonly_ixs.join)
# At this point on-columns of the result allow missing values, because
# right-only rows were filled with missing values when processing joiner.dfl
# However, when the right on-column (plus the left one for the outer join)
# does not allow missing values, the result should also disallow them.
for (on_col_ix, on_col) in enumerate(joiner.left_on)
LT = eltype(joiner.dfl_on[on_col_ix])
RT = eltype(joiner.dfr_on[on_col_ix])
if !(RT >: Missing) && (kind == :right || !(LT >: Missing))
res[on_col] = disallowmissing(res[on_col])
end
end
end
return res
end

Expand Down
73 changes: 52 additions & 21 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,46 +186,43 @@ module TestJoin
fid = [1, 3, 5],
fid_1 = [1, 3, missing])
@test typeof.(l(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Float64}, Vector{Union{Float64, Missing}}]
@test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, missing, missing, missing],
fid_1 = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Float64}]
@test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, missing, missing, missing],
fid_1 = [1, 3, missing, 0, 2, 4])
@test typeof.(o(on).columns) ==
[Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
[Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]

on = :fid
@test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
@test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}]
@test l(on) ≅ DataFrame(id = [1, 3, 5],
fid = [1, 3, 5],
id_1 = [1, 3, missing])
@test typeof.(l(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Union{Int, Missing}}]
@test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
fid = [1, 3, 0, 2, 4],
id_1 = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Int}]
@test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
fid = [1, 3, 5, 0, 2, 4],
id_1 = [1, 3, missing, 0, 2, 4])
@test typeof.(o(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
@test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Union{Int, Missing}}]

on = [:id, :fid]
@test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
@test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}]
@test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5])
@test typeof.(l(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}]
@test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4])
@test typeof.(r(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(r(on).columns) == [Vector{Int}, Vector{Float64}]
@test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
@test typeof.(o(on).columns) == [Vector{Union{Int, Missing}},
Vector{Union{Float64, Missing}}]
@test typeof.(o(on).columns) == [Vector{Int}, Vector{Float64}]
end

@testset "all joins with CategoricalArrays" begin
Expand Down Expand Up @@ -276,17 +273,17 @@ module TestJoin
fid = [1, 3, 5],
fid_1 = [1, 3, missing])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Float64,Union{Float64, Missing})]))
@test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, missing, missing, missing],
fid_1 = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Union{Float64, Missing},Float64)]))
@test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, missing, missing, missing],
fid_1 = [1, 3, missing, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
[CategoricalVector{T} for T in (Int,Union{Float64,Missing},Union{Float64, Missing})]))

on = :fid
@test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
Expand All @@ -296,17 +293,17 @@ module TestJoin
fid = [1, 3, 5],
id_1 = [1, 3, missing])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Int, Float64, Union{Int, Missing})]))
@test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
fid = [1, 3, 0, 2, 4],
id_1 = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Int)]))
@test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
fid = [1, 3, 5, 0, 2, 4],
id_1 = [1, 3, missing, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
[CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Union{Int, Missing})]))

on = [:id, :fid]
@test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
Expand All @@ -315,15 +312,15 @@ module TestJoin
@test l(on) == DataFrame(id = [1, 3, 5],
fid = [1, 3, 5])
@test all(isa.(l(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
@test r(on) == DataFrame(id = [1, 3, 0, 2, 4],
fid = [1, 3, 0, 2, 4])
@test all(isa.(r(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
@test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4],
fid = [1, 3, 5, 0, 2, 4])
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
[CategoricalVector{T} for T in (Int, Float64)]))
end

@testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
Expand Down Expand Up @@ -465,4 +462,38 @@ module TestJoin
@test join(left, right, on = [:id => :ID, :sid => :SID], kind=:anti) ==
DataFrame(id = 1:2, sid = string.(1:2))
end

@testset "join with a column of type Any" begin
    # Two frames that each carry an `Any`-typed column `a` next to an `Int`
    # column `b`. Every join kind is checked twice: once for the resulting
    # values and once for the resulting column element types.
    #
    # NOTE(review): `≅` is expected to be the comparison helper defined at the
    # top of this test module (presumably an alias of `isequal`, so that
    # `missing` entries compare as equal) — confirm it is in scope here.
    l = DataFrame(a=Any[1:7;], b=[1:7;])
    r = DataFrame(a=Any[3:10;], b=[3:10;])

    # join by :a and :b (Any is the on-column)
    @test join(l, r, on=[:a, :b], kind=:inner) ≅ DataFrame(a=Any[3:7;], b=3:7)
    @test eltypes(join(l, r, on=[:a, :b], kind=:inner)) == [Any, Int]

    @test join(l, r, on=[:a, :b], kind=:left) ≅ DataFrame(a=Any[1:7;], b=1:7)
    @test eltypes(join(l, r, on=[:a, :b], kind=:left)) == [Any, Int]

    @test join(l, r, on=[:a, :b], kind=:right) ≅ DataFrame(a=Any[3:10;], b=3:10)
    @test eltypes(join(l, r, on=[:a, :b], kind=:right)) == [Any, Int]

    @test join(l, r, on=[:a, :b], kind=:outer) ≅ DataFrame(a=Any[1:10;], b=1:10)
    @test eltypes(join(l, r, on=[:a, :b], kind=:outer)) == [Any, Int]

    # join by :b (Any is not on-column)
    # The trailing `≅` keeps each comparison a single expression across the
    # line break; without it the expected frame is parsed as a separate
    # statement and the `@test` would only see the join result.
    @test join(l, r, on=:b, kind=:inner) ≅
        DataFrame(a=Any[3:7;], b=3:7, a_1=Any[3:7;])
    @test eltypes(join(l, r, on=:b, kind=:inner)) == [Any, Int, Any]

    @test join(l, r, on=:b, kind=:left) ≅
        DataFrame(a=Any[1:7;], b=1:7, a_1=[fill(missing, 2); 3:7;])
    @test eltypes(join(l, r, on=:b, kind=:left)) == [Any, Int, Any]

    @test join(l, r, on=:b, kind=:right) ≅
        DataFrame(a=[3:7; fill(missing, 3)], b=3:10, a_1=Any[3:10;])
    @test eltypes(join(l, r, on=:b, kind=:right)) == [Any, Int, Any]

    @test join(l, r, on=:b, kind=:outer) ≅
        DataFrame(a=[1:7; fill(missing, 3)], b=1:10, a_1=[fill(missing, 2); 3:10;])
    @test eltypes(join(l, r, on=:b, kind=:outer)) == [Any, Int, Any]
end
end

0 comments on commit 6a1fa20

Please sign in to comment.