join: try harder to preserve non-missingness

JuliaData · Dec 13, 2017 · 76044d5
1 parent 9500505
commit 76044d5
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 31 deletions.
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -85,27 +85,53 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
     @assert nrow == length(all_orig_right_ixs) + loil
     ncleft = ncol(joiner.dfl)
     cols = Vector{Any}(ncleft + ncol(dfr_noon))
-    _similar = kind == :inner ? similar : similar_missing
+    # inner and left joins preserve non-missingness of the left frame
+    # it is also preserved if all right rows have left matches
+    _similar_left = kind == :inner || kind == :left || length(rightonly_ixs.join) == 0 ? similar : similar_missing
     for (i, col) in enumerate(columns(joiner.dfl))
-        cols[i] = _similar(col, nrow)
-        copy!(cols[i], view(col, all_orig_left_ixs))
+        on_col_ix = findfirst(joiner.left_on, names(joiner.dfl)[i])
+        if on_col_ix > 0 && kind == :right
+            # if right join, construct the on-column
+            # using the right frame to preserve missingness and cat.levels
+            rcol = joiner.dfr_on[on_col_ix]
+            cols[i] = similar(rcol, nrow)
+            copy!(cols[i], view(rcol, all_orig_right_ixs))
+            permute!(cols[i], right_perm)
+        else
+            cols[i] = _similar_left(col, nrow)
+            copy!(cols[i], view(col, all_orig_left_ixs))
+        end
     end
+    # inner and right joins preserve non-missingness of the right frame
+    # it is also preserved if all left rows have right matches
+    _similar_right = kind == :inner || kind == :right || length(leftonly_ixs.join) == 0 ? similar : similar_missing
     for (i, col) in enumerate(columns(dfr_noon))
-        cols[i+ncleft] = _similar(col, nrow)
+        cols[i+ncleft] = _similar_right(col, nrow)
         copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
     res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
 
-    if length(rightonly_ixs.join) > 0
+    if length(rightonly_ixs.join) > 0 && kind != :right
         # some left rows are missings, so the values of the "on" columns
-        # need to be taken from the right
+        # need to be taken from the right (unless already done when processing the left frame)
         for (on_col_ix, on_col) in enumerate(joiner.left_on)
             # fix the result of the rightjoin by taking the nonmissing values from the right table
             offset = nrow - length(rightonly_ixs.orig) + 1
             copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
         end
     end
+    if kind == :outer && !isempty(rightonly_ixs.join)
+        # some non-missing on-columns may have become missing
+        # when constructing res, because there are both left-only and right-only rows
+        for (on_col_ix, on_col) in enumerate(joiner.left_on)
+            LT = eltype(joiner.dfl_on[on_col_ix])
+            RT = eltype(joiner.dfr_on[on_col_ix])
+            if Missings.T(LT) === LT && Missings.T(RT) === RT
+                res[on_col] = disallowmissing(res[on_col])
+            end
+        end
+    end
     return res
 end
 

diff --git a/test/join.jl b/test/join.jl
@@ -186,46 +186,43 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 fid_1 = [1, 3, missing])
         @test typeof.(l(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Float64}, Vector{Union{Float64, Missing}}]
         @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
                                 fid = [1, 3, missing, missing, missing],
                                 fid_1 = [1, 3, 0, 2, 4])
         @test typeof.(r(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Float64}]
         @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
                                 fid = [1, 3, 5, missing, missing, missing],
                                 fid_1 = [1, 3, missing, 0, 2, 4])
         @test typeof.(o(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
 
         on = :fid
         @test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
         @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}]
         @test l(on) ≅ DataFrame(id = [1, 3, 5],
                                 fid = [1, 3, 5],
                                 id_1 = [1, 3, missing])
-        @test typeof.(l(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Union{Int, Missing}}]
         @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
                                 fid = [1, 3, 0, 2, 4],
                                 id_1 = [1, 3, 0, 2, 4])
-        @test typeof.(r(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Int}]
         @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
                                 fid = [1, 3, 5, 0, 2, 4],
                                 id_1 = [1, 3, missing, 0, 2, 4])
-        @test typeof.(o(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Union{Int, Missing}}]
 
         on = [:id, :fid]
         @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
         @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}]
         @test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5])
-        @test typeof.(l(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}]
         @test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4])
-        @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(r(on).columns) == [Vector{Int}, Vector{Float64}]
         @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-        @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(o(on).columns) == [Vector{Int}, Vector{Float64}]
     end
 
     @testset "all joins with CategoricalArrays" begin
@@ -276,17 +273,17 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 fid_1 = [1, 3, missing])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Float64,Union{Float64, Missing})]))
         @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
                                 fid = [1, 3, missing, missing, missing],
                                 fid_1 = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Union{Float64, Missing},Float64)]))
         @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
                                 fid = [1, 3, 5, missing, missing, missing],
                                 fid_1 = [1, 3, missing, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Union{Float64,Missing},Union{Float64, Missing})]))
 
         on = :fid
         @test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
@@ -296,17 +293,17 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 id_1 = [1, 3, missing])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Int, Float64, Union{Int, Missing})]))
         @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
                                 fid = [1, 3, 0, 2, 4],
                                 id_1 = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Int)]))
         @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
                                 fid = [1, 3, 5, 0, 2, 4],
                                 id_1 = [1, 3, missing, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Union{Int, Missing})]))
 
         on = [:id, :fid]
         @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
@@ -315,15 +312,15 @@ module TestJoin
         @test l(on) == DataFrame(id = [1, 3, 5],
                                  fid = [1, 3, 5])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
         @test r(on) == DataFrame(id = [1, 3, 0, 2, 4],
                                  fid = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
         @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4],
                                  fid = [1, 3, 5, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
     end
 
     @testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
@@ -346,8 +343,8 @@ module TestJoin
         @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
         @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
-        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
-        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "d", "c"]
+        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["a", "b", "c"]
+        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "d", "c"]
         @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]
@@ -364,8 +361,8 @@ module TestJoin
         @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
         @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
-        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
-        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "c", "d"]
+        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["a", "b", "c"]
+        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
         @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "c", "d"]
         @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]