JuliaData · nalimilan · Dec 18, 2017 · Dec 14, 2017 · Dec 14, 2017
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -85,27 +85,43 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
     @assert nrow == length(all_orig_right_ixs) + loil
     ncleft = ncol(joiner.dfl)
     cols = Vector{Any}(ncleft + ncol(dfr_noon))
-    _similar = kind == :inner ? similar : similar_missing
+    # inner and left joins preserve non-missingness of the left frame
+    _similar_left = kind == :inner || kind == :left ? similar : similar_missing
     for (i, col) in enumerate(columns(joiner.dfl))
-        cols[i] = _similar(col, nrow)
+        cols[i] = _similar_left(col, nrow)
         copy!(cols[i], view(col, all_orig_left_ixs))
     end
+    # inner and right joins preserve non-missingness of the right frame
+    _similar_right = kind == :inner || kind == :right ? similar : similar_missing
     for (i, col) in enumerate(columns(dfr_noon))
-        cols[i+ncleft] = _similar(col, nrow)
+        cols[i+ncleft] = _similar_right(col, nrow)
         copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
     res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
 
     if length(rightonly_ixs.join) > 0
-        # some left rows are missings, so the values of the "on" columns
+        # some left rows are missing, so the values of the "on" columns
         # need to be taken from the right
         for (on_col_ix, on_col) in enumerate(joiner.left_on)
             # fix the result of the rightjoin by taking the nonmissing values from the right table
             offset = nrow - length(rightonly_ixs.orig) + 1
             copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
         end
     end
+    if kind ∈ (:right, :outer) && !isempty(rightonly_ixs.join)
+        # At this point on-columns of the result allow missing values, because
+        # right-only rows were filled with missing values when processing joiner.dfl.
+        # However, when the right on-column (plus the left one for the outer join)
+        # does not allow missing values, the result should also disallow them.
+        for (on_col_ix, on_col) in enumerate(joiner.left_on)
+            LT = eltype(joiner.dfl_on[on_col_ix])
+            RT = eltype(joiner.dfr_on[on_col_ix])
+            if !(RT >: Missing) && (kind == :right || !(LT >: Missing))
+                res[on_col] = disallowmissing(res[on_col])
+            end
+        end
+    end
     return res
 end
 

diff --git a/test/join.jl b/test/join.jl
@@ -186,46 +186,43 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 fid_1 = [1, 3, missing])
         @test typeof.(l(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Float64}, Vector{Union{Float64, Missing}}]
         @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
                                 fid = [1, 3, missing, missing, missing],
                                 fid_1 = [1, 3, 0, 2, 4])
         @test typeof.(r(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Float64}]
         @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
                                 fid = [1, 3, 5, missing, missing, missing],
                                 fid_1 = [1, 3, missing, 0, 2, 4])
         @test typeof.(o(on).columns) ==
-            [Vector{Union{T, Missing}} for T in (Int, Float64, Float64)]
+            [Vector{Int}, Vector{Union{Float64, Missing}}, Vector{Union{Float64, Missing}}]
 
         on = :fid
         @test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
         @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Int}]
         @test l(on) ≅ DataFrame(id = [1, 3, 5],
                                 fid = [1, 3, 5],
                                 id_1 = [1, 3, missing])
-        @test typeof.(l(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}, Vector{Union{Int, Missing}}]
         @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
                                 fid = [1, 3, 0, 2, 4],
                                 id_1 = [1, 3, 0, 2, 4])
-        @test typeof.(r(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Int}]
         @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
                                 fid = [1, 3, 5, 0, 2, 4],
                                 id_1 = [1, 3, missing, 0, 2, 4])
-        @test typeof.(o(on).columns) == [Vector{Union{T, Missing}} for T in (Int,Float64,Int)]
+        @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}}, Vector{Float64}, Vector{Union{Int, Missing}}]
 
         on = [:id, :fid]
         @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
         @test typeof.(i(on).columns) == [Vector{Int}, Vector{Float64}]
         @test l(on) == DataFrame(id = [1, 3, 5], fid = [1, 3, 5])
-        @test typeof.(l(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(l(on).columns) == [Vector{Int}, Vector{Float64}]
         @test r(on) == DataFrame(id = [1, 3, 0, 2, 4], fid = [1, 3, 0, 2, 4])
-        @test typeof.(r(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(r(on).columns) == [Vector{Int}, Vector{Float64}]
         @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4], fid = [1, 3, 5, 0, 2, 4])
-        @test typeof.(o(on).columns) == [Vector{Union{Int, Missing}},
-                                         Vector{Union{Float64, Missing}}]
+        @test typeof.(o(on).columns) == [Vector{Int}, Vector{Float64}]
     end
 
     @testset "all joins with CategoricalArrays" begin
@@ -276,17 +273,17 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 fid_1 = [1, 3, missing])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Float64,Union{Float64, Missing})]))
         @test r(on) ≅ DataFrame(id = [1, 3, 0, 2, 4],
                                 fid = [1, 3, missing, missing, missing],
                                 fid_1 = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Union{Float64, Missing},Float64)]))
         @test o(on) ≅ DataFrame(id = [1, 3, 5, 0, 2, 4],
                                 fid = [1, 3, 5, missing, missing, missing],
                                 fid_1 = [1, 3, missing, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int,Float64,Float64)]))
+                       [CategoricalVector{T} for T in (Int,Union{Float64,Missing},Union{Float64, Missing})]))
 
         on = :fid
         @test i(on) == DataFrame(Any[[1, 3], [1.0, 3.0], [1, 3]], [:id, :fid, :id_1])
@@ -296,17 +293,17 @@ module TestJoin
                                 fid = [1, 3, 5],
                                 id_1 = [1, 3, missing])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Int, Float64, Union{Int, Missing})]))
         @test r(on) ≅ DataFrame(id = [1, 3, missing, missing, missing],
                                 fid = [1, 3, 0, 2, 4],
                                 id_1 = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Int)]))
         @test o(on) ≅ DataFrame(id = [1, 3, 5, missing, missing, missing],
                                 fid = [1, 3, 5, 0, 2, 4],
                                 id_1 = [1, 3, missing, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64, Int)]))
+                       [CategoricalVector{T} for T in (Union{Int, Missing}, Float64, Union{Int, Missing})]))
 
         on = [:id, :fid]
         @test i(on) == DataFrame(Any[[1, 3], [1, 3]], [:id, :fid])
@@ -315,15 +312,15 @@ module TestJoin
         @test l(on) == DataFrame(id = [1, 3, 5],
                                  fid = [1, 3, 5])
         @test all(isa.(l(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
         @test r(on) == DataFrame(id = [1, 3, 0, 2, 4],
                                  fid = [1, 3, 0, 2, 4])
         @test all(isa.(r(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
         @test o(on) == DataFrame(id = [1, 3, 5, 0, 2, 4],
                                  fid = [1, 3, 5, 0, 2, 4])
         @test all(isa.(o(on).columns,
-                       [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
+                       [CategoricalVector{T} for T in (Int, Float64)]))
     end
 
     @testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
@@ -465,4 +462,38 @@ module TestJoin
         @test join(left, right, on = [:id => :ID, :sid => :SID], kind=:anti) ==
             DataFrame(id = 1:2, sid = string.(1:2))
     end
+
+    @testset "join with a column of type Any" begin
+        l = DataFrame(a=Any[1:7;], b=[1:7;])
+        r = DataFrame(a=Any[3:10;], b=[3:10;])
+
+        # join on :a and :b (the Any-typed column :a is one of the on-columns)
+        @test join(l, r, on=[:a, :b], kind=:inner) ≅ DataFrame(a=Any[3:7;], b=3:7)
+        @test eltypes(join(l, r, on=[:a, :b], kind=:inner)) == [Any, Int]
+
+        @test join(l, r, on=[:a, :b], kind=:left) ≅ DataFrame(a=Any[1:7;], b=1:7)
+        @test eltypes(join(l, r, on=[:a, :b], kind=:left)) == [Any, Int]
+
+        @test join(l, r, on=[:a, :b], kind=:right) ≅ DataFrame(a=Any[3:10;], b=3:10)
+        @test eltypes(join(l, r, on=[:a, :b], kind=:right)) == [Any, Int]
+
+        @test join(l, r, on=[:a, :b], kind=:outer) ≅ DataFrame(a=Any[1:10;], b=1:10)
+        @test eltypes(join(l, r, on=[:a, :b], kind=:outer)) == [Any, Int]
+
+        # join on :b only (the Any-typed :a columns are not on-columns)
+        @test join(l, r, on=:b, kind=:inner) ≅ DataFrame(a=Any[3:7;], b=3:7, a_1=Any[3:7;])
+        @test eltypes(join(l, r, on=:b, kind=:inner)) == [Any, Int, Any]
+
+        @test join(l, r, on=:b, kind=:left) ≅
+            DataFrame(a=Any[1:7;], b=1:7, a_1=[fill(missing, 2); 3:7;])
+        @test eltypes(join(l, r, on=:b, kind=:left)) == [Any, Int, Any]
+
+        @test join(l, r, on=:b, kind=:right) ≅
+            DataFrame(a=[3:7; fill(missing, 3)], b=3:10, a_1=Any[3:10;])
+        @test eltypes(join(l, r, on=:b, kind=:right)) == [Any, Int, Any]
+
+        @test join(l, r, on=:b, kind=:outer) ≅
+            DataFrame(a=[1:7; fill(missing, 3)], b=1:10, a_1=[fill(missing, 2); 3:10;])
+        @test eltypes(join(l, r, on=:b, kind=:outer)) == [Any, Int, Any]
+    end
 end