Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

Commit

Permalink
adjust outer join behavior (types and right outer join bug)
Browse files Browse the repository at this point in the history
outer joins need to return nullable tables as they may introduce missing
data. similar_nullable on DataTables has been removed (unused) and
replaced with a similar_nullable that works on
NullableCategoricalArrays, and this change is made to support the new
changes to join. The 3 outer joins share a function with inner joins,
and this shared function (compose_joined_table) now performs a check to
see if the join type is :inner, and if so, it will return the same
column type as the parent table rather than promoting to a nullable
column. A bug was found in right-outer join behavior where the values
unique to the right table were added to the table in the incorrect
locations, overwriting data and leaving nulls where they shouldn't be.
This bug, due to incorrect values in rightonly_ixs.join, was fixed by
filling the last n-rows of the datatable where n =
length(rightonly_ixs.join). Tests were checked for accuracy against
pandas.
  • Loading branch information
cjprybol authored and nalimilan committed May 12, 2017
1 parent 776f293 commit 12443f4
Show file tree
Hide file tree
Showing 2 changed files with 254 additions and 38 deletions.
90 changes: 53 additions & 37 deletions src/abstractdatatable/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,10 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}})
similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
NullableArray{eltype(T)}(dims)

similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) =
similar_nullable{T}(dv::CategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
NullableCategoricalArray{T}(dims)

similar_nullable(dt::AbstractDataTable, dims::Int) =
DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt)))

similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) =
similar_nullable{T}(dv::NullableCategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
NullableCategoricalArray{T}(dims)

# helper structure for DataTables joining
Expand Down Expand Up @@ -47,55 +44,74 @@ Base.length(x::RowIndexMap) = length(x.orig)

# composes the joined data table using the maps between the left and right
# table rows and the indices of rows in the result
function compose_joined_table(joiner::DataTableJoiner,
function compose_joined_table(joiner::DataTableJoiner, kind::Symbol,
left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
@assert length(left_ixs) == length(right_ixs)
# compose left half of the result taking all left columns
all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
if length(leftonly_ixs) > 0

ril = length(right_ixs)
lil = length(left_ixs)
loil = length(leftonly_ixs)
roil = length(rightonly_ixs)

if loil > 0
# combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows
# preserving the original rows order
all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs))
all_orig_left_ixs = similar(left_ixs.orig, lil + loil)
@inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig
@inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig
else
# the result contains only the left rows that are matched to right rows (left_ixs)
all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere
end
ril = length(right_ixs)
loil = length(leftonly_ixs)
roil = length(rightonly_ixs)
left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil)
for col in columns(joiner.dtl)],
names(joiner.dtl))

# compose right half of the result taking all right columns excluding on
dtr_noon = without(joiner.dtr, joiner.on_cols)
# permutation to swap rightonly and leftonly rows
right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil)
if length(leftonly_ixs) > 0
# compose right_perm with the permutation that restores left rows order
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil]
end
all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm]
for col in columns(dtr_noon)],
names(dtr_noon))
# merge left and right parts of the joined table
res = hcat!(left_dt, right_dt)

# compose right half of the result taking all right columns excluding on
dtr_noon = without(joiner.dtr, joiner.on_cols)

nrow = length(all_orig_left_ixs) + roil
@assert nrow == length(all_orig_right_ixs) + loil
ncleft = ncol(joiner.dtl)
cols = Vector{Any}(ncleft + ncol(dtr_noon))
_similar = kind == :inner ? similar : similar_nullable
for (i, col) in enumerate(columns(joiner.dtl))
cols[i] = _similar(col, nrow)
fillcolumn!(cols[i], col, all_orig_left_ixs)
end
for (i, col) in enumerate(columns(dtr_noon))
cols[i+ncleft] = _similar(col, nrow)
fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs)
permute!(cols[i+ncleft], right_perm)
end
res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon)))

if length(rightonly_ixs.join) > 0
# some left rows are nulls, so the values of the "on" columns
# need to be taken from the right
for (on_col_ix, on_col) in enumerate(joiner.on_cols)
# fix the result of the rightjoin by taking the nonnull values from the right table
res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix]
offset = nrow - length(rightonly_ixs.orig)
fillcolumn!(res[on_col], joiner.dtr_on[on_col_ix], rightonly_ixs.orig, offset)
end
end
return res
end

function fillcolumn!{T1, T2}(dtcol::AbstractVector{T1}, refcol::AbstractVector{T2},
indices::Vector{Int}, offset::Int=0)
@inbounds for (j, k) in enumerate(indices)
dtcol[j+offset] = refcol[k]
end
end

# map the indices of the left and right joined tables
# to the indices of the rows in the resulting table
# if `nothing` is given, the corresponding map is not built
Expand Down Expand Up @@ -210,7 +226,8 @@ join(dt1::AbstractDataTable,
- `:cross` : a full Cartesian product of the key combinations; every
row of `dt1` is matched with every row of `dt2`
Null values are filled in where needed to complete joins.
For the three join operations that may introduce missing values (`:outer`, `:left`,
and `:right`), all columns of the returned data table will be nullable.
### Result
Expand Down Expand Up @@ -246,22 +263,21 @@ function Base.join(dt1::AbstractDataTable,
joiner = DataTableJoiner(dt1, dt2, on)

if kind == :inner
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, false, true, false)...)
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, false, true, false)...)
elseif kind == :left
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, false)...)
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, false)...)
elseif kind == :right
right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on,
group_rows(joiner.dtl_on),
true, true, true, false)
compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs)
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on,
group_rows(joiner.dtl_on),
true, true, true, false)[[3, 4, 1, 2]]...)
elseif kind == :outer
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, true)...)
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, true)...)
elseif kind == :semi
# hash the right rows
dtr_on_grp = group_rows(joiner.dtr_on)
Expand Down
Loading

0 comments on commit 12443f4

Please sign in to comment.