Skip to content
This repository has been archived by the owner on May 5, 2019. It is now read-only.

Commit

Permalink
incorporate edits suggested during review
Browse files Browse the repository at this point in the history
  • Loading branch information
cjprybol committed Mar 1, 2017
1 parent 637b8cf commit 49d6328
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 98 deletions.
75 changes: 34 additions & 41 deletions src/abstractdatatable/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable}
on_cols::Vector{Symbol}

function DataTableJoiner(dtl::DT1, dtr::DT2, on::Union{Symbol,Vector{Symbol}})
on_cols = (isa(on, Symbol) ? fill(on, 1) : on)
on_cols = isa(on, Symbol) ? [on] : on
new(dtl, dtr, dtl[on_cols], dtr[on_cols], on_cols)
end
end
Expand All @@ -171,13 +171,6 @@ end

Base.length(x::RowIndexMap) = length(x.orig)

# fix the result of the rightjoin by taking the nonnull values from the right table
function fix_rightjoin_column!(res_col::AbstractArray, col_ix::Int, joiner::DataTableJoiner,
all_orig_left_ixs::Vector{Int}, rightonly_ixs::RowIndexMap)
res_col[rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, col_ix]
res_col
end

# composes the joined data table using the maps between the left and right
# table rows and the indices of rows in the result
function compose_joined_table(joiner::DataTableJoiner,
Expand All @@ -190,24 +183,24 @@ function compose_joined_table(joiner::DataTableJoiner,
# permute the indices to restore left table rows order
all_orig_left_ixs[vcat(left_ixs.join, leftonly_ixs.join)] = all_orig_left_ixs
end
left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+length(rightonly_ixs))
ril = length(right_ixs)
loil = length(leftonly_ixs)
roil = length(rightonly_ixs)
left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil)
for col in columns(joiner.dtl)],
names(joiner.dtl))

# compose right half of the result taking all right columns excluding on
dtr_noon = without(joiner.dtr, joiner.on_cols)
# permutation to swap rightonly and leftonly rows
right_perm = vcat(1:length(right_ixs),
(length(right_ixs)+length(rightonly_ixs)+1:
length(right_ixs)+length(rightonly_ixs)+length(leftonly_ixs)),
length(right_ixs)+1:length(right_ixs)+length(rightonly_ixs))
right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil)
if length(leftonly_ixs) > 0
# compose right_perm with the permutation that restores left rows order
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:length(right_ixs)+length(leftonly_ixs)]
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil]
end
all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+length(leftonly_ixs))[right_perm]
for col in columns(dtr_noon)],
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm]
for col in columns(dtr_noon)],
names(dtr_noon))
# merge left and right parts of the joined table
res = hcat!(left_dt, right_dt)
Expand All @@ -216,7 +209,8 @@ function compose_joined_table(joiner::DataTableJoiner,
# some left rows are nulls, so the values of the "on" columns
# need to be taken from the right
for (on_col_ix, on_col) in enumerate(joiner.on_cols)
res[on_col] = fix_rightjoin_column!(res[on_col], on_col_ix, joiner, all_orig_left_ixs, rightonly_ixs)
# fix the result of the rightjoin by taking the nonnull values from the right table
res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix]
end
end
return res
Expand All @@ -233,8 +227,8 @@ function update_row_maps!(left_table::AbstractDataTable,
right_ixs::Union{Void, RowIndexMap},
rightonly_mask::Union{Void, Vector{Bool}})
# helper functions
update!(ixs::Void, orig_ix::Int, join_ix::Int, count::Int = 1) = ixs
function update!(ixs::RowIndexMap, orig_ix::Int, join_ix::Int, count::Int = 1)
@inline update!(ixs::Void, orig_ix::Int, join_ix::Int, count::Int = 1) = nothing
@inline function update!(ixs::RowIndexMap, orig_ix::Int, join_ix::Int, count::Int = 1)
for i in 1:count
push!(ixs.orig, orig_ix)
end
Expand All @@ -243,16 +237,16 @@ function update_row_maps!(left_table::AbstractDataTable,
end
ixs
end
update!(ixs::Void, orig_ixs::AbstractArray, join_ix::Int) = ixs
function update!(ixs::RowIndexMap, orig_ixs::AbstractArray, join_ix::Int)
@inline update!(ixs::Void, orig_ixs::AbstractArray, join_ix::Int) = nothing
@inline function update!(ixs::RowIndexMap, orig_ixs::AbstractArray, join_ix::Int)
append!(ixs.orig, orig_ixs)
for i in join_ix:(join_ix+length(orig_ixs)-1)
push!(ixs.join, i)
end
ixs
end
update!(ixs::Void, orig_ixs::AbstractArray) = ixs
update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false)
@inline update!(ixs::Void, orig_ixs::AbstractArray) = ixs
@inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false)

# iterate over left rows and compose the left<->right index map
next_join_ix = 1
Expand Down Expand Up @@ -297,10 +291,10 @@ function update_row_maps!(left_table::AbstractDataTable,
rightonly_mask = map_rightonly ? fill(true, nrow(right_table)) : nothing
update_row_maps!(left_table, right_table, right_dict, left_ixs, leftonly_ixs, right_ixs, rightonly_mask)
if map_rightonly
rightonly_orig_ixs = (1:length(rightonly_mask))[rightonly_mask]
rightonly_orig_ixs = find(rightonly_mask)
rightonly_ixs = RowIndexMap(rightonly_orig_ixs,
collect(length(right_ixs.orig)+
(leftonly_ixs === nothing ? 0 : length(leftonly_ixs))+
collect(length(right_ixs.orig) +
(leftonly_ixs === nothing ? 0 : length(leftonly_ixs)) +
(1:length(rightonly_orig_ixs))))
else
rightonly_ixs = nothing
Expand Down Expand Up @@ -377,30 +371,29 @@ function Base.join(dt1::AbstractDataTable,

if kind == :inner
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, false, true, false)...)
group_rows(joiner.dtr_on),
true, false, true, false)...)
elseif kind == :left
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, false)...)
group_rows(joiner.dtr_on),
true, true, true, false)...)
elseif kind == :right
right_ixs, rightonly_ixs, left_ixs, leftonly_ixs =
update_row_maps!(joiner.dtr_on, joiner.dtl_on,
group_rows(joiner.dtl_on),
true, true, true, false)
right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on,
group_rows(joiner.dtl_on),
true, true, true, false)
compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs)
elseif kind == :outer
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
group_rows(joiner.dtr_on),
true, true, true, true)...)
group_rows(joiner.dtr_on),
true, true, true, true)...)
elseif kind == :semi
# hash the right rows
dtr_on_grp = group_rows(joiner.dtr_on)
# iterate over left rows and leave those found in right
left_ixs = Vector{Int}()
sizehint!(left_ixs, nrow(joiner.dtl))
for l_ix in 1:nrow(joiner.dtl_on)
if in(dtr_on_grp, joiner.dtl_on, l_ix)
@inbounds for l_ix in 1:nrow(joiner.dtl_on)
if findrow(dtr_on_grp, joiner.dtl_on, l_ix) != 0
push!(left_ixs, l_ix)
end
end
Expand All @@ -411,14 +404,14 @@ function Base.join(dt1::AbstractDataTable,
# iterate over left rows and leave those not found in right
leftonly_ixs = Vector{Int}()
sizehint!(leftonly_ixs, nrow(joiner.dtl))
for l_ix in 1:nrow(joiner.dtl_on)
if !in(dtr_on_grp, joiner.dtl_on, l_ix)
@inbounds for l_ix in 1:nrow(joiner.dtl_on)
if findrow(dtr_on_grp, joiner.dtl_on, l_ix) == 0
push!(leftonly_ixs, l_ix)
end
end
return joiner.dtl[leftonly_ixs, :]
else
throw(ArgumentError("Unknown kind ($kind) of join requested"))
throw(ArgumentError("Unknown kind of join requested: ($kind)"))
end
end

Expand Down
8 changes: 4 additions & 4 deletions src/abstractdatatable/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt)
col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord
col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i]

Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]
Base.@propagate_inbounds Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
Base.@propagate_inbounds Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]

function Sort.lt(o::DTPerm, a, b)
@inbounds for i = 1:ncol(o.dt)
Expand Down Expand Up @@ -276,8 +276,8 @@ Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a,
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o)
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt))

# #Extras to speed up sorting
# dependant on https://github.com/JuliaData/CategoricalArrays.jl/issues/12
# Extras to speed up sorting
# dependent on https://github.com/JuliaData/CategoricalArrays.jl/issues/12
# Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
# Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))

Expand Down
22 changes: 10 additions & 12 deletions src/datatablerow/datatablerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,7 @@ end
isequal_colel(col::AbstractArray, r1::Int, r2::Int) =
(r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2))

function isequal_colel{T}(col::Union{NullableArray{T},
AbstractNullableCategoricalArray{T}},
r1::Int, r2::Int)
function isequal_colel{T}(col::Union{NullableArray{T}, AbstractNullableCategoricalArray{T}}, r1::Int, r2::Int)
(r1 == r2) && return true
isequal(col[r1], col[r2])
end
Expand All @@ -118,7 +116,7 @@ end
function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
(dt1 === dt2) && return isequal_row(dt1, r1, r2)
(ncol(dt1) == ncol(dt2)) ||
throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
throw(ArgumentError("Rows of the tables that have different number of columns cannot be compared. Got $(ncol(dt1)) and $(ncol(dt2)) columns"))
@inbounds for (col1, col2) in zip(columns(dt1), columns(dt2))
isequal_colel(col1[r1], col2[r2]) || return false
end
Expand All @@ -135,14 +133,14 @@ function Base.isless(r1::DataTableRow, r2::DataTableRow)
(ncol(r1.dt) == ncol(r2.dt)) ||
throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
@inbounds for i in 1:ncol(r1.dt)
col1 = r1.dt[i]
col2 = r2.dt[i]
isnull1 = _isnull(col1[r1.row])
isnull2 = _isnull(col2[r2.row])
(isnull1 != isnull2) && return isnull2 # null > !null
if !isnull1
v1 = get(col1[r1.row])
v2 = get(col2[r2.row])
x = r1.dt[i][r1.row]
y = r2.dt[i][r2.row]
isnullx = _isnull(x)
isnully = _isnull(y)
(isnullx != isnully) && return isnully # null > !null
if !isnullx
v1 = get(x)

This comment has been minimized.

Copy link
@nalimilan

nalimilan Mar 2, 2017

Member

Ah, also better call unsafe_get here since isnull has been called manually. Unfortunately, I now realize that function isn't defined on 0.5, so you'll have to define a fallback like this somewhere:

if !isdefined(Base, :unsafe_get)
    unsafe_get(x::Nullable) = x.value
    unsafe_get(x::Any) = x
end
v2 = get(y)
isless(v1, v2) && return true
!isequal(v1, v2) && return false
end
Expand Down
4 changes: 1 addition & 3 deletions src/datatablerow/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ function row_group_slots(dt::AbstractDataTable,
groups::Union{Vector{Int}, Void} = nothing)
@assert groups === nothing || length(groups) == nrow(dt)
rhashes = hashrows(dt)
# inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
sz = Base._tablesz(length(rhashes))
@assert sz >= length(rhashes)
szm1 = sz-1
Expand Down Expand Up @@ -186,6 +187,3 @@ function Base.getindex(gd::RowGroupDict, dtr::DataTableRow)
gix = gd.groups[g_row]
return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
end

# Check if there is matching row in gd
Base.in(gd::RowGroupDict, dt::DataTable, row::Int) = (findrow(gd, dt, row) != 0)
40 changes: 3 additions & 37 deletions src/groupeddatatable/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,40 +26,6 @@ end
# Split
#

function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
# translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).

# count group sizes, location 0 for NULL
n = length(x)
# counts = x.pool
counts = fill(0, ngroups + 1)
for i = 1:n
counts[x[i] + 1] += 1
end

# mark the start of each contiguous group of like-indexed data
where = fill(1, ngroups + 1)
if null_last
for i = 3:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
where[1] = where[end] + counts[end]
else
for i = 2:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
end

# this is our indexer
result = fill(0, n)
for i = 1:n
label = x[i] + 1
result[where[label]] = i
where[label] += 1
end
result, where, counts
end

"""
A view of an AbstractDataTable split into row groups
Expand Down Expand Up @@ -397,9 +363,9 @@ end

# Groups DataTable by cols before applying aggregate
function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
cols::Union{S, AbstractVector{S}},
fs::Union{T, Vector{T}};
sort::Bool=false)
cols::Union{S, AbstractVector{S}},
fs::Union{T, Vector{T}};
sort::Bool=false)
aggregate(groupby(d, cols, sort=sort), fs)
end

Expand Down
2 changes: 1 addition & 1 deletion test/datatablerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ module TestDataTableRow
# test incompatible frames
@test_throws UndefVarError isequal(DataTableRow(dt, 1), DataTableRow(dt3, 1))

# test _RowGroupDict
# test RowGroupDict
N = 20
d1 = rand(map(Int64, 1:2), N)
dt5 = DataTable(Any[d1], [:d1])
Expand Down

0 comments on commit 49d6328

Please sign in to comment.