incorporate edits suggested during review

JuliaData · Mar 1, 2017 · 49d6328 · nalimilan · Mar 2, 2017 · 49d6328
1 parent 637b8cf
commit 49d6328
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 98 deletions.
diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl
@@ -153,7 +153,7 @@ immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable}
     on_cols::Vector{Symbol}
 
     function DataTableJoiner(dtl::DT1, dtr::DT2, on::Union{Symbol,Vector{Symbol}})
-        on_cols = (isa(on, Symbol) ? fill(on, 1) : on)
+        on_cols = isa(on, Symbol) ? [on] : on
         new(dtl, dtr, dtl[on_cols], dtr[on_cols], on_cols)
     end
 end
@@ -171,13 +171,6 @@ end
 
 Base.length(x::RowIndexMap) = length(x.orig)
 
-# fix the result of the rightjoin by taking the nonnull values from the right table
-function fix_rightjoin_column!(res_col::AbstractArray, col_ix::Int, joiner::DataTableJoiner,
-                               all_orig_left_ixs::Vector{Int}, rightonly_ixs::RowIndexMap)
-    res_col[rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, col_ix]
-    res_col
-end
-
 # composes the joined data table using the maps between the left and right
 # table rows and the indices of rows in the result
 function compose_joined_table(joiner::DataTableJoiner,
@@ -190,24 +183,24 @@ function compose_joined_table(joiner::DataTableJoiner,
         # permute the indices to restore left table rows order
         all_orig_left_ixs[vcat(left_ixs.join, leftonly_ixs.join)] = all_orig_left_ixs
     end
-    left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+length(rightonly_ixs))
+    ril = length(right_ixs)
+    loil = length(leftonly_ixs)
+    roil = length(rightonly_ixs)
+    left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil)
                             for col in columns(joiner.dtl)],
                         names(joiner.dtl))
 
     # compose right half of the result taking all right columns excluding on
     dtr_noon = without(joiner.dtr, joiner.on_cols)
     # permutation to swap rightonly and leftonly rows
-    right_perm = vcat(1:length(right_ixs),
-                      (length(right_ixs)+length(rightonly_ixs)+1:
-                       length(right_ixs)+length(rightonly_ixs)+length(leftonly_ixs)),
-                      length(right_ixs)+1:length(right_ixs)+length(rightonly_ixs))
+    right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil)
     if length(leftonly_ixs) > 0
         # compose right_perm with the permutation that restores left rows order
-        right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:length(right_ixs)+length(leftonly_ixs)]
+        right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil]
     end
     all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
-    right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+length(leftonly_ixs))[right_perm]
-                             for col in columns(dtr_noon)],
+    right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm]
+                            for col in columns(dtr_noon)],
                          names(dtr_noon))
     # merge left and right parts of the joined table
     res = hcat!(left_dt, right_dt)
@@ -216,7 +209,8 @@ function compose_joined_table(joiner::DataTableJoiner,
         # some left rows are nulls, so the values of the "on" columns
         # need to be taken from the right
         for (on_col_ix, on_col) in enumerate(joiner.on_cols)
-            res[on_col] = fix_rightjoin_column!(res[on_col], on_col_ix, joiner, all_orig_left_ixs, rightonly_ixs)
+            # fix the result of the rightjoin by taking the nonnull values from the right table
+            res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix]
         end
     end
     return res
@@ -233,8 +227,8 @@ function update_row_maps!(left_table::AbstractDataTable,
                           right_ixs::Union{Void, RowIndexMap},
                           rightonly_mask::Union{Void, Vector{Bool}})
     # helper functions
-    update!(ixs::Void, orig_ix::Int, join_ix::Int, count::Int = 1) = ixs
-    function update!(ixs::RowIndexMap, orig_ix::Int, join_ix::Int, count::Int = 1)
+    @inline update!(ixs::Void, orig_ix::Int, join_ix::Int, count::Int = 1) = nothing
+    @inline function update!(ixs::RowIndexMap, orig_ix::Int, join_ix::Int, count::Int = 1)
         for i in 1:count
             push!(ixs.orig, orig_ix)
         end
@@ -243,16 +237,16 @@ function update_row_maps!(left_table::AbstractDataTable,
         end
         ixs
     end
-    update!(ixs::Void, orig_ixs::AbstractArray, join_ix::Int) = ixs
-    function update!(ixs::RowIndexMap, orig_ixs::AbstractArray, join_ix::Int)
+    @inline update!(ixs::Void, orig_ixs::AbstractArray, join_ix::Int) = nothing
+    @inline function update!(ixs::RowIndexMap, orig_ixs::AbstractArray, join_ix::Int)
         append!(ixs.orig, orig_ixs)
         for i in join_ix:(join_ix+length(orig_ixs)-1)
             push!(ixs.join, i)
         end
         ixs
     end
-    update!(ixs::Void, orig_ixs::AbstractArray) = ixs
-    update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false)
+    @inline update!(ixs::Void, orig_ixs::AbstractArray) = ixs
+    @inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false)
 
     # iterate over left rows and compose the left<->right index map
     next_join_ix = 1
@@ -297,10 +291,10 @@ function update_row_maps!(left_table::AbstractDataTable,
     rightonly_mask = map_rightonly ? fill(true, nrow(right_table)) : nothing
     update_row_maps!(left_table, right_table, right_dict, left_ixs, leftonly_ixs, right_ixs, rightonly_mask)
     if map_rightonly
-        rightonly_orig_ixs = (1:length(rightonly_mask))[rightonly_mask]
+        rightonly_orig_ixs = find(rightonly_mask)
         rightonly_ixs = RowIndexMap(rightonly_orig_ixs,
-                                    collect(length(right_ixs.orig)+
-                                            (leftonly_ixs === nothing ? 0 : length(leftonly_ixs))+
+                                    collect(length(right_ixs.orig) +
+                                            (leftonly_ixs === nothing ? 0 : length(leftonly_ixs)) +
                                             (1:length(rightonly_orig_ixs))))
     else
         rightonly_ixs = nothing
@@ -377,30 +371,29 @@ function Base.join(dt1::AbstractDataTable,
 
     if kind == :inner
         compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
-                             group_rows(joiner.dtr_on),
-                             true, false, true, false)...)
+                                                      group_rows(joiner.dtr_on),
+                                                      true, false, true, false)...)
     elseif kind == :left
         compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
-                             group_rows(joiner.dtr_on),
-                             true, true, true, false)...)
+                                                      group_rows(joiner.dtr_on),
+                                                      true, true, true, false)...)
     elseif kind == :right
-        right_ixs, rightonly_ixs, left_ixs, leftonly_ixs =
-            update_row_maps!(joiner.dtr_on, joiner.dtl_on,
-                            group_rows(joiner.dtl_on),
-                            true, true, true, false)
+        right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on,
+                                                                            group_rows(joiner.dtl_on),
+                                                                            true, true, true, false)
         compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs)
     elseif kind == :outer
         compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
-                             group_rows(joiner.dtr_on),
-                             true, true, true, true)...)
+                                                      group_rows(joiner.dtr_on),
+                                                      true, true, true, true)...)
     elseif kind == :semi
         # hash the right rows
         dtr_on_grp = group_rows(joiner.dtr_on)
         # iterate over left rows and leave those found in right
         left_ixs = Vector{Int}()
         sizehint!(left_ixs, nrow(joiner.dtl))
-        for l_ix in 1:nrow(joiner.dtl_on)
-            if in(dtr_on_grp, joiner.dtl_on, l_ix)
+        @inbounds for l_ix in 1:nrow(joiner.dtl_on)
+            if findrow(dtr_on_grp, joiner.dtl_on, l_ix) != 0
                 push!(left_ixs, l_ix)
             end
         end
@@ -411,14 +404,14 @@ function Base.join(dt1::AbstractDataTable,
         # iterate over left rows and leave those not found in right
         leftonly_ixs = Vector{Int}()
         sizehint!(leftonly_ixs, nrow(joiner.dtl))
-        for l_ix in 1:nrow(joiner.dtl_on)
-            if !in(dtr_on_grp, joiner.dtl_on, l_ix)
+        @inbounds for l_ix in 1:nrow(joiner.dtl_on)
+            if findrow(dtr_on_grp, joiner.dtl_on, l_ix) == 0
                 push!(leftonly_ixs, l_ix)
             end
         end
         return joiner.dtl[leftonly_ixs, :]
     else
-        throw(ArgumentError("Unknown kind ($kind) of join requested"))
+        throw(ArgumentError("Unknown kind of join requested: ($kind)"))
     end
 end
 

diff --git a/src/abstractdatatable/sort.jl b/src/abstractdatatable/sort.jl
@@ -73,8 +73,8 @@ DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt)
 col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord
 col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i]
 
-Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
-Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]
+Base.@propagate_inbounds Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
+Base.@propagate_inbounds Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]
 
 function Sort.lt(o::DTPerm, a, b)
     @inbounds for i = 1:ncol(o.dt)
@@ -276,8 +276,8 @@ Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a,
 Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o)
 Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt))
 
-# #Extras to speed up sorting
-# dependant on https://github.com/JuliaData/CategoricalArrays.jl/issues/12
+# Extras to speed up sorting
+# dependent on https://github.com/JuliaData/CategoricalArrays.jl/issues/12
 # Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
 # Base.sortperm{V}(df::AbstractDataFrame, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))
 

diff --git a/src/datatablerow/datatablerow.jl b/src/datatablerow/datatablerow.jl
@@ -94,9 +94,7 @@ end
 isequal_colel(col::AbstractArray, r1::Int, r2::Int) =
     (r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2))
 
-function isequal_colel{T}(col::Union{NullableArray{T},
-                                     AbstractNullableCategoricalArray{T}},
-                                     r1::Int, r2::Int)
+function isequal_colel{T}(col::Union{NullableArray{T}, AbstractNullableCategoricalArray{T}}, r1::Int, r2::Int)
     (r1 == r2) && return true
     isequal(col[r1], col[r2])
 end
@@ -118,7 +116,7 @@ end
 function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
     (dt1 === dt2) && return isequal_row(dt1, r1, r2)
     (ncol(dt1) == ncol(dt2)) ||
-        throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
+        throw(ArgumentError("Rows of the tables that have different number of columns cannot be compared. Got $(ncol(dt1)) and $(ncol(dt2)) columns"))
     @inbounds for (col1, col2) in zip(columns(dt1), columns(dt2))
         isequal_colel(col1[r1], col2[r2]) || return false
     end
@@ -135,14 +133,14 @@ function Base.isless(r1::DataTableRow, r2::DataTableRow)
     (ncol(r1.dt) == ncol(r2.dt)) ||
         throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
     @inbounds for i in 1:ncol(r1.dt)
-        col1 = r1.dt[i]
-        col2 = r2.dt[i]
-        isnull1 = _isnull(col1[r1.row])
-        isnull2 = _isnull(col2[r2.row])
-        (isnull1 != isnull2) && return isnull2 # null > !null
-        if !isnull1
-            v1 = get(col1[r1.row])
-            v2 = get(col2[r2.row])
+        x = r1.dt[i][r1.row]
+        y = r2.dt[i][r2.row]
+        isnullx = _isnull(x)
+        isnully = _isnull(y)
+        (isnullx != isnully) && return isnully # null > !null
+        if !isnullx
+            v1 = get(x)
+            v2 = get(y)
             isless(v1, v2) && return true
             !isequal(v1, v2) && return false
         end

diff --git a/src/datatablerow/utils.jl b/src/datatablerow/utils.jl
@@ -79,6 +79,7 @@ function row_group_slots(dt::AbstractDataTable,
                          groups::Union{Vector{Int}, Void} = nothing)
     @assert groups === nothing || length(groups) == nrow(dt)
     rhashes = hashrows(dt)
+    # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
     sz = Base._tablesz(length(rhashes))
     @assert sz >= length(rhashes)
     szm1 = sz-1
@@ -186,6 +187,3 @@ function Base.getindex(gd::RowGroupDict, dtr::DataTableRow)
     gix = gd.groups[g_row]
     return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
 end
-
-# Check if there is matching row in gd
-Base.in(gd::RowGroupDict, dt::DataTable, row::Int) = (findrow(gd, dt, row) != 0)
diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl
@@ -26,40 +26,6 @@ end
 # Split
 #
 
-function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
-    # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
-
-    # count group sizes, location 0 for NULL
-    n = length(x)
-    # counts = x.pool
-    counts = fill(0, ngroups + 1)
-    for i = 1:n
-        counts[x[i] + 1] += 1
-    end
-
-    # mark the start of each contiguous group of like-indexed data
-    where = fill(1, ngroups + 1)
-    if null_last
-        for i = 3:ngroups+1
-            where[i] = where[i - 1] + counts[i - 1]
-        end
-        where[1] = where[end] + counts[end]
-    else
-        for i = 2:ngroups+1
-            where[i] = where[i - 1] + counts[i - 1]
-        end
-    end
-
-    # this is our indexer
-    result = fill(0, n)
-    for i = 1:n
-        label = x[i] + 1
-        result[where[label]] = i
-        where[label] += 1
-    end
-    result, where, counts
-end
-
 """
 A view of an AbstractDataTable split into row groups
 
@@ -397,9 +363,9 @@ end
 
 # Groups DataTable by cols before applying aggregate
 function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
-                                    cols::Union{S, AbstractVector{S}},
-                                    fs::Union{T, Vector{T}};
-                                    sort::Bool=false)
+                                                 cols::Union{S, AbstractVector{S}},
+                                                 fs::Union{T, Vector{T}};
+                                                 sort::Bool=false)
     aggregate(groupby(d, cols, sort=sort), fs)
 end
 

diff --git a/test/datatablerow.jl b/test/datatablerow.jl
@@ -54,7 +54,7 @@ module TestDataTableRow
     # test incompatible frames
     @test_throws UndefVarError isequal(DataTableRow(dt, 1), DataTableRow(dt3, 1))
 
-    # test _RowGroupDict
+    # test RowGroupDict
     N = 20
     d1 = rand(map(Int64, 1:2), N)
     dt5 = DataTable(Any[d1], [:d1])