Split groupeddataframe/grouping.jl into several files

JuliaData · Dec 9, 2019 · 72096ac · 72096ac
1 parent 025824f
commit 72096ac
Show file tree

Hide file tree

Showing 5 changed files with 407 additions and 380 deletions.
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -67,7 +67,7 @@ include("abstractdataframe/abstractdataframe.jl")
 include("dataframe/dataframe.jl")
 include("subdataframe/subdataframe.jl")
 include("dataframerow/dataframerow.jl")
-include("groupeddataframe/grouping.jl")
+include("groupeddataframe/groupeddataframe.jl")
 include("dataframerow/utils.jl")
 
 include("other/broadcasting.jl")
@@ -76,6 +76,10 @@ include("abstractdataframe/iteration.jl")
 include("abstractdataframe/join.jl")
 include("abstractdataframe/reshape.jl")
 
+include("groupeddataframe/groupkeys.jl")
+include("groupeddataframe/getindex.jl")
+include("groupeddataframe/splitapplycombine.jl")
+
 include("abstractdataframe/show.jl")
 include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")

diff --git a/src/groupeddataframe/getindex.jl b/src/groupeddataframe/getindex.jl
@@ -0,0 +1,116 @@
+#
+# Vector-like indexing
+#
+
+# `gd[end]` resolves to the number of groups
+function Compat.lastindex(gd::GroupedDataFrame)
+    return length(gd.starts)
+end
+
+# First and last group, both obtained through integer indexing
+function Base.first(gd::GroupedDataFrame)
+    return gd[1]
+end
+
+function Base.last(gd::GroupedDataFrame)
+    return gd[end]
+end
+
+# Single integer: view of the parent rows that make up group number `idx`
+function Base.getindex(gd::GroupedDataFrame, idx::Integer)
+    rows = gd.idx[gd.starts[idx]:gd.ends[idx]]
+    return view(gd.parent, rows, :)
+end
+
+# Array of integers: a new GroupedDataFrame restricted to the selected groups,
+# in the order given by `idxs`. Selecting the same group twice is an error.
+function Base.getindex(gd::GroupedDataFrame, idxs::AbstractArray{T}) where {T<:Integer}
+    starts = gd.starts[idxs]
+    ends = gd.ends[idxs]
+    allunique(starts) ||
+        throw(ArgumentError("duplicates in idxs argument are not allowed"))
+    # Rows of groups that were not selected keep 0 as their group index
+    groups = zeros(Int, length(gd.groups))
+    for g in eachindex(starts)
+        span = starts[g]:ends[g]
+        @inbounds for pos in span
+            groups[gd.idx[pos]] = g
+        end
+    end
+    return GroupedDataFrame(gd.parent, gd.cols, groups, gd.idx, starts, ends)
+end
+
+# Colon: a new GroupedDataFrame object built from the very same fields as `gd`
+function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
+    return GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends)
+end
+
+
+#
+# Dictionary-like indexing
+#
+
+# GroupKey: only accepted when the key was derived from this exact `gd`
+function Base.getindex(gd::GroupedDataFrame, key::GroupKey)
+    if gd !== parent(key)
+        throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
+    end
+    return gd[getfield(key, :idx)]
+end
+
+# Tuple: linear scan for the first group whose grouping values equal `key`
+function Base.getindex(gd::GroupedDataFrame, key::Tuple)
+    matches(i) = isequal(Tuple(_groupvalues(gd, i)), key)
+    pos = findfirst(matches, 1:length(gd))
+    pos === nothing && throw(KeyError(key))
+    return gd[pos]
+end
+
+# NamedTuple: field names must be exactly the grouping column names, in order;
+# the values are then looked up like a plain Tuple key
+function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
+    names_match = length(key) == length(gd.cols) &&
+        all(n == _names(gd)[c] for (n, c) in zip(N, gd.cols))
+    names_match || throw(KeyError(key))
+    return gd[Tuple(key)]
+end
+
+"""
+    get(gd::GroupedDataFrame, key, default)
+
+Get a group based on the values of the grouping columns.
+
+`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same
+order as the `cols` argument to `groupby`).
+
+# Examples
+
+```jldoctest
+julia> df = DataFrame(a = repeat([:foo, :bar, :baz], outer=[2]),
+                      b = repeat([2, 1], outer=[3]),
+                      c = 1:6);
+
+julia> gd = groupby(df, :a)
+GroupedDataFrame with 3 groups based on key: a
+First Group (2 rows): a = :foo
+│ Row │ a      │ b     │ c     │
+│     │ Symbol │ Int64 │ Int64 │
+├─────┼────────┼───────┼───────┤
+│ 1   │ foo    │ 2     │ 1     │
+│ 2   │ foo    │ 1     │ 4     │
+⋮
+Last Group (2 rows): a = :baz
+│ Row │ a      │ b     │ c     │
+│     │ Symbol │ Int64 │ Int64 │
+├─────┼────────┼───────┼───────┤
+│ 1   │ baz    │ 2     │ 3     │
+│ 2   │ baz    │ 1     │ 6     │
+
+julia> get(gd, (a=:bar,), nothing)
+2×3 SubDataFrame
+│ Row │ a      │ b     │ c     │
+│     │ Symbol │ Int64 │ Int64 │
+├─────┼────────┼───────┼───────┤
+│ 1   │ bar    │ 1     │ 2     │
+│ 2   │ bar    │ 2     │ 5     │
+
+julia> get(gd, (:baz,), nothing)
+2×3 SubDataFrame
+│ Row │ a      │ b     │ c     │
+│     │ Symbol │ Int64 │ Int64 │
+├─────┼────────┼───────┼───────┤
+│ 1   │ baz    │ 2     │ 3     │
+│ 2   │ baz    │ 1     │ 6     │
+
+julia> get(gd, (:qux,), nothing)
+```
+"""
+function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
+    try
+        return gd[key]
+    catch err
+        # Writing `catch KeyError` would not filter by type: it binds *any*
+        # exception to a local variable named `KeyError`. Check the type
+        # explicitly so that only a failed key lookup yields `default` and
+        # every other error propagates to the caller.
+        err isa KeyError || rethrow()
+        return default
+    end
+end
diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -0,0 +1,121 @@
+#
+# Definition and basic methods
+#
+
+"""
+    GroupedDataFrame
+
+The result of a [`groupby`](@ref) operation on an `AbstractDataFrame`; a
+view into the `AbstractDataFrame` grouped by rows.
+
+Not meant to be constructed directly, see `groupby`.
+"""
+struct GroupedDataFrame{T<:AbstractDataFrame}
+    parent::T
+    cols::Vector{Int}    # indices of the parent columns used for grouping
+    groups::Vector{Int}  # group index for each parent row (0 = row is in no group)
+    idx::Vector{Int}     # parent row indices, laid out so that each group is contiguous
+    starts::Vector{Int}  # position in `idx` of the first row of each group
+    ends::Vector{Int}    # position in `idx` of the last row of each group
+end
+
+function Base.broadcastable(::GroupedDataFrame)
+    # Broadcasting is deliberately rejected for now
+    throw(ArgumentError("broadcasting over `GroupedDataFrame`s is reserved"))
+end
+
+"""
+    parent(gd::GroupedDataFrame)
+
+Return the parent data frame of `gd`.
+"""
+function Base.parent(gd::GroupedDataFrame)
+    # Read the field directly via `getfield`
+    return getfield(gd, :parent)
+end
+
+# Column names are those of the parent data frame
+function Base.names(gd::GroupedDataFrame)
+    return names(gd.parent)
+end
+
+function _names(gd::GroupedDataFrame)
+    return _names(gd.parent)
+end
+
+# Number of groups
+function Base.length(gd::GroupedDataFrame)
+    return length(gd.starts)
+end
+
+
+# Iterate over groups; the state `i` is the number of the next group to return
+function Base.iterate(gd::GroupedDataFrame, i=1)
+    i > length(gd.starts) && return nothing
+    rows = gd.idx[gd.starts[i]:gd.ends[i]]
+    return (view(gd.parent, rows, :), i + 1)
+end
+
+
+# Build a new data frame holding the rows of all groups of `gd`, one group after
+# another in group order. Parent rows that belong to no group are left out.
+# `copycols=false` is rejected with an `ArgumentError`.
+function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
+    if !copycols
+        # Trailing space matters: the two literals are concatenated verbatim
+        throw(ArgumentError("It is not possible to construct a `DataFrame` " *
+                            "from GroupedDataFrame with `copycols=false`"))
+    end
+    length(gd) == 0 && return similar(parent(gd), 0)
+    # Gather the parent row indices of every group into one vector
+    idx = similar(gd.idx)
+    doff = 1
+    for (s,e) in zip(gd.starts, gd.ends)
+        n = e - s + 1
+        copyto!(idx, doff, gd.idx, s, n)
+        doff += n
+    end
+    # The groups may cover only part of `gd.idx`, so drop the unused tail
+    resize!(idx, doff - 1)
+    return parent(gd)[idx, :]
+end
+
+
+#
+# Equality testing
+#
+
+# Two GroupedDataFrames are `==` when they group on the same column indices,
+# have the same number of groups, and their groups compare `==` pairwise
+function Base.:(==)(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
+    gd1.cols == gd2.cols || return false
+    length(gd1) == length(gd2) || return false
+    return all(sdf1 == sdf2 for (sdf1, sdf2) in zip(gd1, gd2))
+end
+
+# Same checks as `==`, but every comparison goes through `isequal`
+function Base.isequal(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
+    isequal(gd1.cols, gd2.cols) || return false
+    isequal(length(gd1), length(gd2)) || return false
+    return all(isequal(sdf1, sdf2) for (sdf1, sdf2) in zip(gd1, gd2))
+end
+
+
+#
+# Public utility funcs
+#
+
+"""
+    groupindices(gd::GroupedDataFrame)
+
+Return a vector of group indices for each row of `parent(gd)`.
+
+Rows appearing in group `gd[i]` are attributed index `i`. Rows not present in
+any group are attributed `missing` (this can happen if `skipmissing=true` was
+passed when creating `gd`, or if `gd` is a subset from a larger [`GroupedDataFrame`](@ref)).
+"""
+function groupindices(gd::GroupedDataFrame)
+    # Internally 0 marks a row that is in no group; expose it as `missing`
+    return replace(gd.groups, 0 => missing)
+end
+
+"""
+    groupvars(gd::GroupedDataFrame)
+
+Return a vector of column names in `parent(gd)` used for grouping.
+"""
+function groupvars(gd::GroupedDataFrame)
+    # Translate the stored column indices into column names
+    colnames = _names(gd)
+    return colnames[gd.cols]
+end
+
+
+#
+# Internal utility funcs
+#
+
+# Position of column `name` among the grouping columns. When it is not one of
+# them: throw if `strict` is true, otherwise return `nothing`.
+function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool)
+    pos = findfirst(v -> v == name, groupvars(gd))
+    if strict && pos === nothing
+        throw(ArgumentError("$name is not a grouping column"))
+    end
+    return pos
+end
+
+# Values of all grouping columns for group `i`, read from the group's first row
+function _groupvalues(gd::GroupedDataFrame, i::Integer)
+    firstrow = gd.idx[gd.starts[i]]
+    return gd.parent[firstrow, gd.cols]
+end
+
+# Value of one grouping column for group `i`; `col` is either the position of
+# the column among the grouping columns or its name
+function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Integer)
+    firstrow = gd.idx[gd.starts[i]]
+    return gd.parent[firstrow, gd.cols[col]]
+end
+
+function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol)
+    return _groupvalues(gd, i, _groupvar_idx(gd, col, true))
+end