-
Notifications
You must be signed in to change notification settings - Fork 369
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split groupeddataframe/grouping.jl into several files
- Loading branch information
Showing
5 changed files
with
407 additions
and
380 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
# | ||
# Vector-like indexing | ||
# | ||
|
||
# Index of the last group; this is what `end` resolves to in `gd[end]`
function Compat.lastindex(gd::GroupedDataFrame)
    return length(gd.starts)
end

# First group, obtained through integer indexing
function Base.first(gd::GroupedDataFrame)
    return gd[1]
end

# Last group, obtained through integer indexing
function Base.last(gd::GroupedDataFrame)
    return gd[end]
end
|
||
# Single integer: return a view of the parent restricted to the rows of group `idx`
function Base.getindex(gd::GroupedDataFrame, idx::Integer)
    # `gd.idx` holds parent row numbers; group `idx` occupies starts[idx]:ends[idx] in it
    rows = gd.idx[gd.starts[idx]:gd.ends[idx]]
    return view(gd.parent, rows, :)
end
|
||
# Array of integers: build a new GroupedDataFrame holding only the selected groups,
# in the order given by `idxs`. Selecting the same group twice is an error.
function Base.getindex(gd::GroupedDataFrame, idxs::AbstractArray{<:Integer})
    starts = gd.starts[idxs]
    ends = gd.ends[idxs]
    allunique(starts) ||
        throw(ArgumentError("duplicates in idxs argument are not allowed"))
    # Rows that belong to none of the selected groups keep the value 0
    groups = zeros(Int, length(gd.groups))
    for (g, (s, e)) in enumerate(zip(starts, ends))
        @inbounds for pos in s:e
            groups[gd.idx[pos]] = g
        end
    end
    return GroupedDataFrame(gd.parent, gd.cols, groups, gd.idx, starts, ends)
end
|
||
# Colon: return a new GroupedDataFrame built from the same field values as `gd`
function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
    return GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends)
end
|
||
|
||
# | ||
# Dictionary-like indexing | ||
# | ||
|
||
# GroupKey: only valid for the very same GroupedDataFrame object it was taken from
function Base.getindex(gd::GroupedDataFrame, key::GroupKey)
    if gd !== parent(key)
        throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
    end
    # The key stores the integer index of its group in the `idx` field
    return gd[getfield(key, :idx)]
end
|
||
# Tuple: linear search for the first group whose grouping values match `key`
function Base.getindex(gd::GroupedDataFrame, key::Tuple)
    for i in Base.OneTo(length(gd))
        groupvals = Tuple(_groupvalues(gd, i))
        if isequal(groupvals, key)
            return gd[i]
        end
    end
    throw(KeyError(key))
end
|
||
# NamedTuple: field names must be exactly the grouping columns, in the same order;
# the lookup is then delegated to the Tuple method
function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
    if length(key) == length(gd.cols)
        colnames = _names(gd)
        if all(n == colnames[c] for (n, c) in zip(N, gd.cols))
            return gd[Tuple(key)]
        end
    end
    throw(KeyError(key))
end
|
||
""" | ||
get(gd::GroupedDataFrame, key, default) | ||
Get a group based on the values of the grouping columns. | ||
`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same | ||
order as the `cols` argument to `groupby`). | ||
# Examples | ||
```jldoctest | ||
julia> df = DataFrame(a = repeat([:foo, :bar, :baz], outer=[2]), | ||
b = repeat([2, 1], outer=[3]), | ||
c = 1:6); | ||
julia> gd = groupby(df, :a) | ||
GroupedDataFrame with 3 groups based on key: a | ||
First Group (2 rows): a = :foo | ||
│ Row │ a │ b │ c │ | ||
│ │ Symbol │ Int64 │ Int64 │ | ||
├─────┼────────┼───────┼───────┤ | ||
│ 1 │ foo │ 2 │ 1 │ | ||
│ 2 │ foo │ 1 │ 4 │ | ||
⋮ | ||
Last Group (2 rows): a = :baz | ||
│ Row │ a │ b │ c │ | ||
│ │ Symbol │ Int64 │ Int64 │ | ||
├─────┼────────┼───────┼───────┤ | ||
│ 1 │ baz │ 2 │ 3 │ | ||
│ 2 │ baz │ 1 │ 6 │ | ||
julia> get(gd, (a=:bar,), nothing) | ||
2×3 SubDataFrame | ||
│ Row │ a │ b │ c │ | ||
│ │ Symbol │ Int64 │ Int64 │ | ||
├─────┼────────┼───────┼───────┤ | ||
│ 1 │ bar │ 1 │ 2 │ | ||
│ 2 │ bar │ 2 │ 5 │ | ||
julia> get(gd, (:baz,), nothing) | ||
2×3 SubDataFrame | ||
│ Row │ a │ b │ c │ | ||
│ │ Symbol │ Int64 │ Int64 │ | ||
├─────┼────────┼───────┼───────┤ | ||
│ 1 │ baz │ 2 │ 3 │ | ||
│ 2 │ baz │ 1 │ 6 │ | ||
julia> get(gd, (:qux,), nothing) | ||
``` | ||
""" | ||
function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
    # Look the group up via `getindex`; return `default` only when the lookup
    # fails with a `KeyError` (no group matches `key`).
    try
        return gd[key]
    catch err
        # `catch KeyError` would bind *any* exception to a local variable named
        # `KeyError` and silently swallow it, so test the exception type
        # explicitly and let unrelated errors propagate.
        err isa KeyError || rethrow()
        return default
    end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# | ||
# Definition and basic methods | ||
# | ||
|
||
""" | ||
GroupedDataFrame | ||
The result of a [`groupby`](@ref) operation on an `AbstractDataFrame`; a | ||
view into the `AbstractDataFrame` grouped by rows. | ||
Not meant to be constructed directly, see `groupby`. | ||
""" | ||
struct GroupedDataFrame{T<:AbstractDataFrame}
    parent::T            # data frame that was grouped
    cols::Vector{Int}    # indices of the columns used for grouping
    groups::Vector{Int}  # group index of each row
    idx::Vector{Int}     # indexing vector when grouped by the given columns
    starts::Vector{Int}  # position in `idx` where each group starts
    ends::Vector{Int}    # position in `idx` where each group ends
end
|
||
# Broadcasting over a GroupedDataFrame always throws
function Base.broadcastable(::GroupedDataFrame)
    throw(ArgumentError("broadcasting over `GroupedDataFrame`s is reserved"))
end
|
||
""" | ||
parent(gd::GroupedDataFrame) | ||
Return the parent data frame of `gd`. | ||
""" | ||
function Base.parent(gd::GroupedDataFrame)
    # Read the field through `getfield` rather than `gd.parent`
    return getfield(gd, :parent)
end
|
||
# Column names are those of the parent data frame
function Base.names(gd::GroupedDataFrame)
    return names(gd.parent)
end

# Internal counterpart of `names`, forwarded to the parent's `_names`
function _names(gd::GroupedDataFrame)
    return _names(gd.parent)
end
|
||
# Number of groups: one entry in `starts` per group
function Base.length(gd::GroupedDataFrame)
    return length(gd.starts)
end
|
||
|
||
# Iterate over groups in order; the state is the index of the next group to yield
function Base.iterate(gd::GroupedDataFrame, i=1)
    i > length(gd.starts) && return nothing
    rows = gd.idx[gd.starts[i]:gd.ends[i]]
    return (view(gd.parent, rows, :), i + 1)
end
|
||
|
||
# Build a new data frame from the rows of all groups of `gd`, taken group by group
# in group order. Only `copycols=true` is supported; `copycols=false` throws an
# `ArgumentError`.
function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
    if !copycols
        # Note the trailing space: the two literals are concatenated into one sentence
        throw(ArgumentError("It is not possible to construct a `DataFrame` " *
                            "from GroupedDataFrame with `copycols=false`"))
    end
    # No groups at all: return an empty data frame shaped like the parent
    length(gd) == 0 && return similar(parent(gd), 0)
    # Collect the parent row numbers of every group, one group after another
    idx = similar(gd.idx)
    doff = 1
    for (s, e) in zip(gd.starts, gd.ends)
        n = e - s + 1
        copyto!(idx, doff, gd.idx, s, n)
        doff += n
    end
    # Trim the buffer to the number of row numbers actually copied
    resize!(idx, doff - 1)
    return parent(gd)[idx, :]
end
|
||
|
||
# | ||
# Equality testing | ||
# | ||
|
||
# Equal when the grouping columns match, the number of groups matches and
# corresponding groups compare equal with `==`
function Base.:(==)(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
    gd1.cols == gd2.cols || return false
    length(gd1) == length(gd2) || return false
    return all(a == b for (a, b) in zip(gd1, gd2))
end
|
||
# Same checks as `==`, but every comparison is performed with `isequal`
function Base.isequal(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
    isequal(gd1.cols, gd2.cols) || return false
    isequal(length(gd1), length(gd2)) || return false
    return all(isequal(a, b) for (a, b) in zip(gd1, gd2))
end
|
||
|
||
# | ||
# Public utility funcs | ||
# | ||
|
||
""" | ||
groupindices(gd::GroupedDataFrame) | ||
Return a vector of group indices for each row of `parent(gd)`. | ||
Rows appearing in group `gd[i]` are attributed index `i`. Rows not present in | ||
any group are attributed `missing` (this can happen if `skipmissing=true` was | ||
passed when creating `gd`, or if `gd` is a subset from a larger [`GroupedDataFrame`](@ref)). | ||
""" | ||
function groupindices(gd::GroupedDataFrame)
    # A stored index of 0 is reported as `missing`
    return replace(gd.groups, 0 => missing)
end
|
||
""" | ||
groupvars(gd::GroupedDataFrame) | ||
Return a vector of column names in `parent(gd)` used for grouping. | ||
""" | ||
function groupvars(gd::GroupedDataFrame)
    # Pick the names of the grouping columns out of all column names
    colnames = _names(gd)
    return colnames[gd.cols]
end
|
||
|
||
# | ||
# Internal utility funcs | ||
# | ||
|
||
# Position of `name` among the grouping columns, or `nothing` when it is not one
# of them. With `strict=true` a missing name throws an ArgumentError instead.
function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool)
    pos = findfirst(n -> n == name, groupvars(gd))
    if pos === nothing && strict
        throw(ArgumentError("$name is not a grouping column"))
    end
    return pos
end
|
||
# Values of all grouping columns for group `i`, read from the group's first row
function _groupvalues(gd::GroupedDataFrame, i::Integer)
    firstrow = gd.idx[gd.starts[i]]
    return gd.parent[firstrow, gd.cols]
end
|
||
# Value of one grouping column for group `i`, read from the group's first row.
# `col` is a position within the grouping columns.
function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Integer)
    firstrow = gd.idx[gd.starts[i]]
    return gd.parent[firstrow, gd.cols[col]]
end

# Same, with the grouping column given by name; throws if `col` is not a grouping column
function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol)
    return _groupvalues(gd, i, _groupvar_idx(gd, col, true))
end
Oops, something went wrong.