Skip to content

Commit

Permalink
Split groupeddataframe/grouping.jl into several files
Browse files Browse the repository at this point in the history
  • Loading branch information
jlumpe committed Dec 9, 2019
1 parent 025824f commit 72096ac
Show file tree
Hide file tree
Showing 5 changed files with 407 additions and 380 deletions.
6 changes: 5 additions & 1 deletion src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ include("abstractdataframe/abstractdataframe.jl")
include("dataframe/dataframe.jl")
include("subdataframe/subdataframe.jl")
include("dataframerow/dataframerow.jl")
include("groupeddataframe/grouping.jl")
include("groupeddataframe/groupeddataframe.jl")
include("dataframerow/utils.jl")

include("other/broadcasting.jl")
Expand All @@ -76,6 +76,10 @@ include("abstractdataframe/iteration.jl")
include("abstractdataframe/join.jl")
include("abstractdataframe/reshape.jl")

include("groupeddataframe/groupkeys.jl")
include("groupeddataframe/getindex.jl")
include("groupeddataframe/splitapplycombine.jl")

include("abstractdataframe/show.jl")
include("groupeddataframe/show.jl")
include("dataframerow/show.jl")
Expand Down
116 changes: 116 additions & 0 deletions src/groupeddataframe/getindex.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#
# Vector-like indexing
#

# Index of the last group; this is what `end` resolves to in `gd[end]`.
function Compat.lastindex(gd::GroupedDataFrame)
    return length(gd.starts)
end

# First group of `gd`, obtained through integer indexing.
function Base.first(gd::GroupedDataFrame)
    return gd[1]
end

# Last group of `gd`, obtained through integer indexing.
function Base.last(gd::GroupedDataFrame)
    return gd[end]
end

# Single integer: return the `idx`-th group as a view into the parent
function Base.getindex(gd::GroupedDataFrame, idx::Integer)
    # Positions `starts[idx]:ends[idx]` of `gd.idx` hold the parent row numbers
    # that make up this group.
    rows = gd.idx[gd.starts[idx]:gd.ends[idx]]
    return view(gd.parent, rows, :)
end

# Array of integers: return a new GroupedDataFrame restricted to the selected
# groups, in the order given by `idxs`
function Base.getindex(gd::GroupedDataFrame, idxs::AbstractArray{T}) where {T<:Integer}
    starts = gd.starts[idxs]
    stops = gd.ends[idxs]
    # The group starts are used to detect a group being selected more than once.
    allunique(starts) ||
        throw(ArgumentError("duplicates in idxs argument are not allowed"))
    # Rows that belong to none of the selected groups keep the group number 0.
    rowgroups = zeros(Int, length(gd.groups))
    for k in eachindex(starts)
        @inbounds for pos in starts[k]:stops[k]
            rowgroups[gd.idx[pos]] = k
        end
    end
    return GroupedDataFrame(gd.parent, gd.cols, rowgroups, gd.idx, starts, stops)
end

# Colon: wrap the same fields in a new GroupedDataFrame object
function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
    return GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends)
end


#
# Dictionary-like indexing
#

# GroupKey: only valid for the very GroupedDataFrame object the key belongs to
function Base.getindex(gd::GroupedDataFrame, key::GroupKey)
    if gd !== parent(key)
        throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
    end
    # `getfield` reads the raw `idx` field of the key.
    return gd[getfield(key, :idx)]
end

# Tuple: look up the first group whose grouping-column values equal `key`
function Base.getindex(gd::GroupedDataFrame, key::Tuple)
    # Linear scan over the groups, comparing with `isequal`.
    hit = findfirst(g -> isequal(Tuple(_groupvalues(gd, g)), key), 1:length(gd))
    hit === nothing && throw(KeyError(key))
    return gd[hit]
end

# NamedTuple: field names must match the grouping columns, in order; the lookup
# itself is delegated to the Tuple method
function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
    colnames = _names(gd)
    namesmatch = length(key) == length(gd.cols) &&
        all(name == colnames[col] for (name, col) in zip(N, gd.cols))
    namesmatch || throw(KeyError(key))
    return gd[Tuple(key)]
end

"""
    get(gd::GroupedDataFrame, key, default)

Get a group based on the values of the grouping columns, or return `default`
if indexing `gd` with `key` throws a `KeyError`.

`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same
order as the `cols` argument to `groupby`).

# Examples
```jldoctest
julia> df = DataFrame(a = repeat([:foo, :bar, :baz], outer=[2]),
                      b = repeat([2, 1], outer=[3]),
                      c = 1:6);

julia> gd = groupby(df, :a)
GroupedDataFrame with 3 groups based on key: a
First Group (2 rows): a = :foo
│ Row │ a      │ b     │ c     │
│     │ Symbol │ Int64 │ Int64 │
├─────┼────────┼───────┼───────┤
│ 1   │ foo    │ 2     │ 1     │
│ 2   │ foo    │ 1     │ 4     │
Last Group (2 rows): a = :baz
│ Row │ a      │ b     │ c     │
│     │ Symbol │ Int64 │ Int64 │
├─────┼────────┼───────┼───────┤
│ 1   │ baz    │ 2     │ 3     │
│ 2   │ baz    │ 1     │ 6     │

julia> get(gd, (a=:bar,), nothing)
2×3 SubDataFrame
│ Row │ a      │ b     │ c     │
│     │ Symbol │ Int64 │ Int64 │
├─────┼────────┼───────┼───────┤
│ 1   │ bar    │ 1     │ 2     │
│ 2   │ bar    │ 2     │ 5     │

julia> get(gd, (:baz,), nothing)
2×3 SubDataFrame
│ Row │ a      │ b     │ c     │
│     │ Symbol │ Int64 │ Int64 │
├─────┼────────┼───────┼───────┤
│ 1   │ baz    │ 2     │ 3     │
│ 2   │ baz    │ 1     │ 6     │

julia> get(gd, (:qux,), nothing)
```
"""
function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
    try
        return gd[key]
    catch err
        # `catch KeyError` would not filter by type: it binds *any* exception to
        # a local variable named `KeyError`. Check the type explicitly so that
        # only a failed lookup yields `default`; every other error propagates.
        err isa KeyError || rethrow()
        return default
    end
end
121 changes: 121 additions & 0 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#
# Definition and basic methods
#

"""
    GroupedDataFrame

Row-wise grouped view into an `AbstractDataFrame`, as produced by a
[`groupby`](@ref) operation.

Not meant to be constructed directly, see `groupby`.
"""
struct GroupedDataFrame{T<:AbstractDataFrame}
    parent::T
    cols::Vector{Int}    # indices of the parent columns used for grouping
    groups::Vector{Int}  # group number of each parent row
    idx::Vector{Int}     # parent row numbers arranged so each group is contiguous
    starts::Vector{Int}  # position in `idx` where each group begins
    ends::Vector{Int}    # position in `idx` where each group finishes
end

# Broadcasting over a GroupedDataFrame is deliberately rejected.
function Base.broadcastable(::GroupedDataFrame)
    throw(ArgumentError("broadcasting over `GroupedDataFrame`s is reserved"))
end

"""
    parent(gd::GroupedDataFrame)

Return the data frame that `gd` was created from.
"""
function Base.parent(gd::GroupedDataFrame)
    # `getfield` reads the raw struct field directly.
    return getfield(gd, :parent)
end

# Column names are those of the parent data frame.
function Base.names(gd::GroupedDataFrame)
    return names(gd.parent)
end

# Internal counterpart of `names`, forwarded to the parent's `_names`.
function _names(gd::GroupedDataFrame)
    return _names(gd.parent)
end

# Number of groups: one entry of `starts` per group.
function Base.length(gd::GroupedDataFrame)
    return length(gd.starts)
end


# Iterate over the groups in order; each item is a view into the parent and the
# iteration state is the number of the next group.
function Base.iterate(gd::GroupedDataFrame, i=1)
    i > length(gd.starts) && return nothing
    rows = gd.idx[gd.starts[i]:gd.ends[i]]
    return (view(gd.parent, rows, :), i+1)
end


# Build a new data frame from the rows of `parent(gd)` that belong to the groups
# of `gd`, concatenated group by group.
#
# `copycols=false` is rejected with an `ArgumentError`. When `gd` has no groups,
# `similar(parent(gd), 0)` is returned.
function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
    if !copycols
        # The trailing space keeps the two concatenated literals from running
        # together ("`DataFrame`from ...") in the error message.
        throw(ArgumentError("It is not possible to construct a `DataFrame` " *
                            "from GroupedDataFrame with `copycols=false`"))
    end
    length(gd) == 0 && return similar(parent(gd), 0)
    # Collect the parent row numbers of every group, one group after another.
    idx = similar(gd.idx)
    doff = 1
    for (s, e) in zip(gd.starts, gd.ends)
        n = e - s + 1
        copyto!(idx, doff, gd.idx, s, n)
        doff += n
    end
    # The groups may cover fewer rows than `gd.idx` holds; drop the unused tail.
    resize!(idx, doff - 1)
    return parent(gd)[idx, :]
end


#
# Equality testing
#

# Two GroupedDataFrames compare equal when they use the same grouping column
# indices, have the same number of groups, and their groups are pairwise `==`.
function Base.:(==)(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
    gd1.cols == gd2.cols || return false
    length(gd1) == length(gd2) || return false
    return all(g1 == g2 for (g1, g2) in zip(gd1, gd2))
end

# Same comparison as `==`, but every step uses `isequal`.
function Base.isequal(gd1::GroupedDataFrame, gd2::GroupedDataFrame)
    isequal(gd1.cols, gd2.cols) || return false
    isequal(length(gd1), length(gd2)) || return false
    return all(isequal(g1, g2) for (g1, g2) in zip(gd1, gd2))
end


#
# Public utility funcs
#

"""
    groupindices(gd::GroupedDataFrame)

Return a vector holding, for each row of `parent(gd)`, the index of the group
it belongs to.

A row that is part of group `gd[i]` gets index `i`. A row that is part of no
group gets `missing` (this can happen if `skipmissing=true` was passed when
creating `gd`, or if `gd` is a subset from a larger [`GroupedDataFrame`](@ref)).
"""
function groupindices(gd::GroupedDataFrame)
    # Group number 0 is the internal marker for "row belongs to no group".
    return replace(gd.groups, 0=>missing)
end

"""
    groupvars(gd::GroupedDataFrame)

Return a vector with the names of the columns of `parent(gd)` that `gd` is
grouped by.
"""
function groupvars(gd::GroupedDataFrame)
    colnames = _names(gd)
    return colnames[gd.cols]
end


#
# Internal utility funcs
#

# Position of `name` among the grouping columns. When `name` is not a grouping
# column, throw if `strict` is true and return `nothing` otherwise.
function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool)
    pos = findfirst(==(name), groupvars(gd))
    if strict && pos === nothing
        throw(ArgumentError("$name is not a grouping column"))
    end
    return pos
end

# Values of all grouping columns for group `i`, read from the group's first row
function _groupvalues(gd::GroupedDataFrame, i::Integer)
    firstrow = gd.idx[gd.starts[i]]
    return gd.parent[firstrow, gd.cols]
end

# Value of one grouping column for group `i`; `col` is a position within the
# grouping columns
function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Integer)
    firstrow = gd.idx[gd.starts[i]]
    return gd.parent[firstrow, gd.cols[col]]
end

# Same, with the grouping column given by name; throws if `col` is not a
# grouping column
function _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol)
    return _groupvalues(gd, i, _groupvar_idx(gd, col, true))
end
Loading

0 comments on commit 72096ac

Please sign in to comment.