Skip to content

Commit

Permalink
Index GroupedDataFrame with array of keys (#2046)
Browse files Browse the repository at this point in the history
  • Loading branch information
jlumpe authored Feb 1, 2020
1 parent bd9b394 commit 46a4ca8
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 43 deletions.
11 changes: 9 additions & 2 deletions docs/src/lib/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,6 @@ performance on par with integer indexing.
The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent.

* `gd[i::Integer]` -> Get the `i`th group.
* `gd[a::AbstractArray{<:Integer}]` -> Get a reduced `GroupedDataFrame` containing only the groups
with indices in `a`.
* `gd[key::NamedTuple]` -> Get the group corresponding to the given values of the
grouping columns. The fields of the `NamedTuple` must match the grouping columns
columns passed to [`groupby`](@ref) (including order).
Expand All @@ -223,3 +221,12 @@ The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent.
* `gd[key::GroupKey]` -> Get the group corresponding to the [`GroupKey`](@ref)
`key` (one of the elements of the vector returned by [`keys(::GroupedDataFrame)`](@ref)).
This should be nearly as fast as integer indexing.
* `gd[a::AbstractVector]` -> Select multiple groups and return them in a new
`GroupedDataFrame` object. Groups may be selected by integer position using an
array of `Integer`s or `Bool`s, similar to a standard array. Alternatively the
array may contain keys of any of the types supported for dictionary-like
indexing (`GroupKey`, `Tuple`, or `NamedTuple`). Selected groups must be
unique, and different types of indices cannot be mixed.
* `gd[n::Not]` -> Any of the above types wrapped in `Not`. The result
will be a new `GroupedDataFrame` containing all groups in `gd` *not* selected
by the wrapped index.
1 change: 1 addition & 0 deletions docs/src/lib/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ DataFrame
DataFrameRow
GroupedDataFrame
GroupKey
GroupKeys
SubDataFrame
DataFrameRows
DataFrameColumns
Expand Down
151 changes: 120 additions & 31 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
parent(gd)[idx, :]
end


#
# Accessing group indices, columns, and values
#

"""
groupindices(gd::GroupedDataFrame)
Expand Down Expand Up @@ -111,7 +116,7 @@ _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol) =


#
# Length and iteration
# Vector interface and integer indexing
#

Base.length(gd::GroupedDataFrame) = gd.ngroups
Expand All @@ -124,20 +129,21 @@ function Base.iterate(gd::GroupedDataFrame, i=1)
end
end


#
# Vector interface and integer indexing
#

Compat.lastindex(gd::GroupedDataFrame) = gd.ngroups
Base.first(gd::GroupedDataFrame) = gd[1]
Base.last(gd::GroupedDataFrame) = gd[end]

# These have to be defined for some to_indices() logic to work, as long
# as GroupedDataFrame is not <: AbstractArray
Base.IndexStyle(::Type{<:GroupedDataFrame}) = IndexLinear()
Base.IndexStyle(::GroupedDataFrame) = IndexLinear()
Base.keys(::IndexLinear, gd::GroupedDataFrame) = Base.OneTo(length(gd))

# Single integer indexing
Base.getindex(gd::GroupedDataFrame, idx::Integer) =
view(gd.parent, gd.idx[gd.starts[idx]:gd.ends[idx]], :)

# Array of integers
# Index with array of integers OR bools
function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
new_starts = gd.starts[idxs]
new_ends = gd.ends[idxs]
Expand All @@ -155,14 +161,14 @@ function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
new_starts, new_ends, length(new_starts))
end

# Colon (creates copy)
# Index with colon (creates copy)
Base.getindex(gd::GroupedDataFrame, idxs::Colon) =
GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx,
gd.starts, gd.ends, gd.ngroups)


#
# Dictionary interface and indexing
# GroupKey and GroupKeys
#

"""
Expand Down Expand Up @@ -245,6 +251,111 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear()
end


#
# Non-standard indexing
#

# Non-standard indexing relies on converting to integer indices first
# The full version (to_indices) is required rather than to_index even though
# GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not.
# Note that this behavior would be the default if it was <:AbstractArray
Base.getindex(gd::GroupedDataFrame, idx...) = getindex(gd, Base.to_indices(gd, idx)...)

# The allowed key types for dictionary-like indexing
const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple}
# All allowed scalar index types
const GroupIndexTypes = Union{Integer, GroupKeyTypes}

# Find integer index for dictionary keys
function Base.to_index(gd::GroupedDataFrame, key::GroupKey)
gd === parent(key) && return getfield(key, :idx)
throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
end

function Base.to_index(gd::GroupedDataFrame, key::Tuple)
for i in 1:length(gd)
isequal(Tuple(_groupvalues(gd, i)), key) && return i
end
throw(KeyError(key))
end

function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
throw(KeyError(key))
end
return Base.to_index(gd, Tuple(key))
end

# Array of (possibly non-standard) indices
function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
# A concrete eltype which is <: GroupKeyTypes, don't need to check
if isconcretetype(T) && T <: GroupKeyTypes
return [Base.to_index(gd, i) for i in idxs]
end

# Edge case - array is empty
isempty(idxs) && return Int[]

# Infer eltype based on type of first index, expect rest to match
idx1 = idxs[1]
E1 = typeof(idx1)

E = if E1 <: Integer && E1 !== Bool
Integer
elseif E1 <: GroupKey
GroupKey
elseif E1 <: Tuple
Tuple
elseif E1 <: NamedTuple
NamedTuple
else
throw(ArgumentError("Invalid index: $idx1 of type $E1"))
end

# Convert each index to integer format
ints = Vector{Int}(undef, length(idxs))
for (i, idx) in enumerate(idxs)
if !(idx isa GroupIndexTypes) || idx isa Bool
throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))"))
end
idx isa E || throw(ArgumentError("Mixed index types in array not allowed"))
ints[i] = Base.to_index(gd, idx)
end

return ints
end


#
# Indexing with Not/InvertedIndex
#

# InvertedIndex wrapping any other valid index type
# to_indices() is needed here rather than to_index() in order to override the
# to_indices(::Any, ::Tuple{Not}) methods defined in InvertedIndices.jl
function Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{<:Not})
(skip_idx,) = Base.to_indices(gd, (idx.skip,))
idxs = Base.OneTo(length(gd))[Not(skip_idx)]
return (idxs,)
end

# InvertedIndex wrapping a boolean array
# The definition above works but we need to define specialized methods to avoid
# ambiguity in dispatch
function Base.to_indices(gd::GroupedDataFrame,
(idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}})
(findall(!, idx.skip),)
end
function Base.to_indices(gd::GroupedDataFrame,
(idx,)::Tuple{Not{<:AbstractVector{Bool}}})
(findall(!, idx.skip),)
end


#
# Dictionary interface
#

"""
keys(gd::GroupedDataFrame)
Expand Down Expand Up @@ -331,28 +442,6 @@ true
"""
Base.keys(gd::GroupedDataFrame) = GroupKeys(gd)

# Index with GroupKey
function Base.getindex(gd::GroupedDataFrame, key::GroupKey)
gd === parent(key) && return gd[getfield(key, :idx)]
throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
end

# Index with tuple
function Base.getindex(gd::GroupedDataFrame, key::Tuple)
for i in 1:length(gd)
isequal(Tuple(_groupvalues(gd, i)), key) && return gd[i]
end
throw(KeyError(key))
end

# Index with named tuple
function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
throw(KeyError(key))
end
return gd[Tuple(key)]
end

"""
get(gd::GroupedDataFrame, key, default)
Expand Down
Loading

0 comments on commit 46a4ca8

Please sign in to comment.