Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index GroupedDataFrame with array of keys #2046

Merged
merged 13 commits into from
Feb 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions docs/src/lib/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,6 @@ performance on par with integer indexing.
The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent.

* `gd[i::Integer]` -> Get the `i`th group.
* `gd[a::AbstractArray{<:Integer}]` -> Get a reduced `GroupedDataFrame` containing only the groups
with indices in `a`.
* `gd[key::NamedTuple]` -> Get the group corresponding to the given values of the
grouping columns. The fields of the `NamedTuple` must match the grouping
columns passed to [`groupby`](@ref) (including order).
Expand All @@ -223,3 +221,12 @@ The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent.
* `gd[key::GroupKey]` -> Get the group corresponding to the [`GroupKey`](@ref)
`key` (one of the elements of the vector returned by [`keys(::GroupedDataFrame)`](@ref)).
This should be nearly as fast as integer indexing.
* `gd[a::AbstractVector]` -> Select multiple groups and return them in a new
`GroupedDataFrame` object. Groups may be selected by integer position using an
array of `Integer`s or `Bool`s, similar to a standard array. Alternatively the
array may contain keys of any of the types supported for dictionary-like
indexing (`GroupKey`, `Tuple`, or `NamedTuple`). Selected groups must be
unique, and different types of indices cannot be mixed.
* `gd[n::Not]` -> Any of the above types wrapped in `Not`. The result
will be a new `GroupedDataFrame` containing all groups in `gd` *not* selected
by the wrapped index.
1 change: 1 addition & 0 deletions docs/src/lib/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ DataFrame
DataFrameRow
GroupedDataFrame
GroupKey
GroupKeys
SubDataFrame
DataFrameRows
DataFrameColumns
Expand Down
151 changes: 120 additions & 31 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
parent(gd)[idx, :]
end


#
# Accessing group indices, columns, and values
#

"""
groupindices(gd::GroupedDataFrame)

Expand Down Expand Up @@ -111,7 +116,7 @@ _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol) =


#
# Length and iteration
# Vector interface and integer indexing
#

# The length of a GroupedDataFrame is its stored `ngroups` field
Base.length(gd::GroupedDataFrame) = gd.ngroups
Expand All @@ -124,20 +129,21 @@ function Base.iterate(gd::GroupedDataFrame, i=1)
end
end


#
# Vector interface and integer indexing
#

# `end` resolves to `gd.ngroups`; `first` and `last` defer to integer indexing
Compat.lastindex(gd::GroupedDataFrame) = gd.ngroups
Base.first(gd::GroupedDataFrame) = gd[1]
Base.last(gd::GroupedDataFrame) = gd[end]

# These have to be defined for some to_indices() logic to work, as long
# as GroupedDataFrame is not <: AbstractArray
# Linear indexing is declared for both the type and its instances
Base.IndexStyle(::Type{<:GroupedDataFrame}) = IndexLinear()
Base.IndexStyle(::GroupedDataFrame) = IndexLinear()
# Under linear indexing the keys are the positions 1:length(gd)
Base.keys(::IndexLinear, gd::GroupedDataFrame) = Base.OneTo(length(gd))

# Single integer indexing: view of the parent restricted to the rows of group `idx`
function Base.getindex(gd::GroupedDataFrame, idx::Integer)
    rows = gd.idx[gd.starts[idx]:gd.ends[idx]]
    return view(gd.parent, rows, :)
end

# Array of integers
# Index with array of integers OR bools
function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
new_starts = gd.starts[idxs]
new_ends = gd.ends[idxs]
Expand All @@ -155,14 +161,14 @@ function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
new_starts, new_ends, length(new_starts))
end

# Colon (creates copy)
# Index with colon (creates a new GroupedDataFrame wrapping the same fields)
function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
    return GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx,
                            gd.starts, gd.ends, gd.ngroups)
end


#
# Dictionary interface and indexing
# GroupKey and GroupKeys
#

"""
Expand Down Expand Up @@ -245,6 +251,111 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear()
end


#
# Non-standard indexing
#

# Non-standard indexing relies on converting to integer indices first.
# The full version (to_indices) is required rather than to_index, even though
# GroupedDataFrame behaves as a 1D array, due to the behavior of Colon and Not.
# Note that this behavior would be the default if it was <: AbstractArray.
# Fallback: convert the indices with Base.to_indices, then dispatch to getindex again.
Base.getindex(gd::GroupedDataFrame, idx...) = getindex(gd, Base.to_indices(gd, idx)...)

# The allowed key types for dictionary-like indexing
const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple}
# All allowed scalar index types: integer positions plus the key types above
const GroupIndexTypes = Union{Integer, GroupKeyTypes}

# Find integer index for dictionary keys
# A GroupKey resolves to its stored `idx` field, but only when it was derived
# from this very GroupedDataFrame (identity check, not equality)
function Base.to_index(gd::GroupedDataFrame, key::GroupKey)
    if gd !== parent(key)
        throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
    end
    return getfield(key, :idx)
end

# Tuple key: scan the groups in order and return the position of the first one
# whose grouping values are `isequal` to `key`; throw KeyError if none matches
function Base.to_index(gd::GroupedDataFrame, key::Tuple)
    pos = findfirst(i -> isequal(Tuple(_groupvalues(gd, i)), key), 1:length(gd))
    pos === nothing && throw(KeyError(key))
    return pos
end

# NamedTuple key: the field names must match the grouping column names in
# number and order, otherwise KeyError; the values are then looked up as a Tuple
function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
    names_match = length(key) == length(gd.cols) &&
        all(n == _names(gd)[c] for (n, c) in zip(N, gd.cols))
    names_match || throw(KeyError(key))
    return Base.to_index(gd, Tuple(key))
end

# Array of (possibly non-standard) indices
#
# Convert a vector of group indices into a `Vector{Int}` of group positions.
# All elements must be of the same kind (`Integer`, `GroupKey`, `Tuple` or
# `NamedTuple`); `Bool` elements are rejected here.
#
# Throws `ArgumentError` for an invalid element or for mixed index kinds, and
# propagates whatever the scalar `to_index` methods throw for individual keys.
function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
    # A concrete eltype which is <: GroupKeyTypes, don't need to check.
    # The typed comprehension keeps the result a Vector{Int} even when `idxs`
    # is empty, instead of relying on inference for the element type.
    if isconcretetype(T) && T <: GroupKeyTypes
        return Int[Base.to_index(gd, i) for i in idxs]
    end

    # Edge case - array is empty
    isempty(idxs) && return Int[]

    # Infer eltype based on type of first index, expect rest to match.
    # `first` is used rather than `idxs[1]` so that vectors with non-1-based
    # axes work as well.
    idx1 = first(idxs)
    E1 = typeof(idx1)

    E = if E1 <: Integer && E1 !== Bool
        Integer
    elseif E1 <: GroupKey
        GroupKey
    elseif E1 <: Tuple
        Tuple
    elseif E1 <: NamedTuple
        NamedTuple
    else
        throw(ArgumentError("Invalid index: $idx1 of type $E1"))
    end

    # Convert each index to integer format
    ints = Vector{Int}(undef, length(idxs))
    for (i, idx) in enumerate(idxs)
        # Report invalid elements separately from valid-but-mismatched ones
        if !(idx isa GroupIndexTypes) || idx isa Bool
            throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))"))
        end
        idx isa E || throw(ArgumentError("Mixed index types in array not allowed"))
        ints[i] = Base.to_index(gd, idx)
    end

    return ints
end


#
# Indexing with Not/InvertedIndex
#

# InvertedIndex wrapping any other valid index type
# to_indices() is needed here rather than to_index() in order to override the
# to_indices(::Any, ::Tuple{Not}) methods defined in InvertedIndices.jl
function Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{<:Not})
    # Resolve the wrapped index first, then drop those positions from 1:length(gd)
    skipped = Base.to_indices(gd, (idx.skip,))[1]
    kept = Base.OneTo(length(gd))[Not(skipped)]
    return (kept,)
end

# InvertedIndex wrapping a boolean array
# The definition above works but we need to define specialized methods to avoid
# ambiguity in dispatch
function Base.to_indices(gd::GroupedDataFrame,
                         (idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}})
    # The selected positions are those where the wrapped mask is false
    return (findall(!, idx.skip),)
end
# Inverted boolean mask of any AbstractVector{Bool} type: select the positions
# where the wrapped mask is false
function Base.to_indices(gd::GroupedDataFrame,
                         (idx,)::Tuple{Not{<:AbstractVector{Bool}}})
    kept = findall(!, idx.skip)
    return (kept,)
end


#
# Dictionary interface
#

"""
keys(gd::GroupedDataFrame)

Expand Down Expand Up @@ -331,28 +442,6 @@ true
"""
Base.keys(gd::GroupedDataFrame) = GroupKeys(gd)

# Index with GroupKey
# Only keys derived from this very GroupedDataFrame are accepted (identity check)
function Base.getindex(gd::GroupedDataFrame, key::GroupKey)
    if gd !== parent(key)
        throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from."))
    end
    return gd[getfield(key, :idx)]
end

# Index with tuple
# Scan the groups in order; return the first one whose grouping values are
# `isequal` to `key`, or throw KeyError if none matches
function Base.getindex(gd::GroupedDataFrame, key::Tuple)
    pos = findfirst(i -> isequal(Tuple(_groupvalues(gd, i)), key), 1:length(gd))
    pos === nothing && throw(KeyError(key))
    return gd[pos]
end

# Index with named tuple
# Field names must match the grouping column names in number and order,
# otherwise KeyError; the values are then looked up as a plain tuple
function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
    names_match = length(key) == length(gd.cols) &&
        all(n == _names(gd)[c] for (n, c) in zip(N, gd.cols))
    names_match || throw(KeyError(key))
    return gd[Tuple(key)]
end

"""
get(gd::GroupedDataFrame, key, default)

Expand Down
Loading