diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index e49018abda..b77cc232cc 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -212,8 +212,6 @@ performance on par with integer indexing. The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent. * `gd[i::Integer]` -> Get the `i`th group. -* `gd[a::AbstractArray{<:Integer}]` -> Get a reduced `GroupedDataFrame` containing only the groups - with indices in `a`. * `gd[key::NamedTuple]` -> Get the group corresponding to the given values of the grouping columns. The fields of the `NamedTuple` must match the grouping columns columns passed to [`groupby`](@ref) (including order). @@ -223,3 +221,12 @@ The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent. * `gd[key::GroupKey]` -> Get the group corresponding to the [`GroupKey`](@ref) `key` (one of the elements of the vector returned by [`keys(::GroupedDataFrame)`](@ref)). This should be nearly as fast as integer indexing. +* `gd[a::AbstractVector]` -> Select multiple groups and return them in a new + `GroupedDataFrame` object. Groups may be selected by integer position using an + array of `Integer`s or `Bool`s, similar to a standard array. Alternatively, the + array may contain keys of any of the types supported for dictionary-like + indexing (`GroupKey`, `Tuple`, or `NamedTuple`). Selected groups must be + unique, and different types of indices cannot be mixed. +* `gd[n::Not]` -> Any of the above index types wrapped in `Not`. The result + will be a new `GroupedDataFrame` containing all groups in `gd` *not* selected + by the wrapped index. 
diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md index c60dbd2b7b..109e29c9c7 100644 --- a/docs/src/lib/types.md +++ b/docs/src/lib/types.md @@ -115,6 +115,7 @@ DataFrame DataFrameRow GroupedDataFrame GroupKey +GroupKeys SubDataFrame DataFrameRows DataFrameColumns diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index d6b18788a7..9f9220b9c5 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -75,6 +75,11 @@ function DataFrame(gd::GroupedDataFrame; copycols::Bool=true) parent(gd)[idx, :] end + +# +# Accessing group indices, columns, and values +# + """ groupindices(gd::GroupedDataFrame) @@ -111,7 +116,7 @@ _groupvalues(gd::GroupedDataFrame, i::Integer, col::Symbol) = # -# Length and iteration +# Vector interface and integer indexing # Base.length(gd::GroupedDataFrame) = gd.ngroups @@ -124,20 +129,21 @@ function Base.iterate(gd::GroupedDataFrame, i=1) end end - -# -# Vector interface and integer indexing -# - Compat.lastindex(gd::GroupedDataFrame) = gd.ngroups Base.first(gd::GroupedDataFrame) = gd[1] Base.last(gd::GroupedDataFrame) = gd[end] +# These have to be defined for some to_indices() logic to work, as long +# as GroupedDataFrame is not <: AbstractArray +Base.IndexStyle(::Type{<:GroupedDataFrame}) = IndexLinear() +Base.IndexStyle(::GroupedDataFrame) = IndexLinear() +Base.keys(::IndexLinear, gd::GroupedDataFrame) = Base.OneTo(length(gd)) + # Single integer indexing Base.getindex(gd::GroupedDataFrame, idx::Integer) = view(gd.parent, gd.idx[gd.starts[idx]:gd.ends[idx]], :) -# Array of integers +# Index with array of integers OR bools function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer}) new_starts = gd.starts[idxs] new_ends = gd.ends[idxs] @@ -155,14 +161,14 @@ function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer}) new_starts, new_ends, length(new_starts)) end -# Colon (creates copy) +# Index with colon 
(creates copy) Base.getindex(gd::GroupedDataFrame, idxs::Colon) = GroupedDataFrame(gd.parent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends, gd.ngroups) # -# Dictionary interface and indexing +# GroupKey and GroupKeys # """ @@ -245,6 +251,116 @@ Base.IndexStyle(::Type{<:GroupKeys}) = IndexLinear() end +# +# Non-standard indexing +# + +# Non-standard indexing relies on converting to integer indices first +# The full version (to_indices) is required rather than to_index even though +# GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not. +# Note that this behavior would be the default if it was <:AbstractArray +Base.getindex(gd::GroupedDataFrame, idx...) = getindex(gd, Base.to_indices(gd, idx)...) + +# The allowed key types for dictionary-like indexing +const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple} +# All allowed scalar index types +const GroupIndexTypes = Union{Integer, GroupKeyTypes} + +# Find integer index for dictionary keys +function Base.to_index(gd::GroupedDataFrame, key::GroupKey) + gd === parent(key) && return getfield(key, :idx) + throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from.")) +end + +function Base.to_index(gd::GroupedDataFrame, key::Tuple) + for i in 1:length(gd) + isequal(Tuple(_groupvalues(gd, i)), key) && return i + end + throw(KeyError(key)) +end + +function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N} + if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols)) + throw(KeyError(key)) + end + return Base.to_index(gd, Tuple(key)) +end + +# Array of (possibly non-standard) indices +function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T} + # A concrete eltype which is <: GroupKeyTypes, don't need to check + if isconcretetype(T) && T <: GroupKeyTypes + return [Base.to_index(gd, i) for i in idxs] + end + + # Edge case - array is empty + isempty(idxs) && return Int[] + + # Infer 
eltype based on type of first index, expect rest to match + idx1 = first(idxs) + E1 = typeof(idx1) + + E = if E1 <: Integer && E1 !== Bool + Integer + elseif E1 <: GroupKey + GroupKey + elseif E1 <: Tuple + Tuple + elseif E1 <: NamedTuple + NamedTuple + else + throw(ArgumentError("Invalid index: $idx1 of type $E1")) + end + + # Convert each index to integer format + ints = Vector{Int}(undef, length(idxs)) + for (i, idx) in enumerate(idxs) + if !(idx isa GroupIndexTypes) || idx isa Bool + throw(ArgumentError("Invalid index: $idx of type $(typeof(idx))")) + end + idx isa E || throw(ArgumentError("Mixed index types in array not allowed")) + ints[i] = Base.to_index(gd, idx) + end + + return ints +end + + +# +# Indexing with Not/InvertedIndex +# + +# InvertedIndex wrapping any other valid index type +# to_indices() is needed here rather than to_index() in order to override the +# to_indices(::Any, ::Tuple{Not}) methods defined in InvertedIndices.jl +function Base.to_indices(gd::GroupedDataFrame, (idx,)::Tuple{<:Not}) + (skip_idx,) = Base.to_indices(gd, (idx.skip,)) + idxs = Base.OneTo(length(gd))[Not(skip_idx)] + return (idxs,) +end + +# InvertedIndex wrapping a boolean array +# The definition above works but we need to define specialized methods to avoid +# ambiguity in dispatch +# The mask must have exactly one entry per group: plain boolean indexing throws a +# BoundsError on a length mismatch, and without the same check here a mask of the +# wrong length could silently select the wrong groups +function Base.to_indices(gd::GroupedDataFrame, + (idx,)::Tuple{Not{<:Union{BitArray{1}, Vector{Bool}}}}) + length(idx.skip) == length(gd) || throw(BoundsError(gd, idx)) + return (findall(!, idx.skip),) +end +function Base.to_indices(gd::GroupedDataFrame, + (idx,)::Tuple{Not{<:AbstractVector{Bool}}}) + length(idx.skip) == length(gd) || throw(BoundsError(gd, idx)) + return (findall(!, idx.skip),) +end + + +# +# Dictionary interface +# + """ keys(gd::GroupedDataFrame) @@ -331,28 +447,6 @@ true """ Base.keys(gd::GroupedDataFrame) = GroupKeys(gd) -# Index with GroupKey -function Base.getindex(gd::GroupedDataFrame, key::GroupKey) - gd === parent(key) && return gd[getfield(key, :idx)] - throw(ErrorException("Cannot use a GroupKey to index a GroupedDataFrame other than the one it was derived from.")) -end - -# Index with tuple -function 
Base.getindex(gd::GroupedDataFrame, key::Tuple) - for i in 1:length(gd) - isequal(Tuple(_groupvalues(gd, i)), key) && return gd[i] - end - throw(KeyError(key)) -end - -# Index with named tuple -function Base.getindex(gd::GroupedDataFrame, key::NamedTuple{N}) where {N} - if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols)) - throw(KeyError(key)) - end - return gd[Tuple(key)] -end - """ get(gd::GroupedDataFrame, key, default) diff --git a/test/grouping.jl b/test/grouping.jl index 9b3c5cddd6..3b7ed041bf 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -957,9 +957,12 @@ end @testset "iteration protocol" begin gd = groupby_checked(DataFrame(A = [:A, :A, :B, :B], B = 1:4), :A) + count = 0 for v in gd - @test size(v) == (2,2) + count += 1 + @test v ≅ gd[count] end + @test count == length(gd) end @testset "type stability of index fields" begin @@ -972,17 +975,30 @@ end @inferred ends(gd) == getfield(gd, :ends) end -@testset "getindex" begin +@testset "Array-like getindex" begin df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), b = 1:8) gd = groupby_checked(df, :a) + + # Invalid + @test_throws ArgumentError gd[true] + @test_throws ArgumentError gd[[1, 2, 1]] # Duplicate + @test_throws ArgumentError gd["a"] + + # Single integer @test gd[1] isa SubDataFrame @test gd[1] == view(df, [1, 5], :) @test_throws BoundsError gd[5] - @test_throws ArgumentError gd[true] - @test_throws ArgumentError gd[[1, 2, 1]] - @test_throws MethodError gd["a"] - gd2 = gd[[false, true, false, false]] + + # first, last, lastindex + @test first(gd) == gd[1] + @test last(gd) == gd[4] + @test lastindex(gd) == 4 + @test gd[end] == gd[4] + + # Boolean array + idx2 = [false, true, false, false] + gd2 = gd[idx2] @test length(gd2) == 1 @test gd2[1] == gd[2] @test_throws BoundsError gd[[true, false]] @@ -990,7 +1006,10 @@ end @test gd2.starts == [3] @test gd2.ends == [4] @test gd2.idx == gd.idx + @test gd[BitArray(idx2)] ≅ gd2 + @test gd[1:2][false:true] ≅ gd[[2]] 
# AbstractArray{Bool} + # Colon gd3 = gd[:] @test gd3 isa GroupedDataFrame @test length(gd3) == 4 @@ -998,17 +1017,28 @@ end for i in 1:4 @test gd3[i] == gd[i] end - gd4 = gd[[2,1]] + + # Integer array + idx4 = [2,1] + gd4 = gd[idx4] @test gd4 isa GroupedDataFrame @test length(gd4) == 2 - for i in 1:2 - @test gd4[i] == gd[3-i] + for (i, j) in enumerate(idx4) + @test gd4[i] == gd[j] end - @test_throws BoundsError gd[1:5] @test gd4.groups == [2, 1, 0, 0, 2, 1, 0, 0] @test gd4.starts == [3,1] @test gd4.ends == [4,2] @test gd4.idx == gd.idx + + # Infer eltype + @test gd[Array{Any}(idx4)] ≅ gd4 + # Mixed (non-Bool) integer types should work + @test gd[Any[idx4[1], Unsigned(idx4[2])]] ≅ gd4 + @test_throws ArgumentError gd[Any[2, true]] + + # Out-of-bounds + @test_throws BoundsError gd[1:5] end @testset "== and isequal" begin @@ -1445,6 +1475,105 @@ end ] end +@testset "GroupedDataFrame indexing with array of keys" begin + df = DataFrame(a = repeat([:A, :B, missing], outer=4), b = repeat(1:2, inner=6), c = 1:12) + gd = groupby_checked(df, [:a, :b]) + + ints = [4, 6, 2, 1] + gd2 = gd[ints] + gkeys = keys(gd)[ints] + + # Test with GroupKeys, Tuples, and NamedTuples + for converter in [identity, Tuple, NamedTuple] + a = converter.(gkeys) + @test gd[a] ≅ gd2 + + # Infer eltype + @test gd[Array{Any}(a)] ≅ gd2 + + # Duplicate keys + a2 = converter.(keys(gd)[[1, 2, 1]]) + @test_throws ArgumentError gd[a2] + end +end + +@testset "InvertedIndex with GroupedDataFrame" begin + df = DataFrame(a = repeat([:A, :B, missing], outer=4), b = repeat(1:2, inner=6), c = 1:12) + gd = groupby_checked(df, [:a, :b]) + + # Inverted scalar index + skip_i = 3 + skip_key = keys(gd)[skip_i] + expected = gd[[i != skip_i for i in 1:length(gd)]] + expected_inv = gd[[skip_i]] + + for skip in [skip_i, skip_key, Tuple(skip_key), NamedTuple(skip_key)] + @test gd[Not(skip)] ≅ expected + # Nested + @test gd[Not(Not(skip))] ≅ expected_inv + end + + @test_throws ArgumentError gd[Not(true)] # Bool <: Integer, but 
should fail + + # Inverted array index + skipped = [3, 5, 2] + skipped_bool = [i ∈ skipped for i in 1:length(gd)] + skipped_keys = keys(gd)[skipped] + expected2 = gd[.!skipped_bool] + expected2_inv = gd[skipped_bool] + + for skip in [skipped, skipped_keys, Tuple.(skipped_keys), NamedTuple.(skipped_keys)] + @test gd[Not(skip)] ≅ expected2 + # Infer eltype + @test gd[Not(Array{Any}(skip))] ≅ expected2 + # Nested + @test gd[Not(Not(skip))] ≅ expected2_inv + @test gd[Not(Not(Array{Any}(skip)))] ≅ expected2_inv + end + + # Mixed integer arrays + @test gd[Not(Any[Unsigned(skipped[1]), skipped[2:end]...])] ≅ expected2 + @test_throws ArgumentError gd[Not(Any[2, true])] + + # Boolean array + @test gd[Not(skipped_bool)] ≅ expected2 + @test gd[Not(Not(skipped_bool))] ≅ expected2_inv + @test gd[1:2][Not(false:true)] ≅ gd[[1]] # Not{AbstractArray{Bool}} + + # Inverted colon + @test gd[Not(:)] ≅ gd[Int[]] + @test gd[Not(Not(:))] ≅ gd +end + +@testset "GroupedDataFrame array index homogeneity" begin + df = DataFrame(a = repeat([:A, :B, missing], outer=4), b = repeat(1:2, inner=6), c = 1:12) + gd = groupby_checked(df, [:a, :b]) + + # All scalar index types + idxsets = [1:length(gd), keys(gd), Tuple.(keys(gd)), NamedTuple.(keys(gd))] + + # Mixing index types should fail + for (i, idxset1) in enumerate(idxsets) + idx1 = idxset1[1] + for (j, idxset2) in enumerate(idxsets) + i == j && continue + + idx2 = idxset2[2] + + # With Any eltype + a = Any[idx1, idx2] + @test_throws ArgumentError gd[a] + @test_throws ArgumentError gd[Not(a)] + + # Most specific applicable eltype, which is <: GroupKeyTypes + T = Union{typeof(idx1), typeof(idx2)} + a2 = T[idx1, idx2] + @test_throws ArgumentError gd[a2] + @test_throws ArgumentError gd[Not(a2)] + end + end +end + @testset "Parent DataFrame names changed" begin df = DataFrame(a = repeat([:A, :B, missing], outer=4), b = repeat([:X, :Y], inner=6), c = 1:12) gd = groupby_checked(df, [:a, :b])