Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix unique() behaviour, add unique!() #358

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 26 additions & 21 deletions src/array.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## Code for CategoricalArray

import Base: Array, convert, collect, copy, getindex, setindex!, similar, size,
unique, vcat, in, summary, float, complex, copyto!
unique, unique!, vcat, in, summary, float, complex, copyto!

# Helper used to compute the default value of keyword arguments:
# for a categorical array it simply reports `isordered(x)`.
function _isordered(x::AbstractCategoricalArray)
    return isordered(x)
end
Expand Down Expand Up @@ -868,31 +868,36 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::AbstractVector;
return A
end

function _unique(::Type{S},
refs::AbstractArray{T},
pool::CategoricalPool) where {S, T<:Integer}
nlevels = length(levels(pool)) + 1
order = fill(0, nlevels) # 0 indicates not seen
# If we don't track missings, short-circuit even if none has been seen
count = S >: Missing ? 0 : 1
@inbounds for i in refs
if order[i + 1] == 0
count += 1
order[i + 1] = count
count == nlevels && break
# return unique refs (each value is unique) in the order of appearance in `refs`
# equivalent to fallback Base.unique() implementation,
# but short-circuits once references to all levels are encountered
function _uniquerefs(A::CatArrOrSub{T}) where T
arefs = refs(A)
res = similar(arefs, 0)
nlevels = length(levels(A))
maxunique = nlevels + (T >: Missing ? 1 : 0)
seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref)
@inbounds for ref in arefs
if !seen[ref + 1]
push!(res, ref)
seen[ref + 1] = true
(length(res) == maxunique) && break
end
end
S[i == 1 ? missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0]
return res
end

"""
unique(A::CategoricalArray)
unique(A::CatArrOrSub{T}) where T =
CategoricalVector{T}(_uniquerefs(A), copy(pool(A)))

Return levels which appear in `A` in their order of appearance.
This function is significantly slower than [`levels`](@ref DataAPI.levels)
since it needs to check whether levels are used or not.
"""
unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool)
"""
    unique!(A::CategoricalVector)

Drop repeated entries from `A` in place, keeping one reference per distinct value
in order of first appearance (as computed by `_uniquerefs`), and return `A`.
Only the reference codes of `A` are rewritten.
"""
function unique!(A::CategoricalVector)
    kept = _uniquerefs(A)
    nkept = length(kept)
    # Equal lengths mean every entry was already distinct: nothing to rewrite
    nkept == length(A) && return A
    resize!(A.refs, nkept)
    copyto!(A.refs, kept)
    return A
end

"""
droplevels!(A::CategoricalArray)
Expand Down
7 changes: 0 additions & 7 deletions src/subarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren
# Setting the levels of a view is forwarded to the parent categorical array
function levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray}
    return levels!(parent(sa), newlevels)
end

# `unique` for a view of a categorical array: run `_unique` on the viewed part of
# the parent's refs, allowing `Missing` in the result type only when the parent's
# element type allows it.
function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray}
    parentarr = parent(sa)
    subrefs = view(parentarr.refs, sa.indices...)
    leveltype = eltype(levels(parentarr.pool))
    S = eltype(P) >: Missing ? Union{leveltype, Missing} : leveltype
    return _unique(S, subrefs, parentarr.pool)
end

# Reference codes of a view: a view into the parent's `refs` field at the same
# parent indices as `A`.
function refs(A::SubArray{<:Any, <:Any, <:CategoricalArray})
    parentrefs = parent(A).refs
    return view(parentrefs, parentindices(A)...)
end

Expand Down
23 changes: 21 additions & 2 deletions test/11_array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -272,6 +273,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -437,6 +439,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x[4] === CategoricalValue(x.pool, 4)
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)

x[1:2] .= -1
@test x[1] === CategoricalValue(x.pool, 5)
Expand Down Expand Up @@ -473,6 +476,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test unique(x) isa CategoricalVector{String, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -729,27 +733,42 @@ end
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) == ["Old", "Young", "Middle"]
@test typeof(unique(x)) === typeof(x)
@test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x
@test levels(x) == ["Young", "Middle", "Old", "Unused"]
@test unique(x) == ["Old", "Young", "Middle"]
@test levels!(x, ["Unused1", "Young", "Middle", "Old", "Unused2"]) === x
@test levels(x) == ["Unused1", "Young", "Middle", "Old", "Unused2"]
@test unique(x) == ["Old", "Young", "Middle"]

y = copy(x)
@test unique!(y) === y
@test y == unique(x)

x = CategoricalArray(String[])
@test isa(levels(x), Vector{String}) && isempty(levels(x))
@test isa(unique(x), Vector{String}) && isempty(unique(x))
@test isa(unique(x), typeof(x)) && isempty(unique(x))
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test isa(unique(x), Vector{String}) && isempty(unique(x))
@test isa(unique(x), typeof(x)) && isempty(unique(x))

y = copy(x)
@test unique!(y) === y
@test y == unique(x)

# To test short-circuiting
x = CategoricalArray(repeat(1:10, inner=10))
@test levels(x) == collect(1:10)
@test unique(x) == collect(1:10)
@test unique(x) isa typeof(x)
@test levels!(x, [19:-1:1; 20]) === x
@test levels(x) == [19:-1:1; 20]
@test unique(x) == collect(1:10)
@test unique(x) isa typeof(x)

y = copy(x)
@test unique!(y) === y
@test y == 1:10
end

end
12 changes: 12 additions & 0 deletions test/12_missingarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,14 @@ const ≅ = isequal
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

y = copy(x)
@test y === unique!(y)
@test y == unique(x)

@test convert(CategoricalArray, x) === x
@test convert(CategoricalArray{Union{String, Missing}}, x) === x
@test convert(CategoricalArray{Union{String, Missing}, 1}, x) === x
Expand Down Expand Up @@ -296,6 +301,7 @@ const ≅ = isequal
@test x ≅ a
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -440,6 +446,7 @@ const ≅ = isequal
@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -616,6 +623,7 @@ const ≅ = isequal
@test x[4] === CategoricalValue(x.pool, 4)
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)

x[1:2] .= -1
@test x[1] === CategoricalValue(x.pool, 5)
Expand All @@ -625,6 +633,7 @@ const ≅ = isequal
@test isordered(x) === false
@test levels(x) == vcat(unique(a), -1)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)


ordered!(x, ordered)
Expand Down Expand Up @@ -656,6 +665,7 @@ const ≅ = isequal
@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test unique(x) isa CategoricalVector{Union{String, Missing}, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -816,6 +826,7 @@ const ≅ = isequal
@test isordered(x) === ordered
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test unique(x) isa CategoricalVector{Union{String, Missing}, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -1137,6 +1148,7 @@ end
x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"])
@test levels(x) == ["Middle", "Old", "Young"]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
@test typeof(unique(x)) === typeof(x)
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
Expand Down
Loading