Skip to content

Commit

Permalink
Add specialized vcat() method for CategoricalArray (#18)
Browse files Browse the repository at this point in the history
More efficient and merges levels intelligently.
  • Loading branch information
gustafsson authored and nalimilan committed Sep 26, 2016
1 parent b176bb4 commit 3b687af
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 1 deletion.
37 changes: 36 additions & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## Common code for CategoricalArray and NullableCategoricalArray

import Base: convert, copy, getindex, setindex!, similar, size, linearindexing
import Base: convert, copy, getindex, setindex!, similar, size, linearindexing, vcat

# Used for keyword argument default value
_ordered(x::AbstractCategoricalArray) = ordered(x)
Expand Down Expand Up @@ -215,6 +215,14 @@ end
R = reftype(length(index(A.pool)))
convert($A{T, N, R}, A)
end

# Concatenate any number of arrays of this type along the first dimension.
#
# The levels of all inputs are merged with `mergelevels`, every input's
# reference codes are translated to positions in the merged level vector, and
# the translated codes are concatenated into a `DefaultRefType` array.
# The ordered flag handed to the new `CategoricalPool` is true only when
# `mergelevels` reports a consistent order and `ordered` holds for every input.
function vcat(A::$A...)
    newlevels, isordered = mergelevels(map(levels, A)...)

    refs = map(A) do a
        # lookup[k] is the position in `newlevels` of the k-th entry of a's pool index
        lookup = indexin(index(a.pool), newlevels)
        # A zero code cannot index `lookup` (it would raise a BoundsError),
        # so it is carried over unchanged.
        # NOTE(review): presumably 0 marks a null/unassigned entry -- confirm.
        [r == 0 ? 0 : lookup[r] for r in a.refs]
    end

    return $A(DefaultRefType[refs...;],
              CategoricalPool(newlevels, isordered && all(ordered, A)))
end
end
end

Expand Down Expand Up @@ -351,3 +359,30 @@ function getindex(A::CategoricalArray, i::Int)
end

# Forward the array and the new level vector to `_levels!` and return its result.
function levels!(A::CategoricalArray, newlevels::Vector)
    return _levels!(A, newlevels)
end

# Combine several level vectors into a single vector containing every level once.
#
# Returns a tuple `(merged, consistent)`. `consistent` stays true as long as, for
# each input taken in turn, the levels already present in `merged` appear in the
# same relative order inside that input. While it is true, unseen levels are
# spliced in just before the next already-known level of their input (or at the
# end); once it becomes false, unseen levels are simply appended.
function mergelevels(levels...)
    T = Base.promote_eltype(levels...)
    merged = Vector{T}()
    consistent = true

    for lv in levels
        # Position of every level merged so far
        position = Dict{T,Int}()
        for (k, v) in enumerate(merged)
            position[v] = k
        end
        # found[j] is the position of lv[j] in `merged`, or 0 when it is not there yet
        found = Int[get(position, x, 0) for x in lv]

        consistent = consistent & issorted(filter(p -> p != 0, found))
        if consistent
            # Walk the input backwards so that an insertion never shifts the
            # position of a known level that is still to be visited.
            slot = length(merged) + 1
            j = length(lv)
            while j >= 1
                if found[j] == 0
                    insert!(merged, slot, lv[j])
                else
                    slot = found[j]
                end
                j -= 1
            end
        else
            # Orders conflict: stop trying to interleave, just add the unseen levels
            append!(merged, lv[found .== 0])
        end
    end

    return merged, consistent
end
61 changes: 61 additions & 0 deletions test/11_array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,67 @@ for isordered in (false, true)
@test x[1] === x.pool.valindex[3]
@test x[2] === x.pool.valindex[1]
@test levels(x) == ["c", "a", "b"]


# Tests of vcat of CategoricalArray
# Test that vcat of compact arrays uses a reftype that doesn't overflow
a1 = 3:200
a2 = 300:-1:100
ca1 = CategoricalArray(a1)
ca2 = CategoricalArray(a2)
cca1 = compact(ca1)
cca2 = compact(ca2)
r = vcat(cca1, cca2)
@test r == vcat(a1, a2)
@test isa(cca1, CategoricalArray{Int,1,UInt8})
@test isa(cca2, CategoricalArray{Int,1,UInt8})
@test isa(r, CategoricalArray{Int,1,CategoricalArrays.DefaultRefType})
@test isa(vcat(cca1, ca2), CategoricalArray{Int,1,CategoricalArrays.DefaultRefType})
@test ordered(r) == false
@test levels(r) == collect(3:300)

# Test vcat of multidimensional arrays
a1 = Array{Int}(2,3,4,5)
a2 = Array{Int}(3,3,4,5)
a1[1:end] = (length(a1):-1:1) + 2
a2[1:end] = (1:length(a2)) + 10
ca1 = CategoricalArray(a1)
ca2 = CategoricalArray(a2)
cca1 = compact(ca1)
cca2 = compact(ca2)
r = vcat(cca1, cca2)
@test r == vcat(a1, a2)
@test isa(r, CategoricalArray{Int,4,CategoricalArrays.DefaultRefType})
@test ordered(r) == false
@test levels(r) == collect(3:length(a2)+10)

# Test that mergelevels handles mutually compatible and incompatible orderings
@test CategoricalArrays.mergelevels([6,3,4,7],[2,3,5,4],[2,4,8]) == ([6,2,3,5,4,7,8],true)
@test CategoricalArrays.mergelevels([6,3,4,7],[2,3,6,5,4],[2,4,8]) == ([6,3,4,7,2,5,8],false)

# Test concatenation of mutually compatible levels
a1 = ["Young", "Middle"]
a2 = ["Middle", "Old"]
ca1 = CategoricalArray(a1, ordered=true)
ca2 = CategoricalArray(a2, ordered=true)
levels!(ca1, ["Young", "Middle"])
levels!(ca2, ["Middle", "Old"])
r = vcat(ca1, ca2)
@test r == vcat(a1, a2)
@test levels(r) == ["Young", "Middle", "Old"]
@test ordered(r) == true

# Test concatenation of conflicting ordering. This drops the ordering
a1 = ["Old", "Young", "Young"]
a2 = ["Old", "Young", "Middle", "Young"]
ca1 = CategoricalArray(a1, ordered=true)
ca2 = CategoricalArray(a2, ordered=true)
levels!(ca1, ["Young", "Middle", "Old"])
# ca2 has another order
r = vcat(ca1, ca2)
@test r == vcat(a1, a2)
@test levels(r) == ["Young", "Middle", "Old"]
@test ordered(r) == false
end
end
end
Expand Down

0 comments on commit 3b687af

Please sign in to comment.