diff --git a/.travis.yml b/.travis.yml index 9bd5550..169a758 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ os: - osx - linux julia: - - 0.5 + - 0.6 - nightly notifications: email: false diff --git a/README.md b/README.md index af72246..99ba24a 100644 --- a/README.md +++ b/README.md @@ -2,69 +2,78 @@ [![Build Status](https://travis-ci.org/nalimilan/FreqTables.jl.svg?branch=master)](https://travis-ci.org/nalimilan/FreqTables.jl) [![Coverage Status](https://coveralls.io/repos/nalimilan/FreqTables.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/nalimilan/FreqTables.jl?branch=master) -[![FreqTables](http://pkg.julialang.org/badges/FreqTables_0.5.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=0.5) [![FreqTables](http://pkg.julialang.org/badges/FreqTables_0.6.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=0.6) This package allows computing one- or multi-way frequency tables (a.k.a. contingency or pivot tables) from -any type of vector or array. It includes support for [`PooledDataArray`s](https://github.com/JuliaStats/DataArrays.jl) -and [`DataFrame`s](https://github.com/JuliaStats/DataFrames.jl/), as well as for weighted counts. +any type of vector or array. It includes support for [`CategoricalArray`](https://github.com/JuliaData/CategoricalArrays.jl) +and [`DataFrame`](https://github.com/JuliaData/DataFrames.jl), as well as for weighted counts. Tables are represented as [`NamedArray`](https://github.com/davidavdav/NamedArrays.jl/) objects. ```julia julia> using FreqTables + julia> x = repeat(["a", "b", "c", "d"], outer=[100]); + julia> y = repeat(["A", "B", "C", "D"], inner=[10], outer=[10]); + julia> freqtable(x) -4-element NamedArrays.NamedArray{Int64,1,Array{Int64,1},Tuple{Dict{ASCIIString,Int64}}} -a 100 -b 100 -c 100 -d 100 +4-element Named Array{Int64,1} +Dim1 │ +──────┼──── +a │ 100 +b │ 100 +c │ 100 +d │ 100 julia> freqtable(x, y) -4x4 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}} -Dim1 \ Dim2 A B C D -a 30 20 30 20 -b 30 20 30 20 -c 20 30 20 30 -d 20 30 20 30 +4×4 Named Array{Int64,2} +Dim1 ╲ Dim2 │ A B C D +────────────┼─────────────── +a │ 30 20 30 20 +b │ 30 20 30 20 +c │ 20 30 20 30 +d │ 20 30 20 30 julia> freqtable(x, y, subset=1:20) -4x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}} -Dim1 \ Dim2 A B -a 3 2 -b 3 2 -c 2 3 -d 2 3 +4×2 Named Array{Int64,2} +Dim1 ╲ Dim2 │ A B +────────────┼───── +a │ 3 2 +b │ 3 2 +c │ 2 3 +d │ 2 3 julia> freqtable(x, y, subset=1:20, weights=repeat([1, .5], outer=[10])) -4x2 NamedArrays.NamedArray{Float64,2,Array{Float64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}} -Dim1 \ Dim2 A B -a 3.0 2.0 -b 1.5 1.0 -c 2.0 3.0 -d 1.0 1.5 +4×2 Named Array{Float64,2} +Dim1 ╲ Dim2 │ A B +────────────┼───────── +a │ 3.0 2.0 +b │ 1.5 1.0 +c │ 2.0 3.0 +d │ 1.0 1.5 ``` -For convenience, when working with a data frame, one can also pass the `DataFrame` object and columns as symbols: +For convenience, when working with a data frame, one can also pass a `DataFrame` object and columns as symbols: ```julia -julia> using RDatasets +julia> using DataFrames, CSV -julia> iris = dataset("datasets", "iris"); +julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv")); julia> iris[:LongSepal] = iris[:SepalLength] .> 5.0; julia> freqtable(iris, :Species, :LongSepal) -3x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{Bool,Int64}}} -Species \ LongSepal false true -setosa 28 22 -versicolor 3 47 -virginica 1 49 +3×2 Named Array{Int64,2} +Species ╲ LongSepal │ false true +────────────────────┼───────────── +setosa │ 28 22 +versicolor │ 3 47 +virginica │ 1 49 julia> freqtable(iris, :Species, :LongSepal, subset=iris[:PetalLength] .< 4.0) -2x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{Bool,Int64}}} -Species \ LongSepal false true -setosa 28 22 -versicolor 3 8 +2×2 Named Array{Int64,2} +Species ╲ LongSepal │ false true +────────────────────┼───────────── +setosa │ 28 22 +versicolor │ 3 8 ``` diff --git a/REQUIRE b/REQUIRE index 88ab021..e53ce83 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,4 @@ -julia 0.5 +julia 0.6 NamedArrays -DataArrays -DataFrames +CategoricalArrays 0.3.0 +DataFrames 0.11.0 \ No newline at end of file diff --git a/src/FreqTables.jl b/src/FreqTables.jl index 3be4059..00a07d1 100644 --- a/src/FreqTables.jl +++ b/src/FreqTables.jl @@ -1,5 +1,5 @@ module FreqTables - using DataArrays + using CategoricalArrays using DataFrames using NamedArrays diff --git a/src/freqtable.jl b/src/freqtable.jl index 891818e..7cac518 100644 --- a/src/freqtable.jl +++ b/src/freqtable.jl @@ -6,19 +6,14 @@ end Base.getindex(w::UnitWeights, ::Integer...) = 1 Base.getindex(w::UnitWeights, ::AbstractVector) = w -# @pure only exists in Julia 0.5 -if isdefined(Base, Symbol("@pure")) - import Base.@pure -else - macro pure(x) esc(x) end -end - # About the type inference limitation which prompts this workaround, see # https://github.com/JuliaLang/julia/issues/10880 -@pure eltypes(T) = Tuple{map(eltype, T.parameters)...} +Base.@pure eltypes(T) = Tuple{map(eltype, T.parameters)...} +Base.@pure vectypes(T) = Tuple{map(U -> Vector{U}, T.parameters)...} # Internal function needed for now so that n is inferred function _freqtable{T<:Real}(x::Tuple, + skipmissing::Bool = false, weights::AbstractVector{T} = UnitWeights(), subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) if !isa(subset, Void) @@ -52,24 +47,31 @@ function _freqtable{T<:Real}(x::Tuple, end end - k = collect(keys(d)) + if skipmissing + filter!((k, v) -> !any(ismissing, k), d) + end - dimnames = Vector{Any}(n) + keyvec = collect(keys(d)) + + dimnames = Vector{Vector}(n) for i in 1:n s = Set{vtypes.parameters[i]}() - for j in 1:length(k) - push!(s, k[j][i]) + for j in 1:length(keyvec) + push!(s, keyvec[j][i]) end - dimnames[i] = unique(s) - elty = eltype(dimnames[i]) - if method_exists(isless, (elty, elty)) + # convert() is needed for Union{T, Missing}, which currently gives a Vector{Any} + # which breaks inference of the return type + dimnames[i] = convert(Vector{vtypes.parameters[i]}, unique(s)) + try sort!(dimnames[i]) + catch err + err isa MethodError || rethrow(err) end end - a = zeros(eltype(weights), map(length, dimnames)...) - na = NamedArray(a, tuple(dimnames...), ntuple(i -> "Dim$i", n)) + a = zeros(eltype(weights), map(length, dimnames)...)::Array{eltype(weights), n} + na = NamedArray(a, tuple(dimnames...)::vectypes(vtypes), ntuple(i -> "Dim$i", n)) for (k, v) in d na[k...] = v @@ -79,14 +81,19 @@ function _freqtable{T<:Real}(x::Tuple, end freqtable{T<:Real}(x::AbstractVector...; + skipmissing::Bool = false, weights::AbstractVector{T} = UnitWeights(), subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) = - _freqtable(x, weights, subset) + _freqtable(x, skipmissing, weights, subset) # Internal function needed for now so that n is inferred -function _freqtable{n}(x::NTuple{n, PooledDataVector}, usena = false) - len = map(length, x) - lev = map(levels, x) +function _freqtable{n}(x::NTuple{n, AbstractCategoricalVector}, skipmissing::Bool = false) + len = map(length, x) + miss = map(v -> eltype(v) >: Missing, x) + lev = map(v -> eltype(v) >: Missing && !skipmissing ? [levels(v); missing] : levels(v), x) + dims = map(length, lev) + # First entry is for missing values (only correct and used if present) + ord = map((v, d) -> Int[d; CategoricalArrays.order(v.pool)], x, dims) for i in 1:n if len[1] != len[i] @@ -94,62 +101,30 @@ function _freqtable{n}(x::NTuple{n, PooledDataVector}, usena = false) end end - if usena - dims = map(l -> length(l) + 1, lev) - sizes = cumprod([dims...]) - a = zeros(Int, dims) - - for i in 1:len[1] - el = Int(x[1].refs[i]) + sizes = cumprod([dims...]) + a = zeros(Int, dims) + missingpossible = any(miss) - if el == 0 - el = dims[1] - end - - for j in 2:n - val = Int(x[j].refs[i]) - - if val == zero(val) - val = dims[j] - end - - el += Int((val - 1) * sizes[j - 1]) - end - - a[el] += 1 - end + @inbounds for i in 1:len[1] + ref = x[1].refs[i] + el = ord[1][ref + 1] + anymiss = missingpossible & (ref <= 0) - NamedArray(a, map(l -> [l; "NA"], lev), ntuple(i -> "Dim$i", n)) - else - dims = map(length, lev) - sizes = cumprod([dims...]) - a = zeros(Int, dims) - - for i in 1:len[1] - pos = (x[1].refs[i] != zero(UInt)) - el = Int(x[1].refs[i]) - - for j in 2:n - val = x[j].refs[i] - - if val == zero(val) - pos = false - break - end - - el += Int((val - 1) * sizes[j - 1]) - end + for j in 2:n + ref = x[j].refs[i] + anymiss |= missingpossible & (ref <= 0) + el += (ord[j][ref + 1] - 1) * sizes[j - 1] + end - if pos - @inbounds a[el] += 1 - end - end + if !(missingpossible && skipmissing && anymiss) + a[el] += 1 + end + end - NamedArray(a, lev, ntuple(i -> "Dim$i", n)) - end + NamedArray(a, lev, ntuple(i -> "Dim$i", n)) end -freqtable(x::PooledDataVector...; usena::Bool = false) = _freqtable(x, usena) +freqtable(x::AbstractCategoricalVector...; skipmissing::Bool = false) = _freqtable(x, skipmissing) function freqtable(d::DataFrame, x::Symbol...; args...) a = freqtable([d[y] for y in x]...; args...) diff --git a/test/REQUIRE b/test/REQUIRE index 755be54..a1aaed1 100644 --- a/test/REQUIRE +++ b/test/REQUIRE @@ -1 +1 @@ -RDatasets +CSV 0.2.0 diff --git a/test/freqtable.jl b/test/freqtable.jl index 30543d2..816cb4f 100644 --- a/test/freqtable.jl +++ b/test/freqtable.jl @@ -2,63 +2,108 @@ using FreqTables using Base.Test x = repeat(["a", "b", "c", "d"], outer=[100]); -y = repeat(["A", "B", "C", "D"], inner=[10], outer=[10]); +# Values not in order to test discrepancy between index and levels with CategoricalArray +y = repeat(["D", "C", "A", "B"], inner=[10], outer=[10]); -@test freqtable(x).array == [100, 100, 100, 100] -@test freqtable(y).array == [100, 100, 100, 100] -@test freqtable(x, y).array == [30 20 30 20; - 30 20 30 20; - 20 30 20 30; - 20 30 20 30] +tab = @inferred freqtable(x) +@test tab == [100, 100, 100, 100] +@test names(tab) == [["a", "b", "c", "d"]] +tab = @inferred freqtable(y) +@test tab == [100, 100, 100, 100] +@test names(tab) == [["A", "B", "C", "D"]] +tab = @inferred freqtable(x, y) +@test tab == [30 20 20 30; + 30 20 20 30; + 20 30 30 20; + 20 30 30 20] +@test names(tab) == [["a", "b", "c", "d"], ["A", "B", "C", "D"]] -@test freqtable(x, y, - subset=1:20, - weights=repeat([1, .5], outer=[10])).array == [3.0 2.0 - 1.5 1.0 - 2.0 3.0 - 1.0 1.5] +tab =freqtable(x, y, + subset=1:20, + weights=repeat([1, .5], outer=[10])) +@test tab == [2.0 3.0 + 1.0 1.5 + 3.0 2.0 + 1.5 1.0] +@test names(tab) == [["a", "b", "c", "d"], ["C", "D"]] -using DataArrays -xpda = PooledDataArray(x) -ypda = PooledDataArray(y) +using CategoricalArrays +cx = CategoricalArray(x) +cy = CategoricalArray(y) -@test freqtable(xpda).array == [100, 100, 100, 100] -@test freqtable(ypda).array == [100, 100, 100, 100] -@test freqtable(xpda, ypda).array == [30 20 30 20; - 30 20 30 20; - 20 30 20 30; - 20 30 20 30] +tab = @inferred freqtable(cx) +@test tab == [100, 100, 100, 100] +@test names(tab) == [["a", "b", "c", "d"]] +tab = @inferred freqtable(cy) +@test tab == [100, 100, 100, 100] +@test names(tab) == [["A", "B", "C", "D"]] +tab = @inferred freqtable(cx, cy) +@test tab == [30 20 20 30; + 30 20 20 30; + 20 30 30 20; + 20 30 30 20] +@test names(tab) == [["a", "b", "c", "d"], ["A", "B", "C", "D"]] -xpda[1] = NA -ypda[[1, 10, 20, 400]] = NA -@test freqtable(xpda).array == [99, 100, 100, 100] -@test freqtable(ypda).array == [98, 99, 100, 99] -@test freqtable(xpda, ypda).array == [29 20 30 20; - 29 20 30 20; - 20 30 20 30; - 20 29 20 29] +using Missings +const ≅ = isequal +mx = Array{Union{String, Missing}}(x) +my = Array{Union{String, Missing}}(y) +mx[1] = missing +my[[1, 10, 20, 400]] = missing -@test freqtable(xpda, usena=true).array == [99, 100, 100, 100, 1] -@test freqtable(ypda, usena=true).array == [98, 99, 100, 99, 4] -@test freqtable(xpda, ypda, usena=true).array == [29 20 30 20 0; - 29 20 30 20 1; - 20 30 20 30 0; - 20 29 20 29 2; - 0 0 0 0 1] +mcx = categorical(mx) +mcy = categorical(my) +tab = freqtable(mx) +tabc = freqtable(mcx) +@test tab == tabc == [99, 100, 100, 100, 1] +@test names(tab) ≅ names(tabc) ≅ [["a", "b", "c", "d", missing]] +tab = freqtable(my) +tabc = freqtable(mcy) +@test tab == tabc == [100, 99, 99, 98, 4] +@test names(tab) ≅ names(tabc) ≅ [["A", "B", "C", "D", missing]] +tab = freqtable(mx, my) +tabc = freqtable(mcx, mcy) +@test tab == tabc == [30 20 20 29 0; + 30 20 20 29 1; + 20 30 30 20 0; + 20 29 29 20 2; + 0 0 0 0 1] +@test names(tab) ≅ names(tabc) ≅ [["a", "b", "c", "d", missing], + ["A", "B", "C", "D", missing]] -using RDatasets -iris = dataset("datasets", "iris") + +tab = freqtable(mx, skipmissing=true) +tabc = freqtable(mcx, skipmissing=true) +@test tab == tabc == [99, 100, 100, 100] +@test names(tab) ≅ names(tabc) ≅ [["a", "b", "c", "d"]] +tab = freqtable(my, skipmissing=true) +tabc = freqtable(mcy, skipmissing=true) +@test names(tab) ≅ names(tabc) ≅ [["A", "B", "C", "D"]] +@test tab == tabc == [100, 99, 99, 98] +tab = freqtable(mx, my, skipmissing=true) +tabc = freqtable(mcx, mcy, skipmissing=true) +@test tab == tabc == [30 20 20 29; + 30 20 20 29; + 20 30 30 20; + 20 29 29 20] + + +using DataFrames, CSV +iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"), categorical=false); iris[:LongSepal] = iris[:SepalLength] .> 5.0 -@test freqtable(iris, :Species, :LongSepal).array == [28 22 - 3 47 - 1 49] -@test freqtable(iris, :Species, :LongSepal, - subset=iris[:PetalLength] .< 4.0).array ==[28 22 - 3 8] +tab = freqtable(iris, :Species, :LongSepal) +@test tab == [28 22 + 3 47 + 1 49] +@test names(tab) == [["setosa", "versicolor", "virginica"], [false, true]] +tab = freqtable(iris, :Species, :LongSepal, subset=iris[:PetalLength] .< 4.0) +@test tab == [28 22 + 3 8] +@test names(tab) == [["setosa", "versicolor"], [false, true]] # Issue #5 -@test freqtable([Set(1), Set(2)]).array == [1, 1] -@test freqtable([Set(1), Set(2)], [Set(1), Set(2)]).array == eye(2) +@test freqtable([Set(1), Set(2)]) == [1, 1] +@test freqtable([Set(1), Set(2)], [Set(1), Set(2)]) == eye(2)