nalimilan · nalimilan · Nov 29, 2017 · Nov 28, 2017 · Nov 29, 2017 · Nov 29, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -3,7 +3,7 @@ os:
   - osx
   - linux
 julia:
-  - 0.5
+  - 0.6
   - nightly
 notifications:
   email: false

diff --git a/README.md b/README.md
@@ -2,69 +2,78 @@
 
 [![Build Status](https://travis-ci.org/nalimilan/FreqTables.jl.svg?branch=master)](https://travis-ci.org/nalimilan/FreqTables.jl)
 [![Coverage Status](https://coveralls.io/repos/nalimilan/FreqTables.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/nalimilan/FreqTables.jl?branch=master)
-[![FreqTables](http://pkg.julialang.org/badges/FreqTables_0.5.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=0.5)
 [![FreqTables](http://pkg.julialang.org/badges/FreqTables_0.6.svg)](http://pkg.julialang.org/?pkg=FreqTables&ver=0.6)
 
 This package allows computing one- or multi-way frequency tables (a.k.a. contingency or pivot tables) from
-any type of vector or array. It includes support for [`PooledDataArray`s](https://github.com/JuliaStats/DataArrays.jl)
-and [`DataFrame`s](https://github.com/JuliaStats/DataFrames.jl/), as well as for weighted counts.
+any type of vector or array. It includes support for [`CategoricalArray`](https://github.com/JuliaData/CategoricalArrays.jl)
+and [`DataFrame`](https://github.com/JuliaData/DataFrames.jl), as well as for weighted counts.
 
 Tables are represented as [`NamedArray`](https://github.com/davidavdav/NamedArrays.jl/) objects.
 
 ```julia
 julia> using FreqTables
+
 julia> x = repeat(["a", "b", "c", "d"], outer=[100]);
+
 julia> y = repeat(["A", "B", "C", "D"], inner=[10], outer=[10]);
+
 julia> freqtable(x)
-4-element NamedArrays.NamedArray{Int64,1,Array{Int64,1},Tuple{Dict{ASCIIString,Int64}}}
-a 100
-b 100
-c 100
-d 100
+4-element Named Array{Int64,1}
+Dim1  │
+──────┼────
+a     │ 100
+b     │ 100
+c     │ 100
+d     │ 100
 
 julia> freqtable(x, y)
-4x4 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}}
-Dim1 \ Dim2 A  B  C  D 
-a           30 20 30 20
-b           30 20 30 20
-c           20 30 20 30
-d           20 30 20 30
+4×4 Named Array{Int64,2}
+Dim1 ╲ Dim2 │  A   B   C   D
+────────────┼───────────────
+a           │ 30  20  30  20
+b           │ 30  20  30  20
+c           │ 20  30  20  30
+d           │ 20  30  20  30
 
 julia> freqtable(x, y, subset=1:20)
-4x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}}
-Dim1 \ Dim2 A B
-a           3 2
-b           3 2
-c           2 3
-d           2 3
+4×2 Named Array{Int64,2}
+Dim1 ╲ Dim2 │ A  B
+────────────┼─────
+a           │ 3  2
+b           │ 3  2
+c           │ 2  3
+d           │ 2  3
 
 julia> freqtable(x, y, subset=1:20, weights=repeat([1, .5], outer=[10]))
-4x2 NamedArrays.NamedArray{Float64,2,Array{Float64,2},Tuple{Dict{ASCIIString,Int64},Dict{ASCIIString,Int64}}}
-Dim1 \ Dim2 A   B  
-a           3.0 2.0
-b           1.5 1.0
-c           2.0 3.0
-d           1.0 1.5
+4×2 Named Array{Float64,2}
+Dim1 ╲ Dim2 │   A    B
+────────────┼─────────
+a           │ 3.0  2.0
+b           │ 1.5  1.0
+c           │ 2.0  3.0
+d           │ 1.0  1.5
 ```
 
-For convenience, when working with a data frame, one can also pass the `DataFrame` object and columns as symbols:
+For convenience, when working with a data frame, one can also pass a `DataFrame` object and columns as symbols:
 ```julia
-julia> using RDatasets
+julia> using DataFrames, CSV
 
-julia> iris = dataset("datasets", "iris");
+julia> iris = CSV.read(joinpath(Pkg.dir("DataFrames"), "test/data/iris.csv"));
 
 julia> iris[:LongSepal] = iris[:SepalLength] .> 5.0;
 
 julia> freqtable(iris, :Species, :LongSepal)
-3x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{Bool,Int64}}}
-Species \ LongSepal false true 
-setosa              28    22   
-versicolor          3     47   
-virginica           1     49   
+3×2 Named Array{Int64,2}
+Species ╲ LongSepal │ false   true
+────────────────────┼─────────────
+setosa              │    28     22
+versicolor          │     3     47
+virginica           │     1     49
 
 julia> freqtable(iris, :Species, :LongSepal, subset=iris[:PetalLength] .< 4.0)
-2x2 NamedArrays.NamedArray{Int64,2,Array{Int64,2},Tuple{Dict{ASCIIString,Int64},Dict{Bool,Int64}}}
-Species \ LongSepal false true 
-setosa              28    22   
-versicolor          3     8    
+2×2 Named Array{Int64,2}
+Species ╲ LongSepal │ false   true
+────────────────────┼─────────────
+setosa              │    28     22
+versicolor          │     3      8
 ```
diff --git a/REQUIRE b/REQUIRE
@@ -1,4 +1,4 @@
-julia 0.5
+julia 0.6
 NamedArrays
-DataArrays
-DataFrames
+CategoricalArrays 0.3.0
+DataFrames 0.11.0
diff --git a/src/FreqTables.jl b/src/FreqTables.jl
@@ -1,5 +1,5 @@
 module FreqTables
-    using DataArrays
+    using CategoricalArrays
     using DataFrames
     using NamedArrays
 

diff --git a/src/freqtable.jl b/src/freqtable.jl
@@ -6,19 +6,14 @@ end
 Base.getindex(w::UnitWeights, ::Integer...) = 1
 Base.getindex(w::UnitWeights, ::AbstractVector) = w
 
-# @pure only exists in Julia 0.5
-if isdefined(Base, Symbol("@pure"))
-    import Base.@pure
-else
-    macro pure(x) esc(x) end
-end
-
 # About the type inference limitation which prompts this workaround, see
 # https://github.com/JuliaLang/julia/issues/10880
-@pure eltypes(T) = Tuple{map(eltype, T.parameters)...}
+Base.@pure eltypes(T) = Tuple{map(eltype, T.parameters)...}
+Base.@pure vectypes(T) = Tuple{map(U -> Vector{U}, T.parameters)...}
 
 # Internal function needed for now so that n is inferred
 function _freqtable{T<:Real}(x::Tuple,
+                             skipmissing::Bool = false,
                              weights::AbstractVector{T} = UnitWeights(),
                              subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing)
     if !isa(subset, Void)
@@ -52,24 +47,31 @@ function _freqtable{T<:Real}(x::Tuple,
         end
     end
 
-    k = collect(keys(d))
+    if skipmissing
+        filter!((k, v) -> !any(ismissing, k), d)
+    end
 
-    dimnames = Vector{Any}(n)
+    keyvec = collect(keys(d))
+
+    dimnames = Vector{Vector}(n)
     for i in 1:n
         s = Set{vtypes.parameters[i]}()
-        for j in 1:length(k)
-            push!(s, k[j][i])
+        for j in 1:length(keyvec)
+            push!(s, keyvec[j][i])
         end
 
-        dimnames[i] = unique(s)
-        elty = eltype(dimnames[i])
-        if method_exists(isless, (elty, elty))
+        # convert() is needed for Union{T, Missing} element types, for which unique()
+        # currently gives a Vector{Any}, which breaks inference of the return type
+        dimnames[i] = convert(Vector{vtypes.parameters[i]}, unique(s))
+        try
             sort!(dimnames[i])
+        catch err
+            err isa MethodError || rethrow(err)
         end
     end
 
-    a = zeros(eltype(weights), map(length, dimnames)...)
-    na = NamedArray(a, tuple(dimnames...), ntuple(i -> "Dim$i", n))
+    a = zeros(eltype(weights), map(length, dimnames)...)::Array{eltype(weights), n}
+    na = NamedArray(a, tuple(dimnames...)::vectypes(vtypes), ntuple(i -> "Dim$i", n))
 
     for (k, v) in d
         na[k...] = v
@@ -79,77 +81,50 @@ function _freqtable{T<:Real}(x::Tuple,
 end
 
 freqtable{T<:Real}(x::AbstractVector...;
+                   skipmissing::Bool = false,
                    weights::AbstractVector{T} = UnitWeights(),
                    subset::Union{Void, AbstractVector{Int}, AbstractVector{Bool}} = nothing) =
-    _freqtable(x, weights, subset)
+    _freqtable(x, skipmissing, weights, subset)
 
 # Internal function needed for now so that n is inferred
-function _freqtable{n}(x::NTuple{n, PooledDataVector}, usena = false)
-	len = map(length, x)
-	lev = map(levels, x)
+function _freqtable{n}(x::NTuple{n, AbstractCategoricalVector}, skipmissing::Bool = false)
+    len = map(length, x)
+    miss = map(v -> eltype(v) >: Missing, x)
+    lev = map(v -> eltype(v) >: Missing && !skipmissing ? [levels(v); missing] : levels(v), x)
+    dims = map(length, lev)
+    # First entry is for missing values (only correct and used if present)
+    ord = map((v, d) -> Int[d; CategoricalArrays.order(v.pool)], x, dims)
 
 	for i in 1:n
 	    if len[1] != len[i]
 	        error(string("arguments are not of the same length: ", tuple(len...)))
 	    end
 	end
 
-	if usena
-        dims = map(l -> length(l) + 1, lev)
-	    sizes = cumprod([dims...])
-	    a = zeros(Int, dims)
-
-	    for i in 1:len[1]
-	        el = Int(x[1].refs[i])
+    sizes = cumprod([dims...])
+    a = zeros(Int, dims)
+    missingpossible = any(miss)
 
-            if el == 0
-	            el = dims[1]
-	        end
-
-	        for j in 2:n
-	            val = Int(x[j].refs[i])
-
-	            if val == zero(val)
-	                val = dims[j]
-	            end
-
-	            el += Int((val - 1) * sizes[j - 1])
-	        end
-
-	        a[el] += 1
-	    end
+    @inbounds for i in 1:len[1]
+        ref = x[1].refs[i]        
+        el = ord[1][ref + 1]
+        anymiss = missingpossible & (ref <= 0)
 
-	    NamedArray(a, map(l -> [l; "NA"], lev), ntuple(i -> "Dim$i", n))
-	else
-        dims = map(length, lev)
-	    sizes = cumprod([dims...])
-	    a = zeros(Int, dims)
-
-	    for i in 1:len[1]
-	        pos = (x[1].refs[i] != zero(UInt))
-	        el = Int(x[1].refs[i])
-
-	        for j in 2:n
-	            val = x[j].refs[i]
-
-	            if val == zero(val)
-	                pos = false
-	                break
-	            end
-
-	            el += Int((val - 1) * sizes[j - 1])
-	        end
+        for j in 2:n
+            ref = x[j].refs[i]
+            anymiss |= missingpossible & (ref <= 0)
+            el += (ord[j][ref + 1] - 1) * sizes[j - 1]
+        end
 
-	        if pos
-	            @inbounds a[el] += 1
-	        end
-	    end
+        if !(missingpossible && skipmissing && anymiss)
+            a[el] += 1
+        end
+    end
 
-	    NamedArray(a, lev, ntuple(i -> "Dim$i", n))
-	end
+    NamedArray(a, lev, ntuple(i -> "Dim$i", n))
 end
 
-freqtable(x::PooledDataVector...; usena::Bool = false) = _freqtable(x, usena)
+freqtable(x::AbstractCategoricalVector...; skipmissing::Bool = false) = _freqtable(x, skipmissing)
 
 function freqtable(d::DataFrame, x::Symbol...; args...)
     a = freqtable([d[y] for y in x]...; args...)

diff --git a/test/REQUIRE b/test/REQUIRE
@@ -1 +1 @@
-RDatasets
+CSV 0.2.0
diff --git a/.travis.yml b/.travis.yml
@@ -3,7 +3,7 @@ os:
   - osx
   - linux
 julia:
-  - 0.5
+  - 0.6
   - nightly
 notifications:
   email: false