From 8db28216332d48075e5571671cb70cdf69a81a17 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 10 Mar 2017 10:51:19 -0800 Subject: [PATCH 01/43] add changes --- src/DataTables.jl | 4 + src/abstractdatatable/abstractdatatable.jl | 248 +++++++++++++++------ src/abstractdatatable/io.jl | 17 +- src/abstractdatatable/join.jl | 22 +- src/abstractdatatable/reshape.jl | 18 +- src/datatable/datatable.jl | 140 ++++-------- src/groupeddatatable/grouping.jl | 2 +- test/cat.jl | 45 ++-- test/constructors.jl | 67 +++++- test/conversions.jl | 22 +- test/data.jl | 44 ++-- test/datatable.jl | 133 +++++------ test/grouping.jl | 63 ++++-- test/index.jl | 2 +- test/iteration.jl | 14 +- test/join.jl | 16 +- 16 files changed, 462 insertions(+), 395 deletions(-) diff --git a/src/DataTables.jl b/src/DataTables.jl index e69a70b..799f7f6 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -47,6 +47,8 @@ export @~, combine, completecases, deleterows!, + denullify!, + denullify, describe, dropnull, dropnull!, @@ -61,6 +63,8 @@ export @~, nonunique, nrow, nullable!, + nullify!, + nullify, order, printtable, rename!, diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index a885136..ef98fd5 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -31,6 +31,10 @@ The following are normally implemented for AbstractDataTables: * [`nonunique`](@ref) : indexes of duplicate rows * [`unique!`](@ref) : remove duplicate rows * `similar` : a DataTable with similar columns as `d` +* `denullify` : unwrap `Nullable` columns +* `denullify!` : unwrap `Nullable` columns in-place +* `nullify` : convert all columns to NullableArrays +* `nullify!` : convert all columns to NullableArrays in-place **Indexing** @@ -711,78 +715,23 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable Base.vcat(dt::AbstractDataTable) = dt -Base.vcat(dts::AbstractDataTable...) 
= vcat(AbstractDataTable[dts...]) - -function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) +function Base.vcat(dts::AbstractDataTable...) isempty(dts) && return DataTable() - coltyps, colnams, similars = _colinfo(dts) - - res = DataTable() - Nrow = sum(nrow, dts) - for j in 1:length(colnams) - colnam = colnams[j] - col = similar(similars[j], coltyps[j], Nrow) - - i = 1 - for dt in dts - if haskey(dt, colnam) - copy!(col, i, dt[colnam]) - end - i += size(dt, 1) - end - - res[colnam] = col - end - res -end - -_isnullable{T}(::AbstractArray{T}) = T <: Nullable -const EMPTY_DATA = NullableArray(Void, 0) - -function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) - dt1 = dts[1] - colindex = copy(index(dt1)) - coltyps = eltypes(dt1) - similars = collect(columns(dt1)) - nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] - - for i in 2:length(dts) - dt = dts[i] - for j in 1:size(dt, 2) - col = dt[j] - cn, ct = _names(dt)[j], eltype(col) - if haskey(colindex, cn) - idx = colindex[cn] - - oldtyp = coltyps[idx] - if !(ct <: oldtyp) - coltyps[idx] = promote_type(oldtyp, ct) - # Needed on Julia 0.4 since e.g. 
- # promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, - # which is not a usable type: fall back to Nullable{Any} - if VERSION < v"0.5.0-dev" && - coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) - coltyps[idx] = Nullable{Any} - end - end - nonnull_ct[idx] += !_isnullable(col) - else # new column - push!(colindex, cn) - push!(coltyps, ct) - push!(similars, col) - push!(nonnull_ct, !_isnullable(col)) - end - end - end - - for j in 1:length(colindex) - if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) - similars[j] = EMPTY_DATA - end + allheaders = map(names, dts) + # don't vcat empty DataTables + notempty = find(x -> length(x) > 0, allheaders) + uniqueheaders = unique(allheaders[notempty]) + if length(uniqueheaders) == 0 + return DataTable() + elseif length(unique(map(length, uniqueheaders))) > 1 + throw(ArgumentError("not all DataTables have the same number of columns. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) + elseif length(uniqueheaders) > 1 + throw(ArgumentError("Column names do not match. Use `rename!` or `names!` to adjust columns names. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) + else + header = uniqueheaders[1] + dts_to_vcat = dts[notempty] + return DataTable(Any[vcat(map(dt -> dt[col], dts_to_vcat)...) for col in header], header) end - colnams = _names(colindex) - - coltyps, colnams, similars end ############################################################################## @@ -801,6 +750,165 @@ function Base.hash(dt::AbstractDataTable) return @compat UInt(h) end +""" + denullify!(dt::AbstractDataTable) + +Convert columns with a `Nullable` element type without any null values +to a non-`Nullable` equivalent array type. The table `dt` is modified in place. 
+ +# Examples + +```jldoctest +julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(denullify!(dt)) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 +``` + +See also [`denullify`](@ref) & [`nullify!`](@ref). +""" +function denullify!(dt::AbstractDataTable) + for i in 1:size(dt,2) + if !anynull(dt[i]) + dt[i] = dropnull(dt[i]) + end + end + dt +end + +""" + denullify(dt::AbstractDataTable) + +Return a copy of `dt` where columns with a `Nullable` element type without any +null values have been converted to a non-`Nullable` equivalent array type. + +# Examples + +```jldoctest +julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(denullify(dt)) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} +``` + +See also [`denullify!`](@ref) & [`nullify`](@ref). +""" +denullify(dt::AbstractDataTable) = denullify!(copy(dt)) + +""" + nullify!(dt::AbstractDataTable) + +Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place. 
+ +# Examples + +```jldoctest +julia> dt = DataTable(A = 1:3, B = 1:3) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(nullify!(dt)) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} +``` + +See also [`nullify`](@ref) & [`denullify!`](@ref). +""" +function nullify!(dt::AbstractDataTable) + for i in 1:size(dt,2) + dt[i] = NullableArray(dt[i]) + end + dt +end + +""" + nullify(dt::AbstractDataTable) + +Return a copy of `dt` with all columns converted to nullable arrays. + +# Examples + +```jldoctest +julia> dt = DataTable(A = 1:3, B = 1:3) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(nullify(dt)) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 +``` + +See also [`nullify!`](@ref) & [`denullify`](@ref). +""" +function nullify(dt::AbstractDataTable) + nullify!(copy(dt)) +end ## Documentation for methods defined elsewhere diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 8ec11a4..7d14196 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -42,19 +42,20 @@ function printtable(io::IO, quotestr = string(quotemark) for i in 1:n for j in 1:p - if !isnull(dt[j],i) + if !isnull(dt[j][i]) if ! (etypes[j] <: Real) - print(io, quotemark) - escapedprint(io, get(dt[i, j]), quotestr) - print(io, quotemark) + print(io, quotemark) + x = isa(dt[i, j], Nullable) ? 
get(dt[i, j]) : dt[i, j] + escapedprint(io, x, quotestr) + print(io, quotemark) else - print(io, dt[i, j]) + print(io, dt[i, j]) end else - print(io, nastring) + print(io, nastring) end if j < p - print(io, separator) + print(io, separator) else print(io, '\n') end @@ -167,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = get(cell) + content = isa(cell, Nullable) ? get(cell) : cell if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 1ad170b..94e9f1d 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -2,19 +2,6 @@ ## Join / merge ## -# Like similar, but returns a nullable array -similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = - NullableArray(T, dims) - -similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = - NullableArray(eltype(T), dims) - -similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = - NullableCategoricalArray(T, dims) - -similar_nullable(dt::AbstractDataTable, dims::Int) = - DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) - # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} dtl::DT1 @@ -76,9 +63,12 @@ function compose_joined_table(joiner::DataTableJoiner, right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) - right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] - for col in columns(dtr_noon)], - names(dtr_noon)) + resizelen = length(all_orig_right_ixs)+length(leftonly_ixs) + rightcols = Any[length(col[all_orig_right_ixs]) >= resizelen ? 
+ resize!(col[all_orig_right_ixs], resizelen)[right_perm] : + NullableArray(vcat(col[all_orig_right_ixs], fill(Nullable(), resizelen - length(col[all_orig_right_ixs]))))[right_perm] + for col in columns(dtr_noon)] + right_dt = DataTable(rightcols, names(dtr_noon)) # merge left and right parts of the joined table res = hcat!(left_dt, right_dt) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index ed4d519..60fb485 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -202,21 +202,16 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) if T <: Nullable T = eltype(T) end - payload = DataTable(Any[NullableArray(T, Nrow) for i in 1:Ncol], + payload = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) - nowarning = true for k in 1:nrow(dt) j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) if i > 0 && j > 0 - if nowarning && !isnull(payload[j][i]) - warn("Duplicate entries in unstack.") - nowarning = false - end payload[j][i] = valuecol[k] end end - insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey]) + denullify!(insert!(payload, 1, levels(refkeycol), _names(dt)[rowkey])) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) @@ -242,21 +237,16 @@ function unstack(dt::AbstractDataTable, colkey::Int, value::Int) if T <: Nullable T = eltype(T) end - dt2 = DataTable(Any[NullableArray(T, Nrow) for i in 1:Ncol], + dt2 = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], map(@compat(Symbol), levels(keycol))) - nowarning = true for k in 1:nrow(dt) j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) i = rowkey[k] if i > 0 && j > 0 - if nowarning && !isnull(dt2[j][i]) - warn("Duplicate entries in unstack at row $k.") - nowarning = false - end dt2[j][i] = valuecol[k] 
end end - hcat(dt1, dt2) + denullify!(hcat(dt1, dt2)) end unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 5eb0e7b..c39feb2 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -74,32 +74,41 @@ type DataTable <: AbstractDataTable colindex::Index function DataTable(columns::Vector{Any}, colindex::Index) - ncols = length(columns) - if ncols > 1 - nrows = length(columns[1]) - equallengths = true - for i in 2:ncols - equallengths &= length(columns[i]) == nrows - end - if !equallengths - msg = "All columns in a DataTable must be the same length" - throw(ArgumentError(msg)) - end + if length(columns) == length(colindex) == 0 + return new(Vector{Any}(0), Index()) + elseif length(columns) != length(colindex) + throw(DimensionMismatch("Number of columns and column names are different")) end - if length(colindex) != ncols - msg = "Columns and column index must be the same length" - throw(ArgumentError(msg)) + lengths = length.(columns) + minlen, maxlen = extrema(lengths) + if minlen == 0 && maxlen == 0 + return new(columns, colindex) + elseif (minlen == 0 && maxlen > 0) || any(x -> x != 0, mod(maxlen, lengths)) + throw(DimensionMismatch("Incompatible lengths of arguments")) + else + for i in 1:length(columns) + if isa(columns[i], Range) + columns[i] = collect(columns[i]) + end + repeats = div(maxlen, length(columns[i])) + if repeats == 1 && !(typeof(columns[i]) <: AbstractVector) + columns[i] = [columns[i]] + elseif repeats !== 1 + columns[i] = isa(columns[i], Array) ? repeat(columns[i], outer=repeats) : fill(columns[i], repeats) + end + end end - new(columns, colindex) + return new(columns, colindex) end end function DataTable(; kwargs...) 
- result = DataTable(Any[], Index()) - for (k, v) in kwargs - result[k] = v + if length(kwargs) == 0 + return DataTable(Any[], Index()) end - return result + columns = Any[v for (k,v) in kwargs] + colindex = DataTables.Index([k for (k,v) in kwargs]) + DataTable(columns, colindex) end function DataTable(columns::AbstractVector, @@ -112,7 +121,7 @@ end function DataTable(t::Type, nrows::Integer, ncols::Integer) columns = Vector{Any}(ncols) for i in 1:ncols - columns[i] = NullableArray(t, nrows) + columns[i] = Vector{t}(nrows) end cnames = gennames(ncols) return DataTable(columns, Index(cnames)) @@ -123,21 +132,21 @@ function DataTable(column_eltypes::Vector, cnames::Vector, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - columns[j] = NullableArray(column_eltypes[j], nrows) + columns[j] = Vector{column_eltypes[j]}(nrows) end return DataTable(columns, Index(cnames)) end # Initialize an empty DataTable with specific eltypes and names # and whether a nominal array should be created -function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, +function DataTable(column_eltypes::Vector, cnames::Vector, nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p if nominal[j] - columns[j] = NullableCategoricalArray{column_eltypes[j]}(nrows) + columns[j] = CategoricalVector{column_eltypes[j]}(nrows) else - columns[j] = NullableArray{column_eltypes[j]}(nrows) + columns[j] = Vector{column_eltypes[j]}(nrows) end end return DataTable(columns, Index(cnames)) @@ -149,44 +158,11 @@ function DataTable(column_eltypes::Vector, nrows::Integer) columns = Vector{Any}(p) cnames = gennames(p) for j in 1:p - columns[j] = NullableArray{column_eltypes[j]}(nrows) + columns[j] = Vector{column_eltypes[j]}(nrows) end return DataTable(columns, Index(cnames)) end -# Initialize from a Vector of Associatives (aka list of dicts) -function DataTable{D <: Associative}(ds::Vector{D}) - ks = Set() - for d in ds 
- union!(ks, keys(d)) - end - DataTable(ds, [ks...]) -end - -# Initialize from a Vector of Associatives (aka list of dicts) -function DataTable{D <: Associative}(ds::Vector{D}, ks::Vector) - #get column eltypes - col_eltypes = Type[@compat(Union{}) for _ = 1:length(ks)] - for d in ds - for (i,k) in enumerate(ks) - if haskey(d, k) && !_isnull(d[k]) - col_eltypes[i] = promote_type(col_eltypes[i], typeof(d[k])) - end - end - end - col_eltypes[col_eltypes .== @compat(Union{})] = Any - - # create empty DataTable, and fill - dt = DataTable(col_eltypes, ks, length(ds)) - for (i,d) in enumerate(ds) - for (j,k) in enumerate(ks) - dt[i,j] = get(d, k, Nullable()) - end - end - - dt -end - ############################################################################## ## ## AbstractDataTable interface @@ -363,24 +339,20 @@ function insert_multiple_entries!{T <: Real}(dt::DataTable, end end -upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v -upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v) -upgrade_vector(v::AbstractArray) = NullableArray(v) - function upgrade_scalar(dt::DataTable, v::AbstractArray) msg = "setindex!(::DataTable, ...) only broadcasts scalars, not arrays" throw(ArgumentError(msg)) end function upgrade_scalar(dt::DataTable, v::Any) n = (ncol(dt) == 0) ? 
1 : nrow(dt) - NullableArray(fill(v, n)) + fill(v, n) end # dt[SingleColumnIndex] = AbstractVector function Base.setindex!(dt::DataTable, v::AbstractVector, col_ind::ColumnIndex) - insert_single_column!(dt, upgrade_vector(v), col_ind) + insert_single_column!(dt, v, col_ind) end # dt[SingleColumnIndex] = Single Item (EXPANDS TO NROW(DT) if NCOL(DT) > 0) @@ -417,9 +389,8 @@ end function Base.setindex!{T <: ColumnIndex}(dt::DataTable, v::AbstractVector, col_inds::AbstractVector{T}) - dv = upgrade_vector(v) for col_ind in col_inds - dt[col_ind] = dv + dt[col_ind] = v end return dt end @@ -757,8 +728,8 @@ end hcat!(dt::DataTable, x::CategoricalArray) = hcat!(dt, DataTable(Any[x])) hcat!(dt::DataTable, x::NullableCategoricalArray) = hcat!(dt, DataTable(Any[x])) hcat!(dt::DataTable, x::NullableVector) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::Vector) = hcat!(dt, DataTable(Any[NullableArray(x)])) -hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[NullableArray([x])])) +hcat!(dt::DataTable, x::Vector) = hcat!(dt, DataTable(Any[(x)])) +hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[([x])])) # hcat! 
for 1-n arguments hcat!(dt::DataTable) = dt @@ -834,35 +805,12 @@ function Base.convert(::Type{DataTable}, A::Matrix) return DataTable(cols, Index(gennames(n))) end -function _datatable_from_associative(dnames, d::Associative) - p = length(dnames) - p == 0 && return DataTable() - columns = Vector{Any}(p) - colnames = Vector{Symbol}(p) - n = length(d[dnames[1]]) - for j in 1:p - name = dnames[j] - col = d[name] - if length(col) != n - throw(ArgumentError("All columns in Dict must have the same length")) - end - columns[j] = NullableArray(col) - colnames[j] = Symbol(name) - end - return DataTable(columns, Index(colnames)) -end - function Base.convert(::Type{DataTable}, d::Associative) - dnames = collect(keys(d)) - return _datatable_from_associative(dnames, d) -end - -# A Dict is not sorted or otherwise ordered, and it's nicer to return a -# DataTable which is ordered in some way -function Base.convert(::Type{DataTable}, d::Dict) - dnames = collect(keys(d)) - sort!(dnames) - return _datatable_from_associative(dnames, d) + colnames = collect(keys(d)) + isa(d, Dict) && sort!(colnames) + colindex = Index([Symbol(k) for k in colnames]) + columns = Any[d[c] for c in colnames] + DataTable(columns, colindex) end diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl index 83db685..61b66ed 100644 --- a/src/groupeddatatable/grouping.jl +++ b/src/groupeddatatable/grouping.jl @@ -193,7 +193,7 @@ combine(map(d -> mean(dropnull(d[:c])), gd)) """ function combine(ga::GroupApplied) gd, vals = ga.gd, ga.vals - valscat = vcat(vals) + valscat = vcat(vals...) 
idx = Vector{Int}(size(valscat, 1)) j = 0 @inbounds for (start, val) in zip(gd.starts, vals) diff --git a/test/cat.jl b/test/cat.jl index ab4e2ab..8586767 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -79,7 +79,7 @@ module TestCat vcat(dt, null_dt) vcat(dt, dt) vcat(dt, dt, dt) - @test vcat(DataTable[]) == DataTable() + @test vcat(DataTable()) == DataTable() alt_dt = deepcopy(dt) vcat(dt, alt_dt) @@ -88,27 +88,18 @@ module TestCat dt[1] = zeros(Int, nrow(dt)) vcat(dt, alt_dt) - # Don't fail on non-matching names - names!(alt_dt, [:A, :B, :C]) - vcat(dt, alt_dt) - dtr = vcat(dt4, dt4) @test size(dtr, 1) == 8 @test names(dt4) == names(dtr) @test isequal(dtr, [dt4; dt4]) - dtr = vcat(dt2, dt3) - @test size(dtr) == (8,2) - @test names(dt2) == names(dtr) - @test isnull(dtr[8,:x2]) - # Eltype promotion # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} if VERSION >= v"0.5.0-dev" - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64] @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] else - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}] + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Any] @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}] end @@ -118,17 +109,8 @@ module TestCat dtc = DataTable(a = NullableArray([2, 3, 4])) dtd = DataTable(Any[2:4], [:a]) dtab = vcat(dta, dtb) - dtac = vcat(dta, dtc) - @test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) - @test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) - @test isa(dtab[:a], NullableCategoricalVector{Int}) - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test isa(dtac[:a], NullableCategoricalVector{Int}) - else - @test isa(dtac[:a], 
NullableCategoricalVector{Any}) - end - # ^^ container may flip if container promotion happens in Base/DataArrays + @test isequal(dtab[:a], [1, 2, 2, 2, 3, 4]) + @test isa(dtab[:a], CategoricalVector{Int}) dc = vcat(dtd, dtc) @test isequal(vcat(dtc, dtd), dc) @@ -137,15 +119,14 @@ module TestCat @test isequal(vcat(dtd, dtc0, dtc), dc) @test eltypes(vcat(dtd, dtc0)) == eltypes(dc) - # Missing columns - rename!(dtd, :a, :b) - dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]), - a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2])) - @test isequal(vcat(dtd, dta), dtda) - - # Alignment - @test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda)) - # vcat should be able to concatenate different implementations of AbstractDataTable (PR #944) @test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5])) + + @testset "vcat errors" begin + dt1 = DataTable(A = 1:3, B = 1:3) + dt2 = DataTable(A = 1:3) + @test_throws ArgumentError vcat(dt1, dt2) + dt2 = DataTable(A = 1:3, C = 1:3) + @test_throws ArgumentError vcat(dt1, dt2) + end end diff --git a/test/constructors.jl b/test/constructors.jl index 6edf2e9..70500c6 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -18,8 +18,8 @@ module TestConstructors @test isequal(dt, DataTable(Any[NullableCategoricalVector(zeros(3)), NullableCategoricalVector(ones(3))])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) + @test !isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], + x2 = [1.0, 1.0, 1.0])) dt2 = convert(DataTable, [0.0 1.0; 0.0 1.0; @@ -28,25 +28,72 @@ module TestConstructors @test isequal(dt[:x1], NullableArray(dt2[:x1])) @test isequal(dt[:x2], NullableArray(dt2[:x2])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0], + @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]), 
+ x2 = NullableCategoricalVector([1.0, 1.0, 1.0]))) + @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]), + x2 = NullableCategoricalVector([1.0, 1.0, 1.0]), x3 = [2.0, 2.0, 2.0])[[:x1, :x2]]) dt = DataTable(Int, 2, 2) @test size(dt) == (2, 2) - @test eltypes(dt) == [Nullable{Int}, Nullable{Int}] + @test eltypes(dt) == [Int, Int] dt = DataTable([Int, Float64], [:x1, :x2], 2) @test size(dt) == (2, 2) - @test eltypes(dt) == [Nullable{Int}, Nullable{Float64}] - - @test isequal(dt, DataTable([Int, Float64], 2)) + @test eltypes(dt) == [Int, Float64] @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @test isequal(SubDataTable(DataTable(A=1), 1), DataTable(A=1)) @test isequal(SubDataTable(DataTable(A=1:10), 1:4), DataTable(A=1:4)) @test isequal(view(SubDataTable(DataTable(A=1:10), 1:4), [true, true, false, false]), DataTable(A=1:2)) + + @test DataTable(a=1, b=1:2) == DataTable(a=[1,1], b=[1,2]) + + @testset "associative" begin + dt = DataTable(Dict(k => v for (k,v) in zip([:A, :B], [1:3, 4:6]))) + @test dt == DataTable(A = 1:3, B = 4:6) + end + + @testset "recyclers" begin + @test DataTable([collect(1:10), collect(1:20)], [:x, :y]) == DataTable(x = vcat(1:10, 1:10), y = 1:20) + @test DataTable(a = 1:5, b = 1) == DataTable(a = collect(1:5), b = fill(1, 5)) + @test DataTable(a = 1, b = 1:5) == DataTable(a = fill(1, 5), b = collect(1:5)) + end + + @testset "constructor errors" begin + @test_throws DimensionMismatch DataTable(a=1, b=[]) + @test_throws DimensionMismatch DataTable(Any[collect(1:10)], DataTables.Index([:A, :B])) + end + + @testset "column types" begin + dt = DataTable(A = 1:3, B = 2:4, C = 3:5) + answer = Any[Array{Int,1}, Array{Int,1}, Array{Int,1}] + @test map(typeof, dt.columns) == answer + dt[:D] = NullableArray([4, 5, Nullable()]) + push!(answer, NullableArray{Int,1}) + @test map(typeof, dt.columns) == answer + dt[:E] = 'c' + push!(answer, Array{Char,1}) + @test 
map(typeof, dt.columns) == answer + end + + @testset "null conversions" begin + dt = DataTable(A = 1:3, B = 2:4, C = 3:5) + nullfree = Any[Array{Int,1},Array{Int,1},Array{Int,1}] + nullified = convert(Vector{Any}, fill(NullableArray{Int,1}, 3)) + @test map(typeof, nullify(dt).columns) == nullified + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == 0 + nullify!(dt) + @test map(typeof, dt.columns) == nullified + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == reduce(*, size(dt)) + @test map(typeof, denullify(dt).columns) == nullfree + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == reduce(*, size(dt)) + denullify!(dt) + map(typeof, dt.columns) == nullfree + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == 0 + + dt = DataTable(A = [Nullable(i) for i=1:10]) + @test denullify!(dt).columns == Any[[i for i=1:10]] + end end diff --git a/test/conversions.jl b/test/conversions.jl index a0afd0d..385b89d 100644 --- a/test/conversions.jl +++ b/test/conversions.jl @@ -35,8 +35,6 @@ module TestConversions @test isa(ai, Matrix{Int}) @test ai == convert(Matrix{Int}, dt) - dt[1,1] = Nullable() - @test_throws ErrorException convert(Array, dt) na = convert(NullableArray, dt) naa = convert(NullableArray{Any}, dt) nai = convert(NullableArray{Int}, dt) @@ -55,28 +53,28 @@ module TestConversions dt = convert(DataTable,di) @test isa(dt,DataTable) @test names(dt) == Symbol[x for x in sort(collect(keys(di)))] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + @test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) od = OrderedDict("c"=>c, "a"=>a, "b"=>b) dt = convert(DataTable,od) @test isa(dt, DataTable) @test names(dt) == Symbol[x for x in keys(od)] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + 
@test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) sd = SortedDict("c"=>c, "a"=>a, "b"=>b) dt = convert(DataTable,sd) @test isa(dt, DataTable) @test names(dt) == Symbol[x for x in keys(sd)] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + @test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) a = [1.0] di = Dict("a"=>a, "b"=>b, "c"=>c) - @test_throws ArgumentError convert(DataTable,di) + @test convert(DataTable,di)[:a] == [1.0, 1.0] end diff --git a/test/data.jl b/test/data.jl index 9259a6e..a59b2bc 100644 --- a/test/data.jl +++ b/test/data.jl @@ -46,9 +46,9 @@ module TestData dt6[3] = NullableArray(["un", "deux", "troix", "quatre"]) @test isequal(dt6[1, 3], Nullable("un")) dt6[:B] = [4, 3, 2, 1] - @test isequal(dt6[1,2], Nullable(4)) + @test dt6[1,2] == 4 dt6[:D] = [true, false, true, false] - @test isequal(dt6[1,4], Nullable(true)) + @test dt6[1,4] == true delete!(dt6, :D) @test names(dt6) == [:A, :B, :C] @test size(dt6, 2) == 3 @@ -74,7 +74,7 @@ module TestData @test size(sdt6d) == (2,1) #test_group("ref") - @test isequal(sdt6a[1,2], Nullable(4)) + @test sdt6a[1,2] == 4 #test_context("Within") #test_group("Associative") @@ -114,13 +114,14 @@ module TestData @test isequal(dt8[1:2, :d2], NullableCategoricalArray(["A", "B"])) @test size(dt8, 1) == 3 @test size(dt8, 2) == 5 - @test get(sum(dt8[:d1_length])) == N - @test all(dt8[:d1_length].values .> 0) - @test dt8[:d1_length].values == [4, 5, 11] + @test sum(dt8[:d1_length]) == N + @test all(dt8[:d1_length] .> 0) + @test dt8[2, :d1_length] == 5 + @test dt8[:d1_length] == [4, 5, 11] @test isequal(dt8, aggregate(groupby(dt7, :d2, sort=true), [sum, length])) - @test isequal(dt8[1, :d1_length], Nullable(4)) - @test isequal(dt8[2, :d1_length], Nullable(5)) - @test isequal(dt8[3, :d1_length], Nullable(11)) + @test dt8[1, :d1_length] == 4 + @test dt8[2, :d1_length] == 5 + @test dt8[3, 
:d1_length] == 11 @test isequal(dt8, aggregate(groupby(dt7, :d2), [sum, length], sort=true)) dt9 = dt7 |> groupby([:d2], sort=true) |> [sum, length] @@ -130,7 +131,7 @@ module TestData dt10 = DataTable( Any[[1:4;], [2:5;], ["a", "a", "a", "b" ], ["c", "d", "c", "d"]], - [:d1, :d2, :d3, :d4] + [:d1, :d2, :d3, :d4] ) gd = groupby(dt10, [:d3], sort=true) @@ -191,9 +192,9 @@ module TestData d1us = unstack(d1s, :id, :variable, :value) d1us2 = unstack(d1s2) d1us3 = unstack(d1s2, :variable, :value) - @test isequal(d1us[:a], d1[:a]) - @test isequal(d1us2[:d], d1[:d]) - @test isequal(d1us2[:3], d1[:d]) + @test d1us[:a] == d1[:a] + @test d1us2[:d] == d1[:d] + @test d1us2[:3] == d1[:d] @@ -215,7 +216,7 @@ module TestData v2 = randn(5)) m1 = join(dt1, dt2, on = :a, kind=:inner) - @test isequal(m1[:a], dt1[:a][dt1[:a].values .<= 5]) # preserves dt1 order + @test isequal(m1[:a], dt1[:a][dt1[:a] .<= 5]) # preserves dt1 order m2 = join(dt1, dt2, on = :a, kind = :outer) @test isequal(m2[:a], dt1[:a]) # preserves dt1 order @test isequal(m2[:b], dt1[:b]) # preserves dt1 order @@ -236,16 +237,16 @@ module TestData c = ["New World", "Old World", "New World"]) m1 = join(dt1, dt2, on = :a, kind = :inner) - @test isequal(m1[:a], NullableArray([1, 2])) + @test m1[:a] == [1, 2] m2 = join(dt1, dt2, on = :a, kind = :left) - @test isequal(m2[:a], NullableArray([1, 2, 3])) + @test m2[:a] == [1, 2, 3] m3 = join(dt1, dt2, on = :a, kind = :right) - @test isequal(m3[:a], NullableArray([1, 2, 4])) + @test m3[:a] == [1, 2, 4] m4 = join(dt1, dt2, on = :a, kind = :outer) - @test isequal(m4[:a], NullableArray([1, 2, 3, 4])) + @test m4[:a] == [1, 2, 3, 4] # test with nulls (issue #185) dt1 = DataTable() @@ -271,13 +272,6 @@ module TestData v1 = randn(10) ) - dt2 = DataTable( - a = [:x,:y][[1,2,1,1,2]], - b = [:A,:B,:C][[1,1,1,2,3]], - v2 = randn(5) - ) - dt2[1,:a] = Nullable() - # # TODO: Restore this functionality # m1 = join(dt1, dt2, on = [:a,:b]) # @test isequal(m1[:a], NullableArray(["x", "x", "y", 
"y", fill("x", 5)])) diff --git a/test/datatable.jl b/test/datatable.jl index c75f5fe..95ea0a1 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -39,17 +39,17 @@ module TestDataTable dtdc = deepcopy(dt) dt[1, :a] = 4 - get(dt[1, :b])[:e] = 5 + dt[1, :b][:e] = 5 names!(dt, [:f, :g]) @test names(dtc) == [:a, :b] @test names(dtdc) == [:a, :b] - @test get(dtc[1, :a]) === 4 - @test get(dtdc[1, :a]) === 2 + @test dtc[1, :a] === 4 + @test dtdc[1, :a] === 2 - @test names(get(dtc[1, :b])) == [:c, :e] - @test names(get(dtdc[1, :b])) == [:c] + @test names(dtc[1, :b]) == [:c, :e] + @test names(dtdc[1, :b]) == [:c] # @@ -69,18 +69,11 @@ module TestDataTable # Insert single value x[:d] = 3 - @test isequal(x[:d], NullableArray([3, 3, 3])) + @test x[:d] == [3, 3, 3] x0[:d] = 3 @test x0[:d] == Int[] - # similar / nulls - dt = DataTable(a = 1, b = "b", c = CategoricalArray([3.3])) - nulldt = DataTable(a = NullableArray{Int}(2), - b = NullableArray{String}(2), - c = NullableCategoricalArray{Float64}(2)) - @test isequal(nulldt, similar(dt, 2)) - # Associative methods dt = DataTable(a=[1, 2], b=[3., 4.]) @@ -99,9 +92,9 @@ module TestDataTable @test_throws ErrorException insert!(dt, 1, ["a"], :newcol) @test isequal(insert!(dt, 1, ["a", "b"], :newcol), dt) @test names(dt) == [:newcol, :a, :b] - @test isequal(dt[:a], NullableArray([1, 2])) - @test isequal(dt[:b], NullableArray([3., 4.])) - @test isequal(dt[:newcol], ["a", "b"]) + @test dt[:a] == [1, 2] + @test dt[:b] == [3., 4.] 
+ @test dt[:newcol] == ["a", "b"] dt = DataTable(a=[1, 2], b=[3., 4.]) dt2 = DataTable(b=["a", "b"], c=[:c, :d]) @@ -112,43 +105,45 @@ module TestDataTable dt = DataTable(Int, 10, 3) @test size(dt, 1) == 10 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Int} - @test typeof(dt[:, 3]) == NullableVector{Int} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) - - dt = DataTable(Any[Int, Float64, String], 100) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Int} + @test typeof(dt[:, 3]) == Vector{Int} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + @test !anynull(dt[:, 3]) + + dt = DataTable([Int, Float64, String], 100) @test size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableVector{String} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) - - dt = DataTable(Any[Int, Float64, String], [:A, :B, :C], 100) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == Vector{String} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # array of #undef + # @test !anynull(dt[:, 3]) + + dt = DataTable([Int, Float64, String], [:A, :B, :C], 100) @test size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableVector{String} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == Vector{String} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # array of #undef + # @test !anynull(dt[:, 3]) dt = DataTable(DataType[Int, Float64, Compat.UTF8String],[:A, :B, :C], [false,false,true],100) @test 
size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == CategoricalVector{Compat.UTF8String,UInt32} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # @test !anynull(dt[:, 3]) dt = convert(DataTable, zeros(10, 5)) @@ -166,25 +161,9 @@ module TestDataTable @test size(dt, 2) == 5 @test typeof(dt[:, 1]) == Vector{Float64} - #test_group("Other DataTable constructors") - dt = DataTable([@compat(Dict{Any,Any}(:a=>1, :b=>'c')), - @compat(Dict{Any,Any}(:a=>3, :b=>'d')), - @compat(Dict{Any,Any}(:a=>5))]) - @test size(dt, 1) == 3 - @test size(dt, 2) == 2 - @test typeof(dt[:,:a]) == NullableVector{Int} - @test typeof(dt[:,:b]) == NullableVector{Char} - - dt = DataTable([@compat(Dict{Any,Any}(:a=>1, :b=>'c')), - @compat(Dict{Any,Any}(:a=>3, :b=>'d')), - @compat(Dict{Any,Any}(:a=>5))], - [:a, :b]) - @test size(dt, 1) == 3 - @test size(dt, 2) == 2 - @test typeof(dt[:,:a]) == NullableVector{Int} - @test typeof(dt[:,:b]) == NullableVector{Char} - - @test DataTable(NullableArray[[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataTable(A = [1,2,3], B = [2.5,4.5,6.5]) + # test_group("Other DataTable constructors") + + @test DataTable([[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataTable(A = [1,2,3], B = [2.5,4.5,6.5]) # This assignment was missing before dt = DataTable(Column = [:A]) @@ -307,7 +286,6 @@ module TestDataTable @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()])) end - #Check the output of unstack dt = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), Key = ["Mass", "Color", "Mass", "Color"], Value = ["12 g", "Red", "18 g", "Grey"]) @@ -318,27 +296,28 @@ module 
TestDataTable #Unstack without specifying a row column dt3 = unstack(dt,:Key, :Value) #The expected output - dt4 = DataTable(Fish = ["XXX", "Bob", "Batman"], - Color = Nullable{String}[Nullable(), "Red", "Grey"], - Mass = Nullable{String}[Nullable(), "12 g", "18 g"]) + dt4 = DataTable(Fish = ["Batman", "Bob", "XXX"], + Color = NullableArray(["Grey", "Red", Nullable()]), + Mass = NullableArray(["18 g", "12 g", Nullable()])) @test isequal(dt2, dt4) - @test isequal(dt3, dt4[2:3, :]) + @test isequal(dt3, denullify!(dt4[2:-1:1, :])) + # can't assign Nullable() to a typed column #Make sure unstack works with NULLs at the start of the value column - dt[1,:Value] = Nullable() + # dt[1,:Value] = Nullable() dt2 = unstack(dt,:Fish, :Key, :Value) #This changes the expected result dt4[2,:Mass] = Nullable() - @test isequal(dt2, dt4) + @test !isequal(dt2, dt4) dt = DataTable(A = 1:10, B = 'A':'J') @test !(dt[:,:] === dt) @test append!(DataTable(A = 1:2, B = 1:2), DataTable(A = 3:4, B = 3:4)) == DataTable(A=1:4, B = 1:4) - @test !any(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6)).columns) - @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1,2]).columns) - @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A,:B]).columns) - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A]).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), :A).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1]).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 1).columns) == [1] + @test !any(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6)).columns) + @test all(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1,2]).columns) + @test all(c -> isa(c, 
CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A,:B]).columns) + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A]).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), :A).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1]).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 1).columns) == [1] end diff --git a/test/grouping.jl b/test/grouping.jl index 9e1ab41..fa9d505 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -54,35 +54,62 @@ module TestGrouping @test groupby(DataTable(A=Int[1]), :A).starts == Int[1] # issue #960 - x = CategoricalArray(collect(1:20)) + x = categorical(collect(1:20)) dt = DataTable(v1=x, v2=x) groupby(dt, [:v1, :v2]) - dt2 = by(e->1, DataTable(x=Int64[]), :x) - @test size(dt2) == (0,1) - @test isequal(sum(dt2[:x]), Nullable(0)) + # what is this testting? + # dt2 = by(e->1, DataTable(x=Int64[]), :x) + # @test size(dt2) == (0,1) + # @test sum(dt2[:x]) == 0 # Check that reordering levels does not confuse groupby - dt = DataTable(Key1 = CategoricalArray(["A", "A", "B", "B"]), - Key2 = CategoricalArray(["A", "B", "A", "B"]), + dt = DataTable(Key1 = categorical(["A", "A", "B", "B"]), + Key2 = categorical(["A", "B", "A", "B"]), Value = 1:4) gd = groupby(dt, :Key1) - @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) - @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A", "A"]), + Key2 = categorical(["A", "B"]), + Value = collect(1:2)) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["B", "B"]), + Key2 = categorical(["A", "B"]), + Value = collect(3:4)) gd = groupby(dt, [:Key1, :Key2]) - @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1)) - @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2)) - @test 
isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3)) - @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["A"]), + Value = [1]) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["B"]), + Value = [2]) + @test gd[3].parent[gd[3].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["A"]), + Value = [3]) + @test gd[4].parent[gd[4].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["B"]), + Value = [4]) # Reorder levels, add unused level levels!(dt[:Key1], ["Z", "B", "A"]) levels!(dt[:Key2], ["Z", "B", "A"]) gd = groupby(dt, :Key1) - @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) - @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A", "A"]), + Key2 = categorical(["A", "B"]), + Value = collect(1:2)) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["B", "B"]), + Key2 = categorical(["A", "B"]), + Value = collect(3:4)) gd = groupby(dt, [:Key1, :Key2]) - @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1)) - @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2)) - @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3)) - @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["A"]), + Value = [1]) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["B"]), + Value = [2]) + @test gd[3].parent[gd[3].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["A"]), + Value = [3]) + @test gd[4].parent[gd[4].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["B"]), + Value = [4]) + + @test names(gd) == names(dt) end diff --git a/test/index.jl 
b/test/index.jl index 484b434..5f8a930 100644 --- a/test/index.jl +++ b/test/index.jl @@ -57,6 +57,6 @@ end dt = DataTable(A=[0],B=[0]) dt[1:end] = 0.0 dt[1,:A] = 1.0 -@test dt[1,:B] === Nullable(0) +@test dt[1,:B] === 0 end diff --git a/test/iteration.jl b/test/iteration.jl index 365b44b..7686428 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -9,37 +9,37 @@ module TestIteration for row in eachrow(dt) @test isa(row, DataTableRow) - @test isequal(row[:B]-row[:A], Nullable(1)) + @test row[:B]-row[:A] == 1 # issue #683 (https://github.com/JuliaStats/DataFrames.jl/pull/683) @test typeof(collect(row)) == @compat Array{Tuple{Symbol, Any}, 1} end for col in eachcol(dt) - @test isa(col, @compat Tuple{Symbol, NullableVector}) + @test isa(col, Tuple{Symbol,Vector{Int}}) end - @test isequal(map(x -> minimum(convert(Array, x)), eachrow(dt)), Any[1,2]) + @test isequal(map(x -> minimum(convert(Array, x)), eachrow(dt)), [1,2]) @test isequal(map(minimum, eachcol(dt)), DataTable(A = [1], B = [2])) row = DataTableRow(dt, 1) row[:A] = 100 - @test isequal(dt[1, :A], Nullable(100)) + @test dt[1, :A] == 100 row[1] = 101 - @test isequal(dt[1, :A], Nullable(101)) + @test dt[1, :A] == 101 dt = DataTable(A = 1:4, B = ["M", "F", "F", "M"]) s1 = view(dt, 1:3) s1[2,:A] = 4 - @test isequal(dt[2, :A], Nullable(4)) + @test dt[2, :A] == 4 @test isequal(view(s1, 1:2), view(dt, 1:2)) s2 = view(dt, 1:2:3) s2[2, :B] = "M" - @test isequal(dt[3, :B], Nullable("M")) + @test dt[3, :B] == "M" @test isequal(view(s2, 1:1:2), view(dt, [1,3])) # @test_fail for x in dt; end # Raises an error diff --git a/test/join.jl b/test/join.jl index 0ac3fe6..3838cd8 100644 --- a/test/join.jl +++ b/test/join.jl @@ -2,8 +2,8 @@ module TestJoin using Base.Test using DataTables - name = DataTable(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Joe Blogs"]) - job = DataTable(ID = [1, 2, 2, 4], Job = ["Lawyer", "Doctor", "Florist", "Farmer"]) + name = DataTable(ID = [1, 2, 3], Name = NullableArray(["John Doe", "Jane 
Doe", "Joe Blogs"])) + job = DataTable(ID = [1, 2, 2, 4], Job = NullableArray(["Lawyer", "Doctor", "Florist", "Farmer"])) # Join on symbols or vectors of symbols join(name, job, on = :ID) @@ -14,8 +14,8 @@ module TestJoin # Test output of various join types outer = DataTable(ID = [1, 2, 2, 3, 4], - Name = NullableArray(Nullable{String}["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), - Job = NullableArray(Nullable{String}["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) + Name = NullableArray(["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), + Job = NullableArray(["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) # (Tests use current column ordering but don't promote it) right = outer[Bool[!isnull(x) for x in outer[:Job]], [:ID, :Name, :Job]] @@ -104,9 +104,9 @@ module TestJoin # Test that Array{Nullable} works when combined with NullableArray (#1088) dt = DataTable(Name = Nullable{String}["A", "B", "C"], Mass = [1.5, 2.2, 1.1]) - dt2 = DataTable(Name = ["A", "B", "C", "A"], + dt2 = DataTable(Name = Nullable{String}["A", "B", "C", "A"], Quantity = [3, 3, 2, 4]) - @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = ["A", "B", "C", "A"], + @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = Nullable{String}["A", "B", "C", "A"], Quantity = [3, 3, 2, 4], Mass = [1.5, 2.2, 1.1, 1.5]) @@ -114,7 +114,7 @@ module TestJoin dt = DataTable([collect(1:10), collect(2:11)], [:x, :y]) dtnull = DataTable(x = 1:10, z = 3:12) @test join(dt, dtnull, on = :x) == - DataTable([collect(1:10), collect(2:11), NullableArray(3:12)], [:x, :y, :z]) + DataTable([collect(1:10), collect(2:11), collect(3:12)], [:x, :y, :z]) @test join(dtnull, dt, on = :x) == - DataTable([NullableArray(1:10), NullableArray(3:12), NullableArray(2:11)], [:x, :z, :y]) + DataTable([collect(1:10), collect(3:12), collect(2:11)], [:x, :z, :y]) end From 4a939fe5c46d954e14a7fb861e9d6a9f56a71cbe Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sun, 12 Mar 2017 
22:36:21 -0700 Subject: [PATCH 02/43] make vcat error more informative --- src/abstractdatatable/abstractdatatable.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index ef98fd5..66e14f6 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -724,7 +724,12 @@ function Base.vcat(dts::AbstractDataTable...) if length(uniqueheaders) == 0 return DataTable() elseif length(unique(map(length, uniqueheaders))) > 1 - throw(ArgumentError("not all DataTables have the same number of columns. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) + estring = Vector{String}(length(uniqueheaders)) + for (i,u) in enumerate(uniqueheaders) + indices = string.(find(x -> x == u, allheaders)) + estring[i] = "columns ($(join(u, ", "))) of input(s) ($(join(indices, ", ")))" + end + throw(ArgumentError(join(estring, " != "))) elseif length(uniqueheaders) > 1 throw(ArgumentError("Column names do not match. Use `rename!` or `names!` to adjust columns names. Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) else From f5a53a1fed7118b6f0a4b9c148db37e0d8478c52 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sun, 12 Mar 2017 23:16:51 -0700 Subject: [PATCH 03/43] add docstring for vcat --- src/abstractdatatable/abstractdatatable.jl | 29 +++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 66e14f6..104fac0 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -710,11 +710,30 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2) Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...) 
Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...) -# vcat only accepts DataTables. Finds union of columns, maintaining order -# of first dt. Missing data become null values. +""" + vcat(dts::AbstractDataTable...) -Base.vcat(dt::AbstractDataTable) = dt +Vertically concatenate `AbstractDataTables` with matching columns. + +```julia +julia> dt1 = DataTable(A=1:3, B=1:3); dt2 = DataTable(A=4:6, B=4:6); dt3 = DataTable(A=7:9, B=7:9, C=7:9); +julia> vcat(dt1, dt2) +6×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ +│ 4 │ 4 │ 4 │ +│ 5 │ 5 │ 5 │ +│ 6 │ 6 │ 6 │ + +julia> vcat(dt1, dt2, dt3) +ERROR: ArgumentError: columns (A, B) of input(s) (1, 2) != columns (A, B, C) of input(s) (3) +``` +""" +Base.vcat(dt::AbstractDataTable) = dt function Base.vcat(dts::AbstractDataTable...) isempty(dts) && return DataTable() allheaders = map(names, dts) @@ -723,15 +742,13 @@ function Base.vcat(dts::AbstractDataTable...) uniqueheaders = unique(allheaders[notempty]) if length(uniqueheaders) == 0 return DataTable() - elseif length(unique(map(length, uniqueheaders))) > 1 + elseif length(uniqueheaders) > 1 estring = Vector{String}(length(uniqueheaders)) for (i,u) in enumerate(uniqueheaders) indices = string.(find(x -> x == u, allheaders)) estring[i] = "columns ($(join(u, ", "))) of input(s) ($(join(indices, ", ")))" end throw(ArgumentError(join(estring, " != "))) - elseif length(uniqueheaders) > 1 - throw(ArgumentError("Column names do not match. Use `rename!` or `names!` to adjust columns names. 
Resolve column(s): $(setdiff(union(allheaders...), intersect(allheaders...)))")) else header = uniqueheaders[1] dts_to_vcat = dts[notempty] From 2c95f13be47e4ba0e056cc47444ab43ad3ff1bf3 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 11:29:05 -0700 Subject: [PATCH 04/43] incorporate edits suggested during review --- src/abstractdatatable/abstractdatatable.jl | 10 ++- src/abstractdatatable/io.jl | 2 +- src/abstractdatatable/join.jl | 19 ++++- src/abstractdatatable/reshape.jl | 7 +- src/datatable/datatable.jl | 84 ++++++++++++++-------- test/cat.jl | 27 +++---- test/constructors.jl | 20 +++--- test/conversions.jl | 2 +- 8 files changed, 107 insertions(+), 64 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 104fac0..19bfe20 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -777,6 +777,7 @@ end Convert columns with a `Nullable` element type without any null values to a non-`Nullable` equivalent array type. The table `dt` is modified in place. +`NullableVectors` are aliased to their `values` field. # Examples @@ -805,12 +806,12 @@ julia> eltypes(dt) Int64 ``` -See also [`denullify`](@ref) & [`nullify!`](@ref). +See also [`denullify`](@ref) and [`nullify!`](@ref). """ function denullify!(dt::AbstractDataTable) for i in 1:size(dt,2) if !anynull(dt[i]) - dt[i] = dropnull(dt[i]) + dt[i] = dropnull!(dt[i]) end end dt @@ -889,11 +890,14 @@ See also [`nullify`](@ref) & [`denullify!`](@ref). 
""" function nullify!(dt::AbstractDataTable) for i in 1:size(dt,2) - dt[i] = NullableArray(dt[i]) + dt[i] = nullify(dt[i]) end dt end +nullify(x::AbstractArray) = convert(NullableArray, x) +nullify(x::AbstractCategoricalArray) = convert(NullableCategoricalArray, x) + """ nullify(dt::AbstractDataTable) diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 7d14196..a24493e 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -45,7 +45,7 @@ function printtable(io::IO, if !isnull(dt[j][i]) if ! (etypes[j] <: Real) print(io, quotemark) - x = isa(dt[i, j], Nullable) ? get(dt[i, j]) : dt[i, j] + x = isa(dt[i, j], Nullable) ? _unsafe_get(dt[i, j]) : dt[i, j] escapedprint(io, x, quotestr) print(io, quotemark) else diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 94e9f1d..ede5c77 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -2,6 +2,19 @@ ## Join / merge ## +# Like similar, but returns a nullable array +similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableArray(T, dims) + +similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableArray(eltype(T), dims) + +similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalArray(T, dims) + +similar_nullable(dt::AbstractDataTable, dims::Int) = + DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) + # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} dtl::DT1 @@ -64,9 +77,9 @@ function compose_joined_table(joiner::DataTableJoiner, end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) resizelen = length(all_orig_right_ixs)+length(leftonly_ixs) - rightcols = Any[length(col[all_orig_right_ixs]) >= resizelen ? 
- resize!(col[all_orig_right_ixs], resizelen)[right_perm] : - NullableArray(vcat(col[all_orig_right_ixs], fill(Nullable(), resizelen - length(col[all_orig_right_ixs]))))[right_perm] + rightcols = Any[length(all_orig_right_ixs) >= resizelen ? + resize!(col[all_orig_right_ixs], resizelen)[right_perm] : + copy!(similar_nullable(col[all_orig_right_ixs], resizelen), col[all_orig_right_ixs])[right_perm] for col in columns(dtr_noon)] right_dt = DataTable(rightcols, names(dtr_noon)) # merge left and right parts of the joined table diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 60fb485..a537cca 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -204,14 +204,19 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) end payload = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) + nowarning = true for k in 1:nrow(dt) j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) if i > 0 && j > 0 + if nowarning && !isnull(payload[j][i]) + warn("Duplicate entries in unstack.") + nowarning = false + end payload[j][i] = valuecol[k] end end - denullify!(insert!(payload, 1, levels(refkeycol), _names(dt)[rowkey])) + denullify!(insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey])) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index c39feb2..eed2e0a 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -77,25 +77,42 @@ type DataTable <: AbstractDataTable if length(columns) == length(colindex) == 0 return new(Vector{Any}(0), Index()) elseif length(columns) != length(colindex) - throw(DimensionMismatch("Number of columns and column names are different")) + throw(DimensionMismatch("Number of 
columns ($(length(columns))) and column names ($(length(colindex))) are not equal")) end + # do we allow people assigning arrays to columns now? + # make sure that doesn't work + # can use !get(size(c, 2), 0) lengths = length.(columns) minlen, maxlen = extrema(lengths) if minlen == 0 && maxlen == 0 return new(columns, colindex) - elseif (minlen == 0 && maxlen > 0) || any(x -> x != 0, mod(maxlen, lengths)) - throw(DimensionMismatch("Incompatible lengths of arguments")) - else - for i in 1:length(columns) - if isa(columns[i], Range) - columns[i] = collect(columns[i]) + elseif minlen != maxlen + # recycle scalars + if minlen == 1 && maxlen > 1 + indices = find(lengths .== minlen) + for i in indices + if !(typeof(columns[i]) <: AbstractArray) + columns[i] = fill(columns[i], maxlen) + lengths[i] = maxlen + end end - repeats = div(maxlen, length(columns[i])) - if repeats == 1 && !(typeof(columns[i]) <: AbstractVector) - columns[i] = [columns[i]] - elseif repeats !== 1 - columns[i] = isa(columns[i], Array) ? repeat(columns[i], outer=repeats) : fill(columns[i], repeats) + end + uniques = unique(lengths) + if length(uniques) != 1 + estring = Vector{String}(length(uniques)) + strnames = string.(names(colindex)) + for (i,u) in enumerate(uniques) + indices = find(lengths .== u) + estring[i] = "column length ($(lengths[1])) for column(s) ($(join(strnames[indices], ", ")))" end + throw(DimensionMismatch(join(estring, " is incompatible with "))) + end + end + for (i,c) in enumerate(columns) + if isa(c, Range) + columns[i] = collect(c) + elseif !isa(c, AbstractVector) + columns[i] = size(c, 2) > 1 ? reshape(c, length(c)) : [c] end end return new(columns, colindex) @@ -106,14 +123,18 @@ function DataTable(; kwargs...) 
if length(kwargs) == 0 return DataTable(Any[], Index()) end - columns = Any[v for (k,v) in kwargs] - colindex = DataTables.Index([k for (k,v) in kwargs]) - DataTable(columns, colindex) + colnames = Vector{Symbol}(length(kwargs)) + columns = Vector{Any}(length(kwargs)) + for (i,(k,v)) in enumerate(kwargs) + colnames[i] = Symbol(k) + columns[i] = v + end + DataTable(columns, Index(colnames)) end function DataTable(columns::AbstractVector, - cnames::AbstractVector{Symbol} = gennames(length(columns))) - return DataTable(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames))) + cnames::Vector{Symbol} = gennames(length(columns))) + return DataTable(convert(Vector{Any}, columns), Index(cnames)) end @@ -128,37 +149,40 @@ function DataTable(t::Type, nrows::Integer, ncols::Integer) end # Initialize an empty DataTable with specific eltypes and names -function DataTable(column_eltypes::Vector, cnames::Vector, nrows::Integer) +function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - columns[j] = Vector{column_eltypes[j]}(nrows) + T = column_eltypes[j] + columns[j] = T <: Nullable ? NullableArray{eltype(T)}(nrows) : Vector{T}(nrows) end return DataTable(columns, Index(cnames)) end # Initialize an empty DataTable with specific eltypes and names # and whether a nominal array should be created -function DataTable(column_eltypes::Vector, cnames::Vector, +function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - if nominal[j] - columns[j] = CategoricalVector{column_eltypes[j]}(nrows) - else - columns[j] = Vector{column_eltypes[j]}(nrows) - end + T = column_eltypes[j] + if nominal[j] + columns[j] = T <: Nullable ? NullableCategoricalArray{T}(nrows) : CategoricalVector{T}(nrows) + else + columns[j] = T <: Nullable ? 
NullableArray{T}(nrows) : Vector{T}(nrows) + end end return DataTable(columns, Index(cnames)) end # Initialize an empty DataTable with specific eltypes -function DataTable(column_eltypes::Vector, nrows::Integer) +function DataTable(column_eltypes::Vector{DataType}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) cnames = gennames(p) for j in 1:p - columns[j] = Vector{column_eltypes[j]}(nrows) + T = column_eltypes[j] + columns[j] = T <: Nullable ? NullableArray{T}(nrows) : Vector{T}(nrows) end return DataTable(columns, Index(cnames)) end @@ -806,8 +830,10 @@ function Base.convert(::Type{DataTable}, A::Matrix) end function Base.convert(::Type{DataTable}, d::Associative) - colnames = collect(keys(d)) - isa(d, Dict) && sort!(colnames) + colnames = keys(d) + if isa(d, Dict) + colnames = sort!(collect(colnames)) + end colindex = Index([Symbol(k) for k in colnames]) columns = Any[d[c] for c in colnames] DataTable(columns, colindex) diff --git a/test/cat.jl b/test/cat.jl index 8586767..f26b8e7 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -72,14 +72,14 @@ module TestCat dt[1:2, 1:2] = [3,2] dt[[true,false,false,true], 2:3] = [2,3] - vcat([]) - vcat(null_dt) - vcat(null_dt, null_dt) - vcat(null_dt, dt) - vcat(dt, null_dt) - vcat(dt, dt) - vcat(dt, dt, dt) - @test vcat(DataTable()) == DataTable() + @test vcat(null_dt) == DataTable() + @test vcat(null_dt, null_dt) == DataTable() + @test vcat(null_dt, dt) == dt + @test vcat(dt, null_dt) == dt + @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt)), (Float64, Float64, Int))) + @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2)) + @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt, dt)), (Float64, Float64, Int))) + @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2)) alt_dt = deepcopy(dt) vcat(dt, alt_dt) @@ -94,14 +94,8 @@ module TestCat @test isequal(dtr, [dt4; dt4]) # Eltype promotion - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= 
v"0.5.0-dev" - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] - else - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Any] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}] - end + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64] + @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] # Minimal container type promotion dta = DataTable(a = CategoricalArray([1, 2, 2])) @@ -109,6 +103,7 @@ module TestCat dtc = DataTable(a = NullableArray([2, 3, 4])) dtd = DataTable(Any[2:4], [:a]) dtab = vcat(dta, dtb) + dtac = vcat(nullify(dta), dtc) @test isequal(dtab[:a], [1, 2, 2, 2, 3, 4]) @test isa(dtab[:a], CategoricalVector{Int}) dc = vcat(dtd, dtc) diff --git a/test/constructors.jl b/test/constructors.jl index 70500c6..2c080eb 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -18,8 +18,6 @@ module TestConstructors @test isequal(dt, DataTable(Any[NullableCategoricalVector(zeros(3)), NullableCategoricalVector(ones(3))])) - @test !isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) dt2 = convert(DataTable, [0.0 1.0; 0.0 1.0; @@ -28,19 +26,21 @@ module TestConstructors @test isequal(dt[:x1], NullableArray(dt2[:x1])) @test isequal(dt[:x2], NullableArray(dt2[:x2])) - @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]), - x2 = NullableCategoricalVector([1.0, 1.0, 1.0]))) - @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]), - x2 = NullableCategoricalVector([1.0, 1.0, 1.0]), + @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]), + x2 = NullableArray([1.0, 1.0, 1.0]))) + @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]), + x2 = NullableArray([1.0, 1.0, 1.0]), x3 = [2.0, 2.0, 2.0])[[:x1, :x2]]) dt = 
DataTable(Int, 2, 2) @test size(dt) == (2, 2) @test eltypes(dt) == [Int, Int] - dt = DataTable([Int, Float64], [:x1, :x2], 2) + dt = DataTable([Nullable{Int}, Nullable{Float64}], [:x1, :x2], 2) @test size(dt) == (2, 2) - @test eltypes(dt) == [Int, Float64] + @test eltypes(dt) == [Nullable{Int}, Nullable{Float64}] + + @test isequal(dt, DataTable([Nullable{Int}, Nullable{Float64}], 2)) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @@ -51,12 +51,12 @@ module TestConstructors @test DataTable(a=1, b=1:2) == DataTable(a=[1,1], b=[1,2]) @testset "associative" begin - dt = DataTable(Dict(k => v for (k,v) in zip([:A, :B], [1:3, 4:6]))) + dt = DataTable(Dict(:A => 1:3, :B => 4:6)) @test dt == DataTable(A = 1:3, B = 4:6) + @test all(e -> e <: Int, eltypes(dt)) end @testset "recyclers" begin - @test DataTable([collect(1:10), collect(1:20)], [:x, :y]) == DataTable(x = vcat(1:10, 1:10), y = 1:20) @test DataTable(a = 1:5, b = 1) == DataTable(a = collect(1:5), b = fill(1, 5)) @test DataTable(a = 1, b = 1:5) == DataTable(a = fill(1, 5), b = collect(1:5)) end diff --git a/test/conversions.jl b/test/conversions.jl index 385b89d..8bf9465 100644 --- a/test/conversions.jl +++ b/test/conversions.jl @@ -73,7 +73,7 @@ module TestConversions @test isequal(dt[:b], b) @test isequal(dt[:c], c) - a = [1.0] + a = 1.0 di = Dict("a"=>a, "b"=>b, "c"=>c) @test convert(DataTable,di)[:a] == [1.0, 1.0] From 412ceaa6fa3bd978861fb46d65a4c3dd01c432d1 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 11:36:51 -0700 Subject: [PATCH 05/43] _unsafe_get -> NullableArrays.unsafe_get --- src/abstractdatatable/io.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index a24493e..03174d9 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -45,7 +45,7 @@ function printtable(io::IO, if !isnull(dt[j][i]) if ! 
(etypes[j] <: Real) print(io, quotemark) - x = isa(dt[i, j], Nullable) ? _unsafe_get(dt[i, j]) : dt[i, j] + x = isa(dt[i, j], Nullable) ? NullableArrays.unsafe_get(dt[i, j]) : dt[i, j] escapedprint(io, x, quotestr) print(io, quotemark) else @@ -168,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = isa(cell, Nullable) ? get(cell) : cell + content = isa(cell, Nullable) ? NullableArrays.unsafe_get(cell) : cell if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else From cc95658a60e2e8c3d25a9958e09a0d0a5617e246 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 12:14:54 -0700 Subject: [PATCH 06/43] fix new tests from master --- test/grouping.jl | 16 ++++++++-------- test/io.jl | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/grouping.jl b/test/grouping.jl index 6fd058f..b7e22a5 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -13,8 +13,8 @@ module TestGrouping @testset "colwise" begin @testset "::Function, ::AbstractDataTable" begin cw = colwise(sum, dt) - answer = NullableArray([20, 12, -0.4283098098931877]) - @test isa(cw, NullableArray{Any, 1}) + answer = Real[20, 12, -0.4283098098931877] + @test isa(cw, Array{Real, 1}) @test size(cw) == (ncol(dt),) @test isequal(cw, answer) @@ -32,8 +32,8 @@ module TestGrouping @testset "::Vector, ::AbstractDataTable" begin cw = colwise([sum], dt) - answer = NullableArray([20 12 -0.4283098098931877]) - @test isa(cw, NullableArray{Any, 2}) + answer = Real[20 12 -0.4283098098931877] + @test isa(cw, Array{Real, 2}) @test size(cw) == (length([sum]),ncol(dt)) @test isequal(cw, answer) @@ -59,8 +59,8 @@ module TestGrouping @testset "::Tuple, ::AbstractDataTable" begin cw = colwise((sum, length), dt) - answer = Any[Nullable(20) Nullable(12) Nullable(-0.4283098098931877); 8 8 8] - @test isa(cw, Array{Any, 2}) + answer = Real[20 12 -0.4283098098931877; 8 8 8] + 
@test isa(cw, Array{Real, 2}) @test size(cw) == (length((sum, length)), ncol(dt)) @test isequal(cw, answer) @@ -87,11 +87,11 @@ module TestGrouping @testset "::Function" begin cw = map(colwise(sum), (nullfree, dt)) - answer = ([55], NullableArray(Any[20, 12, -0.4283098098931877])) + answer = ([55], Real[20, 12, -0.4283098098931877]) @test isequal(cw, answer) cw = map(colwise((sum, length)), (nullfree, dt)) - answer = (reshape([55, 10], (2,1)), Any[Nullable(20) Nullable(12) Nullable(-0.4283098098931877); 8 8 8]) + answer = (reshape([55, 10], (2,1)), Real[20 12 -0.4283098098931877; 8 8 8]) @test isequal(cw, answer) cw = map(colwise([sum, length]), (nullfree, dt)) diff --git a/test/io.jl b/test/io.jl index 949cb27..1023c3f 100644 --- a/test/io.jl +++ b/test/io.jl @@ -48,6 +48,6 @@ module TestIO F = NullableArray(fill(Nullable(), 26)), G = fill(Nullable(), 26)) - answer = Sys.WORD_SIZE == 64 ? 0xde54e70f51205910 : 0x340524cd + answer = Sys.WORD_SIZE == 64 ? 0xd4b5a035796ad770 : 0x1950ccd7 @test hash(sprint(printtable, dt)) == answer end From 06dc914bf5ebc03bebb007630cf2e7711ff75319 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 22:54:45 -0700 Subject: [PATCH 07/43] remove RepeatedVector, StackedVector, unstackdt, meltdt --- docs/src/lib/manipulation.md | 2 - docs/src/man/reshaping_and_pivoting.md | 23 -- src/DataTables.jl | 2 - src/abstractdatatable/abstractdatatable.jl | 5 +- src/abstractdatatable/reshape.jl | 320 ++------------------- src/datatable/datatable.jl | 12 +- src/deprecated.jl | 4 +- test/data.jl | 16 +- test/datatable.jl | 15 +- test/show.jl | 7 - 10 files changed, 40 insertions(+), 366 deletions(-) diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md index c67345a..8d24d4b 100644 --- a/docs/src/lib/manipulation.md +++ b/docs/src/lib/manipulation.md @@ -20,6 +20,4 @@ join melt stack unstack -stackdt -meltdt ``` diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 
1b936e1..d99e814 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -53,29 +53,6 @@ If the remaining columns are unique, you can skip the id variable and use: widedt = unstack(longdt, :variable, :value) ``` -`stackdt` and `meltdt` are two additional functions that work like `stack` and `melt`, but they provide a view into the original wide DataTable. Here is an example: - -```julia -d = stackdt(iris) -``` - -This saves memory. To create the view, several AbstractVectors are defined: - -`:variable` column -- `EachRepeatedVector` -This repeats the variables N times where N is the number of rows of the original AbstractDataTable. - -`:value` column -- `StackedVector` -This is provides a view of the original columns stacked together. - -Id columns -- `RepeatedVector` -This repeats the original columns N times where N is the number of columns stacked. - -For more details on the storage representation, see: - -```julia -dump(stackdt(iris)) -``` - None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: ```julia diff --git a/src/DataTables.jl b/src/DataTables.jl index 799f7f6..4b89a3b 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -57,7 +57,6 @@ export @~, eltypes, groupby, melt, - meltdt, names!, ncol, nonunique, @@ -71,7 +70,6 @@ export @~, rename, showcols, stack, - stackdt, unique!, unstack, head, diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 19bfe20..583ecba 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -777,7 +777,6 @@ end Convert columns with a `Nullable` element type without any null values to a non-`Nullable` equivalent array type. The table `dt` is modified in place. -`NullableVectors` are aliased to their `values` field. 
# Examples @@ -852,7 +851,7 @@ julia> eltypes(dt) See also [`denullify!`] & [`nullify`](@ref). """ -denullify(dt::AbstractDataTable) = denullify!(copy(dt)) +denullify(dt::AbstractDataTable) = denullify!(deepcopy(dt)) """ nullify!(dt::AbstractDataTable) @@ -933,7 +932,7 @@ julia> eltypes(dt) See also [`nullify!`](@ref) & [`denullify`](@ref). """ function nullify(dt::AbstractDataTable) - nullify!(copy(dt)) + nullify!(deepcopy(dt)) end ## Documentation for methods defined elsewhere diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index a537cca..5234864 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -53,11 +53,6 @@ melt(dt::AbstractDataTable, [id_vars], [measure_vars]; column `:variable` a Vector of Symbols with the `measure_vars` name, and with columns for each of the `id_vars`. -See also `stackdt` and `meltdt` for stacking methods that return a -view into the original DataTable. See `unstack` for converting from -long to wide format. 
- - ### Examples ```julia @@ -98,7 +93,7 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, id_var::Int; end function stack(dt::AbstractDataTable, measure_var::Int, id_vars::Vector{Int}; variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], id_vars; + stack(dt, [measure_var], id_vars; variable_name=variable_name, value_name=value_name) end function stack(dt::AbstractDataTable, measure_vars, id_vars; @@ -193,30 +188,19 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) # `rowkey` integer indicating which column to place along rows # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values - refkeycol = NullableCategoricalArray(dt[rowkey]) - valuecol = dt[value] - keycol = NullableCategoricalArray(dt[colkey]) - Nrow = length(refkeycol.pool) - Ncol = length(keycol.pool) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - payload = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], - map(Symbol, levels(keycol))) - nowarning = true - for k in 1:nrow(dt) - j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) - i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) - if i > 0 && j > 0 - if nowarning && !isnull(payload[j][i]) - warn("Duplicate entries in unstack.") - nowarning = false - end - payload[j][i] = valuecol[k] - end + anchor = dt[rowkey] + values = dt[value] + newcols = dt[colkey] + uniquenewcols = unique(newcols) + nrow = length(anchor) + ncol = length(uniquenewcols) + 1 + columns = Vector{Any}(ncol) + columns[1] = unique(anchor) + for (i,coli) in enumerate(2:ncol) + columns[coli] = values[find(newcols .== uniquenewcols[i])] end - denullify!(insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey])) + colnames = vcat(names(dt)[rowkey], Symbol.(uniquenewcols)) + DataTable(columns, colnames) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, 
index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) @@ -226,278 +210,16 @@ unstack(dt::AbstractDataTable, colkey, value) = unstack(dt, index(dt)[colkey], index(dt)[value]) function unstack(dt::AbstractDataTable, colkey::Int, value::Int) - # group on anything not a key or value: - g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]]), sort=true) - groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] - rowkey = zeros(Int, size(dt, 1)) - for i in 1:length(groupidxs) - rowkey[groupidxs[i]] = i - end - keycol = NullableCategoricalArray(dt[colkey]) - valuecol = dt[value] - dt1 = dt[g.idx[g.starts], g.cols] - Nrow = length(g) - Ncol = length(levels(keycol)) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - dt2 = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], - map(@compat(Symbol), levels(keycol))) - for k in 1:nrow(dt) - j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) - i = rowkey[k] - if i > 0 && j > 0 - dt2[j][i] = valuecol[k] + anchor = unique(dt[deleteat!(names(dt), [colkey, value])]) + groups = groupby(dt, names(anchor)) + newcolnames = unique(dt[colkey]) + newcols = DataTable(Any[typeof(dt[value])(size(anchor,1)) for n in newcolnames], Symbol.(newcolnames)) + for (i, g) in enumerate(groups) + for col in newcolnames + newcols[i, Symbol(col)] = g[g[colkey] .== col, value][1] end end - denullify!(hcat(dt1, dt2)) + hcat(anchor, newcols) end unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) - - -############################################################################## -## -## Reshaping using referencing (issue #145) -## New AbstractVector types (all read only): -## StackedVector -## RepeatedVector -## -############################################################################## - -""" -An AbstractVector{Any} that is a linear, concatenated view into -another set of AbstractVectors - -NOTE: Not exported. - -### Constructor - -```julia -StackedVector(d::AbstractVector...) 
-``` - -### Arguments - -* `d...` : one or more AbstractVectors - -### Examples - -```julia -StackedVector(Any[[1,2], [9,10], [11,12]]) # [1,2,9,10,11,12] -``` - -""" -type StackedVector <: AbstractVector{Any} - components::Vector{Any} -end - -function Base.getindex(v::StackedVector,i::Real) - lengths = [length(x)::Int for x in v.components] - cumlengths = [0; cumsum(lengths)] - j = searchsortedlast(cumlengths .+ 1, i) - if j > length(cumlengths) - error("indexing bounds error") - end - k = i - cumlengths[j] - if k < 1 || k > length(v.components[j]) - error("indexing bounds error") - end - v.components[j][k] -end - -function Base.getindex{I<:Real}(v::StackedVector,i::AbstractVector{I}) - result = similar(v.components[1], length(i)) - for idx in 1:length(i) - result[idx] = v[i[idx]] - end - result -end - -Base.size(v::StackedVector) = (length(v),) -Base.length(v::StackedVector) = sum(map(length, v.components)) -Base.ndims(v::StackedVector) = 1 -Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...) -Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims) - -CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient - - -""" -An AbstractVector that is a view into another AbstractVector with -repeated elements - -NOTE: Not exported. - -### Constructor - -```julia -RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) -``` - -### Arguments - -* `parent` : the AbstractVector that's repeated -* `inner` : the numer of times each element is repeated -* `outer` : the numer of times the whole vector is repeated after - expanded by `inner` - -`inner` and `outer` have the same meaning as similarly named arguments -to `repeat`. 
- -### Examples - -```julia -RepeatedVector([1,2], 3, 1) # [1,1,1,2,2,2] -RepeatedVector([1,2], 1, 3) # [1,2,1,2,1,2] -RepeatedVector([1,2], 2, 2) # [1,2,1,2,1,2,1,2] -``` - -""" -type RepeatedVector{T} <: AbstractVector{T} - parent::AbstractVector{T} - inner::Int - outer::Int -end - -function Base.getindex{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I}) - N = length(v.parent) - idx = Int[Base.fld1(mod1(j,v.inner*N),v.inner) for j in i] - v.parent[idx] -end -function Base.getindex{T}(v::RepeatedVector{T},i::Real) - N = length(v.parent) - idx = Base.fld1(mod1(i,v.inner*N),v.inner) - v.parent[idx] -end -Base.getindex(v::RepeatedVector,i::Range) = getindex(v, [i;]) - -Base.size(v::RepeatedVector) = (length(v),) -Base.length(v::RepeatedVector) = v.inner * v.outer * length(v.parent) -Base.ndims(v::RepeatedVector) = 1 -Base.eltype{T}(v::RepeatedVector{T}) = T -Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.outer) -Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) -Base.unique(v::RepeatedVector) = unique(v.parent) - -function CategoricalArrays.CategoricalArray(v::RepeatedVector) - res = CategoricalArrays.CategoricalArray(v.parent) - res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) - res -end - -############################################################################## -## -## stackdt() -## meltdt() -## Reshaping using referencing (issue #145), using the above vector types -## -############################################################################## - -""" -A stacked view of a DataTable (long format) - -Like `stack` and `melt`, but a view is returned rather than data -copies. 
- -```julia -stackdt(dt::AbstractDataTable, [measure_vars], [id_vars]; - variable_name::Symbol=:variable, value_name::Symbol=:value) -meltdt(dt::AbstractDataTable, [id_vars], [measure_vars]; - variable_name::Symbol=:variable, value_name::Symbol=:value) -``` - -### Arguments - -* `dt` : the wide AbstractDataTable - -* `measure_vars` : the columns to be stacked (the measurement - variables), a normal column indexing type, like a Symbol, - Vector{Symbol}, Int, etc.; for `melt`, defaults to all - variables that are not `id_vars` - -* `id_vars` : the identifier columns that are repeated during - stacking, a normal column indexing type; for `stack` defaults to all - variables that are not `measure_vars` - -### Result - -* `::DataTable` : the long-format datatable with column `:value` - holding the values of the stacked columns (`measure_vars`), with - column `:variable` a Vector of Symbols with the `measure_vars` name, - and with columns for each of the `id_vars`. - -The result is a view because the columns are special AbstractVectors -that return indexed views into the original DataTable. 
- -### Examples - -```julia -d1 = DataTable(a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12), - e = map(string, 'a':'l')) - -d1s = stackdt(d1, [:c, :d]) -d1s2 = stackdt(d1, [:c, :d], [:a]) -d1m = meltdt(d1, [:a, :b, :e]) -``` - -""" -function stackdt(dt::AbstractDataTable, measure_vars::Vector{Int}, - id_vars::Vector{Int}; variable_name::Symbol=:variable, - value_name::Symbol=:value) - N = length(measure_vars) - cnames = names(dt)[id_vars] - insert!(cnames, 1, value_name) - insert!(cnames, 1, variable_name) - DataTable(Any[RepeatedVector(_names(dt)[measure_vars], nrow(dt), 1), # variable - StackedVector(Any[dt[:,c] for c in measure_vars]), # value - [RepeatedVector(dt[:,c], 1, N) for c in id_vars]...], # id_var columns - cnames) -end -function stackdt(dt::AbstractDataTable, measure_var::Int, id_var::Int; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], [id_var]; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars, id_var::Int; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, measure_vars, [id_var]; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_var::Int, id_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], id_vars; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars, id_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, index(dt)[measure_vars], index(dt)[id_vars]; - variable_name=variable_name, value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars = numeric_vars(dt); - variable_name::Symbol=:variable, value_name::Symbol=:value) - m_inds = index(dt)[measure_vars] - stackdt(dt, m_inds, _setdiff(1:ncol(dt), m_inds); - variable_name=variable_name, 
value_name=value_name) -end - -""" -A stacked view of a DataTable (long format); see `stackdt` -""" -function meltdt(dt::AbstractDataTable, id_vars; variable_name::Symbol=:variable, - value_name::Symbol=:value) - id_inds = index(dt)[id_vars] - stackdt(dt, _setdiff(1:ncol(dt), id_inds), id_inds; - variable_name=variable_name, value_name=value_name) -end -function meltdt(dt::AbstractDataTable, id_vars, measure_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, measure_vars, id_vars; variable_name=variable_name, - value_name=value_name) -end -meltdt(dt::AbstractDataTable; variable_name::Symbol=:variable, value_name::Symbol=:value) = - stackdt(dt; variable_name=variable_name, value_name=value_name) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index eed2e0a..9d8dd37 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -103,7 +103,7 @@ type DataTable <: AbstractDataTable strnames = string.(names(colindex)) for (i,u) in enumerate(uniques) indices = find(lengths .== u) - estring[i] = "column length ($(lengths[1])) for column(s) ($(join(strnames[indices], ", ")))" + estring[i] = "column length ($(uniques[i])) for column(s) ($(join(strnames[indices], ", ")))" end throw(DimensionMismatch(join(estring, " is incompatible with "))) end @@ -638,16 +638,6 @@ function Base.insert!(dt::DataTable, col_ind::Int, item::AbstractVector, name::S dt end -# FIXME: Needed to work around a crash: JuliaLang/julia#18299 -function Base.insert!(dt::DataTable, col_ind::Int, item::NullableArray, name::Symbol) - 0 < col_ind <= ncol(dt) + 1 || throw(BoundsError()) - size(dt, 1) == length(item) || size(dt, 1) == 0 || error("number of rows does not match") - - insert!(index(dt), col_ind, name) - insert!(dt.columns, col_ind, item) - dt -end - function Base.insert!(dt::DataTable, col_ind::Int, item, name::Symbol) insert!(dt, col_ind, upgrade_scalar(dt, item), name) end diff --git a/src/deprecated.jl b/src/deprecated.jl index 
6f176a8..83912d7 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -18,5 +18,5 @@ import Base: keys, values, insert! @deprecate sub(dt::AbstractDataTable, rows) view(dt, rows) -@deprecate stackdf stackdt -@deprecate meltdf meltdt +@deprecate stackdf stack +@deprecate meltdf melt diff --git a/test/data.jl b/test/data.jl index a59b2bc..5f57b8a 100644 --- a/test/data.jl +++ b/test/data.jl @@ -169,22 +169,22 @@ module TestData d1m_named = melt(d1[[1,3,4]], :a, variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :a] - stackdt(d1, :a) - d1s = stackdt(d1, [:a, :b]) - d1s2 = stackdt(d1, [:c, :d]) - d1s3 = stackdt(d1) - d1m = meltdt(d1, [:c, :d, :e]) + stack(d1, :a) + d1s = stack(d1, [:a, :b]) + d1s2 = stack(d1, [:c, :d]) + d1s3 = stack(d1) + d1m = melt(d1, [:c, :d, :e]) @test isequal(d1s[1:12, :c], d1[:c]) @test isequal(d1s[13:24, :c], d1[:c]) @test isequal(d1s2, d1s3) @test names(d1s) == [:variable, :value, :c, :d, :e] @test isequal(d1s, d1m) - d1m = meltdt(d1[[1,3,4]], :a) + d1m = melt(d1[[1,3,4]], :a) @test names(d1m) == [:variable, :value, :a] - d1s_named = stackdt(d1, [:a, :b], variable_name=:letter, value_name=:someval) + d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test names(d1s_named) == [:letter, :someval, :c, :d, :e] - d1m_named = meltdt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) + d1m_named = melt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :c, :d, :e] d1s[:id] = [1:12; 1:12] diff --git a/test/datatable.jl b/test/datatable.jl index 95ea0a1..6769733 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -292,22 +292,19 @@ module TestDataTable # Check that reordering levels does not confuse unstack levels!(dt[1], ["XXX", "Bob", "Batman"]) #Unstack specifying a row column - dt2 = unstack(dt,:Fish, :Key, :Value) + dt2 = unstack(dt, :Fish, :Key, :Value) #Unstack without specifying a row column - dt3 = 
unstack(dt,:Key, :Value) + dt3 = unstack(dt, :Key, :Value) #The expected output - dt4 = DataTable(Fish = ["Batman", "Bob", "XXX"], - Color = NullableArray(["Grey", "Red", Nullable()]), - Mass = NullableArray(["18 g", "12 g", Nullable()])) + dt4 = DataTable(Fish = ["Bob", "Batman"], + Mass = ["12 g", "18 g"], + Color = ["Red", "Grey"] ) @test isequal(dt2, dt4) - @test isequal(dt3, denullify!(dt4[2:-1:1, :])) + @test isequal(dt3, dt4) # can't assign Nullable() to a typed column #Make sure unstack works with NULLs at the start of the value column # dt[1,:Value] = Nullable() dt2 = unstack(dt,:Fish, :Key, :Value) - #This changes the expected result - dt4[2,:Mass] = Nullable() - @test !isequal(dt2, dt4) dt = DataTable(A = 1:10, B = 'A':'J') @test !(dt[:,:] === dt) diff --git a/test/show.jl b/test/show.jl index 8bbbd78..abad44c 100644 --- a/test/show.jl +++ b/test/show.jl @@ -30,13 +30,6 @@ module TestShow dt = DataTable(A = Vector{String}(3)) - A = DataTables.StackedVector(Any[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 5, 1) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 1, 5) - show(io, A) - #Test show output for REPL and similar dt = DataTable(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()]) io = IOBuffer() From c4e218ecb2a0edd8221f6be1d4523bffe6d2be7a Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 23:26:18 -0700 Subject: [PATCH 08/43] DataFrames doensn't reshape 2d Arrays -> Vectors so don't do it here --- src/datatable/datatable.jl | 7 ++----- test/constructors.jl | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 9d8dd37..5c037d4 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -79,9 +79,6 @@ type DataTable <: AbstractDataTable elseif length(columns) != length(colindex) throw(DimensionMismatch("Number of columns ($(length(columns))) and column names ($(length(colindex))) are not 
equal")) end - # do we allow people assigning arrays to columns now? - # make sure that doesn't work - # can use !get(size(c, 2), 0) lengths = length.(columns) minlen, maxlen = extrema(lengths) if minlen == 0 && maxlen == 0 @@ -91,7 +88,7 @@ type DataTable <: AbstractDataTable if minlen == 1 && maxlen > 1 indices = find(lengths .== minlen) for i in indices - if !(typeof(columns[i]) <: AbstractArray) + if !(typeof(columns[i]) <: AbstractVector) columns[i] = fill(columns[i], maxlen) lengths[i] = maxlen end @@ -112,7 +109,7 @@ type DataTable <: AbstractDataTable if isa(c, Range) columns[i] = collect(c) elseif !isa(c, AbstractVector) - columns[i] = size(c, 2) > 1 ? reshape(c, length(c)) : [c] + columns[i] = size(c, 2) > 1 ? throw(DimensionMismatch("columns must be 1-dimensional")) : [c] end end return new(columns, colindex) diff --git a/test/constructors.jl b/test/constructors.jl index 2c080eb..c1520d4 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -64,6 +64,7 @@ module TestConstructors @testset "constructor errors" begin @test_throws DimensionMismatch DataTable(a=1, b=[]) @test_throws DimensionMismatch DataTable(Any[collect(1:10)], DataTables.Index([:A, :B])) + @test_throws DimensionMismatch DataTable(A = rand(2,2)) end @testset "column types" begin From e9542261d2d3d532af7e47a1b889ab8fcd5c1d46 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Tue, 14 Mar 2017 09:42:10 -0700 Subject: [PATCH 09/43] minor cleanup --- src/abstractdatatable/reshape.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 5234864..e8399d2 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -76,9 +76,9 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, cnames = names(dt)[id_vars] insert!(cnames, 1, value_name) insert!(cnames, 1, variable_name) - DataTable(Any[Compat.repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable + 
DataTable(Any[repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable vcat([dt[c] for c in measure_vars]...), # value - [Compat.repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns + [repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns cnames) end function stack(dt::AbstractDataTable, measure_var::Int, id_var::Int; @@ -188,14 +188,12 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) # `rowkey` integer indicating which column to place along rows # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values - anchor = dt[rowkey] values = dt[value] newcols = dt[colkey] uniquenewcols = unique(newcols) - nrow = length(anchor) ncol = length(uniquenewcols) + 1 columns = Vector{Any}(ncol) - columns[1] = unique(anchor) + columns[1] = unique(dt[rowkey]) for (i,coli) in enumerate(2:ncol) columns[coli] = values[find(newcols .== uniquenewcols[i])] end From ed8a5156debcefe6726d091a669e4f8d88f2f0e6 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:02:16 -0700 Subject: [PATCH 10/43] change (de)nullify back to copy and cleanup docstrings --- src/abstractdatatable/abstractdatatable.jl | 25 ++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 583ecba..b51c017 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -713,10 +713,15 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable """ vcat(dts::AbstractDataTable...) -Vertically concatenate `AbstractDataTables` with matching columns. +Vertically concatenate `AbstractDataTables` that have the same column names in +the same order. 
```julia -julia> dt1 = DataTable(A=1:3, B=1:3); dt2 = DataTable(A=4:6, B=4:6); dt3 = DataTable(A=7:9, B=7:9, C=7:9); +julia> dt1 = DataTable(A=1:3, B=1:3); + +julia> dt2 = DataTable(A=4:6, B=4:6); + +julia> dt3 = DataTable(A=7:9, B=7:9, C=7:9); julia> vcat(dt1, dt2) 6×2 DataTables.DataTable @@ -778,6 +783,9 @@ end Convert columns with a `Nullable` element type without any null values to a non-`Nullable` equivalent array type. The table `dt` is modified in place. +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. + # Examples ```jldoctest @@ -822,6 +830,9 @@ end Return a copy of `dt` where columns with a `Nullable` element type without any null values have been converted to a non-`Nullable` equivalent array type. +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. If no aliasing is desired, use `denullify!(deepcopy(dt))`. + # Examples ```jldoctest @@ -851,13 +862,16 @@ julia> eltypes(dt) See also [`denullify!`] & [`nullify`](@ref). """ -denullify(dt::AbstractDataTable) = denullify!(deepcopy(dt)) +denullify(dt::AbstractDataTable) = denullify!(copy(dt)) """ nullify!(dt::AbstractDataTable) Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place. +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. + # Examples ```jldoctest @@ -902,6 +916,9 @@ nullify(x::AbstractCategoricalArray) = convert(NullableCategoricalArray, x) Return a copy of `dt` with all columns converted to nullable arrays. +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. If no aliasing is desired, use `nullify!(deepcopy(dt))`. + # Examples ```jldoctest @@ -932,7 +949,7 @@ julia> eltypes(dt) See also [`nullify!`](@ref) & [`denullify`](@ref). 
""" function nullify(dt::AbstractDataTable) - nullify!(deepcopy(dt)) + nullify!(copy(dt)) end ## Documentation for methods defined elsewhere From 1636a0cfbd415161de48eb0ccc3e68684296057a Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:02:50 -0700 Subject: [PATCH 11/43] NullableArrays.unsafe_get -> compat(unsafe_get) --- src/abstractdatatable/io.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 03174d9..6f3222b 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -45,7 +45,7 @@ function printtable(io::IO, if !isnull(dt[j][i]) if ! (etypes[j] <: Real) print(io, quotemark) - x = isa(dt[i, j], Nullable) ? NullableArrays.unsafe_get(dt[i, j]) : dt[i, j] + x = isa(dt[i, j], Nullable) ? @compat(unsafe_get(dt[i, j])) : dt[i, j] escapedprint(io, x, quotestr) print(io, quotemark) else @@ -168,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = isa(cell, Nullable) ? NullableArrays.unsafe_get(cell) : cell + content = isa(cell, Nullable) ? 
@compat(unsafe_get(cell)) : cell if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else From 91233d325967c818d812225aa67ada53fdf4b0cb Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:03:26 -0700 Subject: [PATCH 12/43] default to NullableArray for joins that may introduce missing data --- src/abstractdatatable/join.jl | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index ede5c77..c8292cc 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -12,9 +12,6 @@ similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Varar similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray(T, dims) -similar_nullable(dt::AbstractDataTable, dims::Int) = - DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) - # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} dtl::DT1 @@ -44,7 +41,7 @@ Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, +function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) @@ -77,9 +74,9 @@ function compose_joined_table(joiner::DataTableJoiner, end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) resizelen = length(all_orig_right_ixs)+length(leftonly_ixs) - rightcols = Any[length(all_orig_right_ixs) >= resizelen ? 
- resize!(col[all_orig_right_ixs], resizelen)[right_perm] : - copy!(similar_nullable(col[all_orig_right_ixs], resizelen), col[all_orig_right_ixs])[right_perm] + rightcols = Any[kind == :inner ? + col[all_orig_right_ixs][right_perm] : + copy!(similar_nullable(col, resizelen), col[all_orig_right_ixs])[right_perm] for col in columns(dtr_noon)] right_dt = DataTable(rightcols, names(dtr_noon)) # merge left and right parts of the joined table @@ -246,22 +243,22 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, group_rows(joiner.dtl_on), true, true, true, false) - compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) + compose_joined_table(joiner, kind, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) elseif kind == :outer - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) 
elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) From 7462612c50bcc436232c2efac2c8275ec2a9b59d Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:04:04 -0700 Subject: [PATCH 13/43] align comments --- src/abstractdatatable/reshape.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index e8399d2..381825b 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -76,9 +76,9 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, cnames = names(dt)[id_vars] insert!(cnames, 1, value_name) insert!(cnames, 1, variable_name) - DataTable(Any[repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable - vcat([dt[c] for c in measure_vars]...), # value - [repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns + DataTable(Any[repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable + vcat([dt[c] for c in measure_vars]...), # value + [repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns cnames) end function stack(dt::AbstractDataTable, measure_var::Int, id_var::Int; From 9b65533073e6b2feefd53cd69c6ee8ef7a71b3e0 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:08:10 -0700 Subject: [PATCH 14/43] lots of edits clarify error messages, fix spacing, address case of length(::Symbol) and length(::String) not giving the desired output (Vector length), and unify constructors in accepting AbstractVector{Symbol} for colnames --- src/datatable/datatable.jl | 70 ++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 5c037d4..28f01b2 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -77,31 +77,25 @@ type DataTable <: AbstractDataTable if length(columns) == length(colindex) == 0 return new(Vector{Any}(0), Index()) elseif 
length(columns) != length(colindex) - throw(DimensionMismatch("Number of columns ($(length(columns))) and column names ($(length(colindex))) are not equal")) + throw(DimensionMismatch("Number of columns ($(length(columns))) and number of column names ($(length(colindex))) are not equal")) end - lengths = length.(columns) + lengths = [isa(col, AbstractArray) ? length(col) : 1 for col in columns] minlen, maxlen = extrema(lengths) if minlen == 0 && maxlen == 0 return new(columns, colindex) elseif minlen != maxlen # recycle scalars - if minlen == 1 && maxlen > 1 - indices = find(lengths .== minlen) - for i in indices - if !(typeof(columns[i]) <: AbstractVector) - columns[i] = fill(columns[i], maxlen) - lengths[i] = maxlen - end - end + for i in 1:length(columns) + typeof(columns[i]) <: AbstractArray && continue + columns[i] = fill(columns[i], maxlen) + lengths[i] = maxlen end - uniques = unique(lengths) - if length(uniques) != 1 - estring = Vector{String}(length(uniques)) + uls = unique(lengths) + if length(uls) != 1 + # estring = Vector{String}(length(uniques)) strnames = string.(names(colindex)) - for (i,u) in enumerate(uniques) - indices = find(lengths .== u) - estring[i] = "column length ($(uniques[i])) for column(s) ($(join(strnames[indices], ", ")))" - end + estring = ["column length ($(uls[i])) for column(s) ($(join(strnames[find(uls .== u)], ", ")))" + for (i,u) in enumerate(uls)] throw(DimensionMismatch(join(estring, " is incompatible with "))) end end @@ -109,7 +103,7 @@ type DataTable <: AbstractDataTable if isa(c, Range) columns[i] = collect(c) elseif !isa(c, AbstractVector) - columns[i] = size(c, 2) > 1 ? throw(DimensionMismatch("columns must be 1-dimensional")) : [c] + columns[i] = size(c, 2) > 1 ? throw(DimensionMismatch("columns must be 1-dimensional")) : [c] end end return new(columns, colindex) @@ -120,21 +114,16 @@ function DataTable(; kwargs...) 
if length(kwargs) == 0 return DataTable(Any[], Index()) end - colnames = Vector{Symbol}(length(kwargs)) - columns = Vector{Any}(length(kwargs)) - for (i,(k,v)) in enumerate(kwargs) - colnames[i] = Symbol(k) - columns[i] = v - end + colnames = [Symbol(k) for (k,v) in kwargs] + columns = Any[v for (k,v) in kwargs] DataTable(columns, Index(colnames)) end function DataTable(columns::AbstractVector, - cnames::Vector{Symbol} = gennames(length(columns))) - return DataTable(convert(Vector{Any}, columns), Index(cnames)) + cnames::AbstractVector{Symbol} = gennames(length(columns))) + return DataTable(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames))) end - # Initialize empty DataTable objects of arbitrary size function DataTable(t::Type, nrows::Integer, ncols::Integer) columns = Vector{Any}(ncols) @@ -146,40 +135,41 @@ function DataTable(t::Type, nrows::Integer, ncols::Integer) end # Initialize an empty DataTable with specific eltypes and names -function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - T = column_eltypes[j] - columns[j] = T <: Nullable ? NullableArray{eltype(T)}(nrows) : Vector{T}(nrows) + colT = column_eltypes[j] + columns[j] = colT <: Nullable ? 
NullableArray{eltype(colT)}(nrows) : Vector{colT}(nrows) end - return DataTable(columns, Index(cnames)) + return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end + # Initialize an empty DataTable with specific eltypes and names # and whether a nominal array should be created -function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, - nominal::Vector{Bool}, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, + nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - T = column_eltypes[j] + colT = column_eltypes[j] if nominal[j] - columns[j] = T <: Nullable ? NullableCategoricalArray{T}(nrows) : CategoricalVector{T}(nrows) + columns[j] = colT <: Nullable ? NullableCategoricalArray{colT}(nrows) : CategoricalVector{colT}(nrows) else - columns[j] = T <: Nullable ? NullableArray{T}(nrows) : Vector{T}(nrows) + columns[j] = colT <: Nullable ? NullableArray{colT}(nrows) : Vector{colT}(nrows) end end - return DataTable(columns, Index(cnames)) + return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end # Initialize an empty DataTable with specific eltypes -function DataTable(column_eltypes::Vector{DataType}, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) cnames = gennames(p) for j in 1:p - T = column_eltypes[j] - columns[j] = T <: Nullable ? NullableArray{T}(nrows) : Vector{T}(nrows) + colT = column_eltypes[j] + columns[j] = colT <: Nullable ? 
NullableArray{colT}(nrows) : Vector{colT}(nrows) end return DataTable(columns, Index(cnames)) end From b643ff8f16795bda1982847f4bce5bd639876dc2 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:12:29 -0700 Subject: [PATCH 15/43] tests and no need for compat --- src/abstractdatatable/io.jl | 4 ++-- test/cat.jl | 6 ++++-- test/constructors.jl | 2 +- test/join.jl | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 6f3222b..6af518b 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -45,7 +45,7 @@ function printtable(io::IO, if !isnull(dt[j][i]) if ! (etypes[j] <: Real) print(io, quotemark) - x = isa(dt[i, j], Nullable) ? @compat(unsafe_get(dt[i, j])) : dt[i, j] + x = isa(dt[i, j], Nullable) ? unsafe_get(dt[i, j]) : dt[i, j] escapedprint(io, x, quotestr) print(io, quotemark) else @@ -168,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = isa(cell, Nullable) ? @compat(unsafe_get(cell)) : cell + content = isa(cell, Nullable) ? 
unsafe_get(cell) : cell if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else diff --git a/test/cat.jl b/test/cat.jl index f26b8e7..a5b41b5 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -76,9 +76,9 @@ module TestCat @test vcat(null_dt, null_dt) == DataTable() @test vcat(null_dt, dt) == dt @test vcat(dt, null_dt) == dt - @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt)), (Float64, Float64, Int))) + @test eltypes(vcat(dt, dt)) == [Float64, Float64, Int] @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2)) - @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt, dt)), (Float64, Float64, Int))) + @test eltypes(vcat(dt, dt, dt)) == [Float64, Float64, Int] @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2)) alt_dt = deepcopy(dt) @@ -103,7 +103,9 @@ module TestCat dtc = DataTable(a = NullableArray([2, 3, 4])) dtd = DataTable(Any[2:4], [:a]) dtab = vcat(dta, dtb) + @test isa(dtab[1], CategoricalArray) dtac = vcat(nullify(dta), dtc) + @test isa(dtac[1], NullableCategoricalArray) @test isequal(dtab[:a], [1, 2, 2, 2, 3, 4]) @test isa(dtab[:a], CategoricalVector{Int}) dc = vcat(dtd, dtc) diff --git a/test/constructors.jl b/test/constructors.jl index c1520d4..cd3589c 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -39,8 +39,8 @@ module TestConstructors dt = DataTable([Nullable{Int}, Nullable{Float64}], [:x1, :x2], 2) @test size(dt) == (2, 2) @test eltypes(dt) == [Nullable{Int}, Nullable{Float64}] - @test isequal(dt, DataTable([Nullable{Int}, Nullable{Float64}], 2)) + @test all(isnull, (dt[:x1], dt[:x2])) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) diff --git a/test/join.jl b/test/join.jl index 3838cd8..809161f 100644 --- a/test/join.jl +++ b/test/join.jl @@ -108,7 +108,7 @@ module TestJoin Quantity = [3, 3, 2, 4]) @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = Nullable{String}["A", "B", "C", "A"], Quantity = [3, 3, 2, 4], - Mass = [1.5, 
2.2, 1.1, 1.5]) + Mass = Nullable{Float64}[1.5, 2.2, 1.1, 1.5]) # Test that join works when mixing Array and NullableArray (#1151) dt = DataTable([collect(1:10), collect(2:11)], [:x, :y]) From 4c6845225e044c9cfec4d5ae6177c20bf8a8f19a Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:20:02 -0700 Subject: [PATCH 16/43] spacing mistakes --- src/datatable/datatable.jl | 2 +- test/iteration.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 28f01b2..6c53a9a 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -148,7 +148,7 @@ end # Initialize an empty DataTable with specific eltypes and names # and whether a nominal array should be created function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, - nominal::Vector{Bool}, nrows::Integer) + nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p diff --git a/test/iteration.jl b/test/iteration.jl index 7686428..afa93b2 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -16,7 +16,7 @@ module TestIteration end for col in eachcol(dt) - @test isa(col, Tuple{Symbol,Vector{Int}}) + @test isa(col, Tuple{Symbol, Vector{Int}}) end @test isequal(map(x -> minimum(convert(Array, x)), eachrow(dt)), [1,2]) From 731068103755f77360f99444cb1ee3e6d0d550b9 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 14:35:17 -0700 Subject: [PATCH 17/43] throw errors on 1-d matrices and change confusing variable name --- src/datatable/datatable.jl | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 6c53a9a..edf8144 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -92,7 +92,6 @@ type DataTable <: AbstractDataTable end uls = unique(lengths) if length(uls) != 1 - # estring = 
Vector{String}(length(uniques)) strnames = string.(names(colindex)) estring = ["column length ($(uls[i])) for column(s) ($(join(strnames[find(uls .== u)], ", ")))" for (i,u) in enumerate(uls)] @@ -103,7 +102,9 @@ type DataTable <: AbstractDataTable if isa(c, Range) columns[i] = collect(c) elseif !isa(c, AbstractVector) - columns[i] = size(c, 2) > 1 ? throw(DimensionMismatch("columns must be 1-dimensional")) : [c] + throw(DimensionMismatch("columns must be 1-dimensional")) + else + columns[i] = c end end return new(columns, colindex) @@ -139,8 +140,8 @@ function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractV p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - colT = column_eltypes[j] - columns[j] = colT <: Nullable ? NullableArray{eltype(colT)}(nrows) : Vector{colT}(nrows) + elty = column_eltypes[j] + columns[j] = elty <: Nullable ? NullableArray{eltype(elty)}(nrows) : Vector{elty}(nrows) end return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end @@ -152,11 +153,11 @@ function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractV p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - colT = column_eltypes[j] + elty = column_eltypes[j] if nominal[j] - columns[j] = colT <: Nullable ? NullableCategoricalArray{colT}(nrows) : CategoricalVector{colT}(nrows) + columns[j] = elty <: Nullable ? NullableCategoricalArray{elty}(nrows) : CategoricalVector{elty}(nrows) else - columns[j] = colT <: Nullable ? NullableArray{colT}(nrows) : Vector{colT}(nrows) + columns[j] = elty <: Nullable ? NullableArray{elty}(nrows) : Vector{elty}(nrows) end end return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) @@ -168,8 +169,8 @@ function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, nrows::Integer) columns = Vector{Any}(p) cnames = gennames(p) for j in 1:p - colT = column_eltypes[j] - columns[j] = colT <: Nullable ? 
NullableArray{colT}(nrows) : Vector{colT}(nrows) + elty = column_eltypes[j] + columns[j] = elty <: Nullable ? NullableArray{elty}(nrows) : Vector{elty}(nrows) end return DataTable(columns, Index(cnames)) end From de280ba118ce1b177fb12bb82a2e7ce9ae961cdf Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 15:21:04 -0700 Subject: [PATCH 18/43] add back check to differentiate scalars from AbstractArrays --- src/datatable/datatable.jl | 6 +++++- test/constructors.jl | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index edf8144..96cf59a 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -102,7 +102,11 @@ type DataTable <: AbstractDataTable if isa(c, Range) columns[i] = collect(c) elseif !isa(c, AbstractVector) - throw(DimensionMismatch("columns must be 1-dimensional")) + if isa(c, AbstractArray) + throw(DimensionMismatch("columns must be 1-dimensional")) + else + columns[i] = [c] + end else columns[i] = c end diff --git a/test/constructors.jl b/test/constructors.jl index cd3589c..4053903 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -65,6 +65,7 @@ module TestConstructors @test_throws DimensionMismatch DataTable(a=1, b=[]) @test_throws DimensionMismatch DataTable(Any[collect(1:10)], DataTables.Index([:A, :B])) @test_throws DimensionMismatch DataTable(A = rand(2,2)) + @test_throws DimensionMismatch DataTable(A = rand(2,1)) end @testset "column types" begin From 88b20cace0423bbfd18396e6a6c6008be75a99c3 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Mar 2017 18:16:32 -0700 Subject: [PATCH 19/43] save work --- src/abstractdatatable/join.jl | 78 +++++++++++++++++++++-------------- test/join.jl | 22 ++++++++++ 2 files changed, 68 insertions(+), 32 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index c8292cc..aa9bd6e 100644 --- a/src/abstractdatatable/join.jl +++ 
b/src/abstractdatatable/join.jl @@ -41,31 +41,29 @@ Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, +function compose_joined_table(joiner::DataTableJoiner, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) # compose left half of the result taking all left columns all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) - if length(leftonly_ixs) > 0 + + lil = length(left_ixs) + loil = length(leftonly_ixs) + ril = length(right_ixs) + roil = length(rightonly_ixs) + + if loil > 0 # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows # preserving the original rows order - all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs)) + all_orig_left_ixs = similar(left_ixs.orig, lil + loil) @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig else # the result contains only the left rows that are matched to right rows (left_ixs) all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere end - ril = length(right_ixs) - loil = length(leftonly_ixs) - roil = length(rightonly_ixs) - left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil) - for col in columns(joiner.dtl)], - names(joiner.dtl)) - # compose right half of the result taking all right columns excluding on - dtr_noon = without(joiner.dtr, joiner.on_cols) # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) if length(leftonly_ixs) > 0 @@ -73,15 +71,30 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, right_perm[vcat(right_ixs.join, 
leftonly_ixs.join)] = right_perm[1:ril+loil] end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) - resizelen = length(all_orig_right_ixs)+length(leftonly_ixs) - rightcols = Any[kind == :inner ? - col[all_orig_right_ixs][right_perm] : - copy!(similar_nullable(col, resizelen), col[all_orig_right_ixs])[right_perm] - for col in columns(dtr_noon)] - right_dt = DataTable(rightcols, names(dtr_noon)) - # merge left and right parts of the joined table - res = hcat!(left_dt, right_dt) + # compose right half of the result taking all right columns excluding on + dtr_noon = without(joiner.dtr, joiner.on_cols) + + laoli = length(all_orig_left_ixs) + laori = length(all_orig_right_ixs) + @assert laoli + roil == laori + loil + numrows = length(all_orig_left_ixs) + roil + numcols = ncol(joiner.dtl) + ncol(dtr_noon) + + # if either size is smaller, then it's null + leftnull = laoli < laoli + roil + rightnull = laori < laori + loil + dtcols = Vector{Any}(numcols) + for (i,col) in enumerate(columns(joiner.dtl)) + dtcols[i] = leftnull ? copy!(similar_nullable(col, numrows), col[all_orig_left_ixs]) : + col[all_orig_left_ixs] + end + for (i,col) in enumerate(columns(dtr_noon)) + dtcols[i+ncol(joiner.dtl)] = rightnull ? copy!(similar_nullable(col, numrows), col[all_orig_right_ixs])[right_perm] : + col[all_orig_right_ixs][right_perm] + end + colnames = vcat(names(joiner.dtl), names(dtr_noon)) + res = DataTable(dtcols, Index(colnames)) if length(rightonly_ixs.join) > 0 # some left rows are nulls, so the values of the "on" columns # need to be taken from the right @@ -207,6 +220,8 @@ join(dt1::AbstractDataTable, - `:cross` : a full Cartesian product of the key combinations; every row of `dt1` is matched with every row of `dt2` +For the three join operations that may introduce missing values, `:outer`, `:left`, +and `:right`, Null values are filled in where needed to complete joins. 
### Result @@ -243,22 +258,21 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right - right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, - group_rows(joiner.dtl_on), - true, true, true, false) - compose_joined_table(joiner, kind, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) + compose_joined_table(joiner, update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false)[[3, 4, 1, 2]]...) elseif kind == :outer - compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) 
elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) diff --git a/test/join.jl b/test/join.jl index 809161f..5a09561 100644 --- a/test/join.jl +++ b/test/join.jl @@ -117,4 +117,26 @@ module TestJoin DataTable([collect(1:10), collect(2:11), collect(3:12)], [:x, :y, :z]) @test join(dtnull, dt, on = :x) == DataTable([collect(1:10), collect(3:12), collect(2:11)], [:x, :z, :y]) + + @testset "missingness" begin + small = DataTable(fruit = [:banana, :plantain, :melon], + vegetable = [:artichoke, :leek, :pepper]) + large = DataTable(fruit = [:banana, :plantain, :melon, :raspberry], + vegetable = [:artichoke, :collards, :leek, :pepper]) + + @test join(small, large, on=:fruit, kind=:left) == DataTable(fruit = [:banana, :plantain, :melon], + vegetable = [:artichoke, :leek, :pepper], + vegetable_1 = [:artichoke, :collards, :leek]) + @test join(small, large, on=:fruit, kind=:right) == DataTable(fruit = [:banana, :plantain, :melon], + vegetable = [:artichoke, :leek, :pepper], + vegetable_1 = [:artichoke, :collards, :leek]) + @test join(small, large, on=:fruit, kind=:outer) + + @test join(small, large, on=:vegetable, kind=:left) + @test join(small, large, on=:vegetable, kind=:right) + @test join(small, large, on=:vegetable, kind=:outer) + + @test join(small, large, on=[:fruit, :vegetable], kind=:outer) + @test join(small, large, on=[:fruit, :vegetable], kind=:left) + @test join(small, large, on=[:fruit, :vegetable], kind=:right) end From be1cacdd30dee452de5714821918e00a47058e66 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 14:45:31 -0700 Subject: [PATCH 20/43] save progress, switch to test master --- src/abstractdatatable/join.jl | 59 +++++++++++++++-------------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index aa9bd6e..6f2c292 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -41,16 +41,16 @@ 
Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, +function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) # compose left half of the result taking all left columns all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) + ril = length(right_ixs) lil = length(left_ixs) loil = length(leftonly_ixs) - ril = length(right_ixs) roil = length(rightonly_ixs) if loil > 0 @@ -63,7 +63,6 @@ function compose_joined_table(joiner::DataTableJoiner, # the result contains only the left rows that are matched to right rows (left_ixs) all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere end - # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) if length(leftonly_ixs) > 0 @@ -75,26 +74,20 @@ function compose_joined_table(joiner::DataTableJoiner, # compose right half of the result taking all right columns excluding on dtr_noon = without(joiner.dtr, joiner.on_cols) - laoli = length(all_orig_left_ixs) - laori = length(all_orig_right_ixs) - @assert laoli + roil == laori + loil - numrows = length(all_orig_left_ixs) + roil - numcols = ncol(joiner.dtl) + ncol(dtr_noon) - - # if either size is smaller, then it's null - leftnull = laoli < laoli + roil - rightnull = laori < laori + loil - dtcols = Vector{Any}(numcols) - for (i,col) in enumerate(columns(joiner.dtl)) - dtcols[i] = leftnull ? 
copy!(similar_nullable(col, numrows), col[all_orig_left_ixs]) : - col[all_orig_left_ixs] + nrow = length(all_orig_left_ixs) + roil + @assert nrow == length(all_orig_right_ixs) + loil + ncl = ncol(joiner.dtl) + cols = Vector{Any}(ncl + ncol(dtr_noon)) + for (i, col) in enumerate(columns(joiner.dtl)) + cols[i] = kind == :inner ? col[all_orig_left_ixs] : + copy!(similar_nullable(col, nrow), col[all_orig_left_ixs]) end - for (i,col) in enumerate(columns(dtr_noon)) - dtcols[i+ncol(joiner.dtl)] = rightnull ? copy!(similar_nullable(col, numrows), col[all_orig_right_ixs])[right_perm] : - col[all_orig_right_ixs][right_perm] + for (i, col) in enumerate(columns(dtr_noon)) + cols[i+ncl] = kind == :inner ? col[all_orig_right_ixs] : + copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] end - colnames = vcat(names(joiner.dtl), names(dtr_noon)) - res = DataTable(dtcols, Index(colnames)) + res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) + if length(rightonly_ixs.join) > 0 # some left rows are nulls, so the values of the "on" columns # need to be taken from the right @@ -258,21 +251,21 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right - compose_joined_table(joiner, update_row_maps!(joiner.dtr_on, joiner.dtl_on, - group_rows(joiner.dtl_on), - true, true, true, false)[[3, 4, 1, 2]]...) 
+ compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false)[[3, 4, 1, 2]]...) elseif kind == :outer - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) From 19ffb58d8745b8045be7e4e8be0e2374ac6b290c Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 15:50:38 -0700 Subject: [PATCH 21/43] join is ready and tests in place. right join still broken --- test/data.jl | 10 +++---- test/join.jl | 76 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/test/data.jl b/test/data.jl index 5f57b8a..ca54f26 100644 --- a/test/data.jl +++ b/test/data.jl @@ -218,8 +218,8 @@ module TestData m1 = join(dt1, dt2, on = :a, kind=:inner) @test isequal(m1[:a], dt1[:a][dt1[:a] .<= 5]) # preserves dt1 order m2 = join(dt1, dt2, on = :a, kind = :outer) - @test isequal(m2[:a], dt1[:a]) # preserves dt1 order - @test isequal(m2[:b], dt1[:b]) # preserves dt1 order + @test isequal(m2[:a], NullableArray(dt1[:a])) # preserves dt1 order + @test isequal(m2[:b], NullableArray(dt1[:b])) # preserves dt1 order # TODO: Re-enable m2 = join(dt1, dt2, on = :a, kind = :outer) # @test isequal(m2[:b2], @@ -240,13 +240,13 @@ module TestData @test m1[:a] == [1, 2] m2 = join(dt1, dt2, on = :a, kind = :left) - @test m2[:a] == [1, 2, 3] + @test isequal(m2[:a], NullableArray([1, 2, 3])) m3 = join(dt1, dt2, on = :a, kind = :right) - @test m3[:a] == [1, 2, 4] + @test isequal(m3[:a], NullableArray([1, 2, 4])) m4 = join(dt1, dt2, on = :a, kind = :outer) - @test m4[:a] == [1, 2, 3, 4] + @test isequal(m4[:a], NullableArray([1, 2, 3, 4])) # test with nulls (issue 
#185) dt1 = DataTable() diff --git a/test/join.jl b/test/join.jl index 5a09561..23fc836 100644 --- a/test/join.jl +++ b/test/join.jl @@ -2,8 +2,8 @@ module TestJoin using Base.Test using DataTables - name = DataTable(ID = [1, 2, 3], Name = NullableArray(["John Doe", "Jane Doe", "Joe Blogs"])) - job = DataTable(ID = [1, 2, 2, 4], Job = NullableArray(["Lawyer", "Doctor", "Florist", "Farmer"])) + name = DataTable(ID = NullableArray([1, 2, 3]), Name = NullableArray(["John Doe", "Jane Doe", "Joe Blogs"])) + job = DataTable(ID = NullableArray([1, 2, 2, 4]), Job = NullableArray(["Lawyer", "Doctor", "Florist", "Farmer"])) # Join on symbols or vectors of symbols join(name, job, on = :ID) @@ -13,7 +13,7 @@ module TestJoin #@test_throws join(name, job) # Test output of various join types - outer = DataTable(ID = [1, 2, 2, 3, 4], + outer = DataTable(ID = NullableArray([1, 2, 2, 3, 4]), Name = NullableArray(["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), Job = NullableArray(["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) @@ -70,7 +70,7 @@ module TestJoin @test_throws ArgumentError join(dt1, dt2, on = :A, kind = :cross) # test empty inputs - simple_dt(len::Int, col=:A) = (dt = DataTable(); dt[col]=collect(1:len); dt) + simple_dt(len::Int, col=:A) = (dt = DataTable(); dt[col]=NullableArray(collect(1:len)); dt) @test isequal(join(simple_dt(0), simple_dt(0), on = :A, kind = :left), simple_dt(0)) @test isequal(join(simple_dt(2), simple_dt(0), on = :A, kind = :left), simple_dt(2)) @test isequal(join(simple_dt(0), simple_dt(2), on = :A, kind = :left), simple_dt(0)) @@ -107,7 +107,7 @@ module TestJoin dt2 = DataTable(Name = Nullable{String}["A", "B", "C", "A"], Quantity = [3, 3, 2, 4]) @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = Nullable{String}["A", "B", "C", "A"], - Quantity = [3, 3, 2, 4], + Quantity = Nullable{Int}[3, 3, 2, 4], Mass = Nullable{Float64}[1.5, 2.2, 1.1, 1.5]) # Test that join works when mixing Array and NullableArray (#1151) 
@@ -118,25 +118,49 @@ module TestJoin @test join(dtnull, dt, on = :x) == DataTable([collect(1:10), collect(3:12), collect(2:11)], [:x, :z, :y]) - @testset "missingness" begin - small = DataTable(fruit = [:banana, :plantain, :melon], - vegetable = [:artichoke, :leek, :pepper]) - large = DataTable(fruit = [:banana, :plantain, :melon, :raspberry], - vegetable = [:artichoke, :collards, :leek, :pepper]) - - @test join(small, large, on=:fruit, kind=:left) == DataTable(fruit = [:banana, :plantain, :melon], - vegetable = [:artichoke, :leek, :pepper], - vegetable_1 = [:artichoke, :collards, :leek]) - @test join(small, large, on=:fruit, kind=:right) == DataTable(fruit = [:banana, :plantain, :melon], - vegetable = [:artichoke, :leek, :pepper], - vegetable_1 = [:artichoke, :collards, :leek]) - @test join(small, large, on=:fruit, kind=:outer) - - @test join(small, large, on=:vegetable, kind=:left) - @test join(small, large, on=:vegetable, kind=:right) - @test join(small, large, on=:vegetable, kind=:outer) - - @test join(small, large, on=[:fruit, :vegetable], kind=:outer) - @test join(small, large, on=[:fruit, :vegetable], kind=:left) - @test join(small, large, on=[:fruit, :vegetable], kind=:right) + @testset "complete set of joins" begin + small = DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0]) + large = DataTable(id = [0, 1, 2, 3, 4], fid = [0.0, 1.0, 2.0, 3.0, 4.0]) + N = Nullable() + + @test join(small, large, kind=:cross) == DataTable(id = repeat([1, 3, 5], inner=5), + fid = repeat([1.0, 3.0, 5.0], inner=5), + id_1 = repeat([0, 1, 2, 3, 4], outer=3), + fid_1 = repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)) + # id + @test join(small, large, on=:id, kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0], fid_1 = [1.0, 3.0]) + @test join(small, large, on=:id, kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0], fid_1 = [1.0, 3.0, N])) + # FIXME + # @test join(small, large, on=:id, kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], + # fid = [1.0, 3.0, 
N, N, N], + # fid_1 = [1.0, 3.0, 0.0, 2.0, 4.0]) + @test join(small, large, on=:id, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], + fid = [1.0, 3.0, 5.0, N, N, N], + fid_1 = [1.0, 3.0, N, 0.0, 2.0, 4.0])) + @test join(small, large, on=:id, kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=:id, kind=:anti) == DataTable(id = 5, fid = 5.0) + + # fid + @test join(small, large, on=:fid, kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0], id_1 = [1, 3]) + @test join(small, large, on=:fid, kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0], id_1 = [1, 3, N])) + # FIXME + # @test join(small, large, on=:fid, kind=:right) == nullify!(DataTable(id = [1, 3, N, N, N], + # fid = [1.0, 3.0, 0.0, 2.0, 4.0], + # id_1 = [1, 3, 0, 2, 4])) + @test join(small, large, on=:fid, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, N, N, N], + fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0], + id_1 = [1, 3, N, 0, 2, 4])) + @test join(small, large, on=:fid, kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=:fid, kind=:anti) == DataTable(id = 5, fid = 5.0) + + # both + @test join(small, large, on=[:id, :fid], kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0])) + # FIXME + # @test join(small, large, on=[:id, :fid], kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], fid = [1.0, 3.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=[:id, :fid], kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], + fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=[:id, :fid], kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:anti) == DataTable(id = 5, fid = 5.0) + end end From 3f2cd63468482f564d86186e6233dd50a20e3139 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 17:11:50 -0700 Subject: 
[PATCH 22/43] fix right join --- src/abstractdatatable/join.jl | 8 +++--- test/join.jl | 46 +++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 6f2c292..68e4cfe 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -76,14 +76,14 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, nrow = length(all_orig_left_ixs) + roil @assert nrow == length(all_orig_right_ixs) + loil - ncl = ncol(joiner.dtl) - cols = Vector{Any}(ncl + ncol(dtr_noon)) + ncleft = ncol(joiner.dtl) + cols = Vector{Any}(ncleft + ncol(dtr_noon)) for (i, col) in enumerate(columns(joiner.dtl)) cols[i] = kind == :inner ? col[all_orig_left_ixs] : copy!(similar_nullable(col, nrow), col[all_orig_left_ixs]) end for (i, col) in enumerate(columns(dtr_noon)) - cols[i+ncl] = kind == :inner ? col[all_orig_right_ixs] : + cols[i+ncleft] = kind == :inner ? col[all_orig_right_ixs] : copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] end res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) @@ -93,7 +93,7 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.on_cols) # fix the result of the rightjoin by taking the nonnull values from the right table - res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] + res[on_col][end-length(rightonly_ixs.orig)+1:end] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] end end return res diff --git a/test/join.jl b/test/join.jl index 23fc836..63fc38a 100644 --- a/test/join.jl +++ b/test/join.jl @@ -128,12 +128,15 @@ module TestJoin id_1 = repeat([0, 1, 2, 3, 4], outer=3), fid_1 = repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)) # id - @test join(small, large, on=:id, kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0], fid_1 = [1.0, 3.0]) - @test join(small, large, on=:id, 
kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0], fid_1 = [1.0, 3.0, N])) - # FIXME - # @test join(small, large, on=:id, kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], - # fid = [1.0, 3.0, N, N, N], - # fid_1 = [1.0, 3.0, 0.0, 2.0, 4.0]) + @test join(small, large, on=:id, kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0], + fid_1 = [1.0, 3.0]) + @test join(small, large, on=:id, kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0], + fid_1 = [1.0, 3.0, N])) + @test join(small, large, on=:id, kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], + fid = [1.0, 3.0, N, N, N], + fid_1 = [1.0, 3.0, 0.0, 2.0, 4.0])) @test join(small, large, on=:id, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], fid = [1.0, 3.0, 5.0, N, N, N], fid_1 = [1.0, 3.0, N, 0.0, 2.0, 4.0])) @@ -141,12 +144,15 @@ module TestJoin @test join(small, large, on=:id, kind=:anti) == DataTable(id = 5, fid = 5.0) # fid - @test join(small, large, on=:fid, kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0], id_1 = [1, 3]) - @test join(small, large, on=:fid, kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0], id_1 = [1, 3, N])) - # FIXME - # @test join(small, large, on=:fid, kind=:right) == nullify!(DataTable(id = [1, 3, N, N, N], - # fid = [1.0, 3.0, 0.0, 2.0, 4.0], - # id_1 = [1, 3, 0, 2, 4])) + @test join(small, large, on=:fid, kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0], + id_1 = [1, 3]) + @test join(small, large, on=:fid, kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0], + id_1 = [1, 3, N])) + @test join(small, large, on=:fid, kind=:right) == nullify!(DataTable(id = [1, 3, N, N, N], + fid = [1.0, 3.0, 0.0, 2.0, 4.0], + id_1 = [1, 3, 0, 2, 4])) @test join(small, large, on=:fid, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, N, N, N], fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0], id_1 = [1, 3, N, 0, 2, 4])) @@ -154,13 +160,17 @@ module TestJoin @test join(small, large, 
on=:fid, kind=:anti) == DataTable(id = 5, fid = 5.0) # both - @test join(small, large, on=[:id, :fid], kind=:inner) == DataTable(id = [1, 3], fid = [1.0, 3.0]) - @test join(small, large, on=[:id, :fid], kind=:left) == nullify!(DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0])) - # FIXME - # @test join(small, large, on=[:id, :fid], kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], fid = [1.0, 3.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=[:id, :fid], kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0])) + @test join(small, large, on=[:id, :fid], kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], + fid = [1.0, 3.0, 0.0, 2.0, 4.0])) @test join(small, large, on=[:id, :fid], kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0])) - @test join(small, large, on=[:id, :fid], kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) - @test join(small, large, on=[:id, :fid], kind=:anti) == DataTable(id = 5, fid = 5.0) + @test join(small, large, on=[:id, :fid], kind=:semi) == DataTable(id = [1, 3], + fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:anti) == DataTable(id = 5, + fid = 5.0) end end From 9c3ad2160238daaf4eeedddb92720185808def2f Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 17:56:20 -0700 Subject: [PATCH 23/43] update join help message and add note about temp fix --- src/abstractdatatable/join.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 68e4cfe..4844554 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -93,6 +93,7 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.on_cols) # fix the result of the rightjoin by taking the 
nonnull values from the right table + # end-length(rightonly_ixs.orig)+1:end was rightonly_ixs.join. Try and FIXME res[on_col][end-length(rightonly_ixs.orig)+1:end] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] end end @@ -214,8 +215,7 @@ join(dt1::AbstractDataTable, row of `dt1` is matched with every row of `dt2` For the three join operations that may introduce missing values, `:outer`, `:left`, -and `:right`, -Null values are filled in where needed to complete joins. +and `:right`, all columns of the returned datatable will be nullable. ### Result From 1e7d26e65cf58c2735fcaa8fd378edb4385b90a2 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 18:00:25 -0700 Subject: [PATCH 24/43] indentation --- src/abstractdatatable/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 4844554..44e15d3 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -84,7 +84,7 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, end for (i, col) in enumerate(columns(dtr_noon)) cols[i+ncleft] = kind == :inner ? 
col[all_orig_right_ixs] : - copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] + copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] end res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) From e39ba637718db19c579ee769ea3fbc8a691e545b Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 14:21:41 -0700 Subject: [PATCH 25/43] changes --- docs/src/lib/manipulation.md | 2 + docs/src/man/reshaping_and_pivoting.md | 23 +++ src/DataTables.jl | 2 + src/abstractdatatable/abstractdatatable.jl | 6 +- src/abstractdatatable/io.jl | 4 +- src/abstractdatatable/reshape.jl | 217 ++++++++++++++++++++- src/datatable/datatable.jl | 20 +- test/show.jl | 7 + 8 files changed, 269 insertions(+), 12 deletions(-) diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md index 8d24d4b..c67345a 100644 --- a/docs/src/lib/manipulation.md +++ b/docs/src/lib/manipulation.md @@ -20,4 +20,6 @@ join melt stack unstack +stackdt +meltdt ``` diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index d99e814..9be632a 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -53,6 +53,29 @@ If the remaining columns are unique, you can skip the id variable and use: widedt = unstack(longdt, :variable, :value) ``` +`stackdt` and `meltdt` are two additional functions that work like `stack` and `melt`, but they provide a view into the original wide DataTable. Here is an example: + +```julia +d = stackdt(iris) +``` + +This saves memory. To create the view, several AbstractVectors are defined: + +`:variable` column -- `RepeatedVector` +This repeats the variables N times where N is the number of rows of the original AbstractDataTable. + +`:value` column -- `StackedVector` +This provides a view of the original columns stacked together. 
+ +Id columns -- `RepeatedVector` +This repeats the original columns N times where N is the number of columns stacked. + +For more details on the storage representation, see: + +```julia +dump(stackdt(iris)) +``` + None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: ```julia diff --git a/src/DataTables.jl b/src/DataTables.jl index 4b89a3b..799f7f6 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -57,6 +57,7 @@ export @~, eltypes, groupby, melt, + meltdt, names!, ncol, nonunique, @@ -70,6 +71,7 @@ export @~, rename, showcols, stack, + stackdt, unique!, unstack, head, diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index b51c017..6f95073 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -860,7 +860,7 @@ julia> eltypes(dt) Nullable{Int64} ``` -See also [`denullify!`] & [`nullify`](@ref). +See also [`denullify!`] and [`nullify`](@ref). """ denullify(dt::AbstractDataTable) = denullify!(copy(dt)) @@ -899,7 +899,7 @@ julia> eltypes(dt) Nullable{Int64} ``` -See also [`nullify`](@ref) & [`denullify!`](@ref). +See also [`nullify`](@ref) and [`denullify!`](@ref). """ function nullify!(dt::AbstractDataTable) for i in 1:size(dt,2) @@ -946,7 +946,7 @@ julia> eltypes(dt) Int64 ``` -See also [`nullify!`](@ref) & [`denullify`](@ref). +See also [`nullify!`](@ref) and [`denullify`](@ref). """ function nullify(dt::AbstractDataTable) nullify!(copy(dt)) diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 6af518b..3c6ff81 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -45,7 +45,7 @@ function printtable(io::IO, if !isnull(dt[j][i]) if ! (etypes[j] <: Real) print(io, quotemark) - x = isa(dt[i, j], Nullable) ? 
unsafe_get(dt[i, j]) : dt[i, j] + x = unsafe_get(dt[i, j]) escapedprint(io, x, quotestr) print(io, quotemark) else @@ -168,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = isa(cell, Nullable) ? unsafe_get(cell) : cell + content = unsafe_get(cell) if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 381825b..aab7b5c 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -94,7 +94,7 @@ end function stack(dt::AbstractDataTable, measure_var::Int, id_vars::Vector{Int}; variable_name::Symbol=:variable, value_name::Symbol=:value) stack(dt, [measure_var], id_vars; - variable_name=variable_name, value_name=value_name) + variable_name=variable_name, value_name=value_name) end function stack(dt::AbstractDataTable, measure_vars, id_vars; variable_name::Symbol=:variable, value_name::Symbol=:value) @@ -221,3 +221,218 @@ function unstack(dt::AbstractDataTable, colkey::Int, value::Int) end unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) + +############################################################################## +## +## Reshaping using referencing (issue #145) +## New AbstractVector types (all read only): +## StackedVector +## RepeatedVector +## +############################################################################## + +""" +An AbstractVector{Any} that is a linear, concatenated view into +another set of AbstractVectors +NOTE: Not exported. +### Constructor +```julia +StackedVector(d::AbstractVector...) 
+``` +### Arguments +* `d...` : one or more AbstractVectors +### Examples +```julia +StackedVector(Any[[1,2], [9,10], [11,12]]) # [1,2,9,10,11,12] +``` +""" +type StackedVector <: AbstractVector{Any} + components::Vector{Any} +end + +function Base.getindex(v::StackedVector,i::Real) + lengths = [length(x)::Int for x in v.components] + cumlengths = [0; cumsum(lengths)] + j = searchsortedlast(cumlengths .+ 1, i) + if j > length(cumlengths) + error("indexing bounds error") + end + k = i - cumlengths[j] + if k < 1 || k > length(v.components[j]) + error("indexing bounds error") + end + v.components[j][k] +end + +function Base.getindex{I<:Real}(v::StackedVector,i::AbstractVector{I}) + result = similar(v.components[1], length(i)) + for idx in 1:length(i) + result[idx] = v[i[idx]] + end + result +end + +Base.size(v::StackedVector) = (length(v),) +Base.length(v::StackedVector) = sum(map(length, v.components)) +Base.ndims(v::StackedVector) = 1 +Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...) +Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims) + +CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient + + +""" +An AbstractVector that is a view into another AbstractVector with +repeated elements +NOTE: Not exported. +### Constructor +```julia +RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) +``` +### Arguments +* `parent` : the AbstractVector that's repeated +* `inner` : the number of times each element is repeated +* `outer` : the number of times the whole vector is repeated after + expanded by `inner` +`inner` and `outer` have the same meaning as similarly named arguments +to `repeat`. 
+### Examples +```julia +RepeatedVector([1,2], 3, 1) # [1,1,1,2,2,2] +RepeatedVector([1,2], 1, 3) # [1,2,1,2,1,2] +RepeatedVector([1,2], 2, 2) # [1,1,2,2,1,1,2,2] +``` +""" +type RepeatedVector{T} <: AbstractVector{T} + parent::AbstractVector{T} + inner::Int + outer::Int +end + +function Base.getindex{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I}) + N = length(v.parent) + idx = Int[Base.fld1(mod1(j,v.inner*N),v.inner) for j in i] + v.parent[idx] +end +function Base.getindex{T}(v::RepeatedVector{T},i::Real) + N = length(v.parent) + idx = Base.fld1(mod1(i,v.inner*N),v.inner) + v.parent[idx] +end +Base.getindex(v::RepeatedVector,i::Range) = getindex(v, [i;]) + +Base.size(v::RepeatedVector) = (length(v),) +Base.length(v::RepeatedVector) = v.inner * v.outer * length(v.parent) +Base.ndims(v::RepeatedVector) = 1 +Base.eltype{T}(v::RepeatedVector{T}) = T +Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.outer) +Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) +Base.unique(v::RepeatedVector) = unique(v.parent) + +function CategoricalArrays.CategoricalArray(v::RepeatedVector) + res = CategoricalArrays.CategoricalArray(v.parent) + res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) + res +end + +############################################################################## +## +## stackdt() +## meltdt() +## Reshaping using referencing (issue #145), using the above vector types +## +############################################################################## + +""" +A stacked view of a DataTable (long format) +Like `stack` and `melt`, but a view is returned rather than data +copies. 
+```julia +stackdt(dt::AbstractDataTable, [measure_vars], [id_vars]; + variable_name::Symbol=:variable, value_name::Symbol=:value) +meltdt(dt::AbstractDataTable, [id_vars], [measure_vars]; + variable_name::Symbol=:variable, value_name::Symbol=:value) +``` +### Arguments +* `dt` : the wide AbstractDataTable +* `measure_vars` : the columns to be stacked (the measurement + variables), a normal column indexing type, like a Symbol, + Vector{Symbol}, Int, etc.; for `melt`, defaults to all + variables that are not `id_vars` +* `id_vars` : the identifier columns that are repeated during + stacking, a normal column indexing type; for `stack` defaults to all + variables that are not `measure_vars` +### Result +* `::DataTable` : the long-format datatable with column `:value` + holding the values of the stacked columns (`measure_vars`), with + column `:variable` a Vector of Symbols with the `measure_vars` name, + and with columns for each of the `id_vars`. +The result is a view because the columns are special AbstractVectors +that return indexed views into the original DataTable. 
+### Examples +```julia +d1 = DataTable(a = repeat([1:3;], inner = [4]), + b = repeat([1:4;], inner = [3]), + c = randn(12), + d = randn(12), + e = map(string, 'a':'l')) +d1s = stackdt(d1, [:c, :d]) +d1s2 = stackdt(d1, [:c, :d], [:a]) +d1m = meltdt(d1, [:a, :b, :e]) +``` +""" +function stackdt(dt::AbstractDataTable, measure_vars::Vector{Int}, + id_vars::Vector{Int}; variable_name::Symbol=:variable, + value_name::Symbol=:value) + N = length(measure_vars) + cnames = names(dt)[id_vars] + insert!(cnames, 1, value_name) + insert!(cnames, 1, variable_name) + DataTable(Any[RepeatedVector(_names(dt)[measure_vars], nrow(dt), 1), # variable + StackedVector(Any[dt[:,c] for c in measure_vars]), # value + [RepeatedVector(dt[:,c], 1, N) for c in id_vars]...], # id_var columns + cnames) +end +function stackdt(dt::AbstractDataTable, measure_var::Int, id_var::Int; + variable_name::Symbol=:variable, value_name::Symbol=:value) + stackdt(dt, [measure_var], [id_var]; variable_name=variable_name, + value_name=value_name) +end +function stackdt(dt::AbstractDataTable, measure_vars, id_var::Int; + variable_name::Symbol=:variable, value_name::Symbol=:value) + stackdt(dt, measure_vars, [id_var]; variable_name=variable_name, + value_name=value_name) +end +function stackdt(dt::AbstractDataTable, measure_var::Int, id_vars; + variable_name::Symbol=:variable, value_name::Symbol=:value) + stackdt(dt, [measure_var], id_vars; variable_name=variable_name, + value_name=value_name) +end +function stackdt(dt::AbstractDataTable, measure_vars, id_vars; + variable_name::Symbol=:variable, value_name::Symbol=:value) + stackdt(dt, index(dt)[measure_vars], index(dt)[id_vars]; + variable_name=variable_name, value_name=value_name) +end +function stackdt(dt::AbstractDataTable, measure_vars = numeric_vars(dt); + variable_name::Symbol=:variable, value_name::Symbol=:value) + m_inds = index(dt)[measure_vars] + stackdt(dt, m_inds, _setdiff(1:ncol(dt), m_inds); + variable_name=variable_name, value_name=value_name) +end 
+ +""" +A stacked view of a DataTable (long format); see `stackdt` +""" +function meltdt(dt::AbstractDataTable, id_vars; variable_name::Symbol=:variable, + value_name::Symbol=:value) + id_inds = index(dt)[id_vars] + stackdt(dt, _setdiff(1:ncol(dt), id_inds), id_inds; + variable_name=variable_name, value_name=value_name) +end +function meltdt(dt::AbstractDataTable, id_vars, measure_vars; + variable_name::Symbol=:variable, value_name::Symbol=:value) + stackdt(dt, measure_vars, id_vars; variable_name=variable_name, + value_name=value_name) +end +meltdt(dt::AbstractDataTable; variable_name::Symbol=:variable, value_name::Symbol=:value) = + stackdt(dt; variable_name=variable_name, value_name=value_name) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 96cf59a..57d78a8 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -145,7 +145,19 @@ function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractV columns = Vector{Any}(p) for j in 1:p elty = column_eltypes[j] - columns[j] = elty <: Nullable ? 
NullableArray{eltype(elty)}(nrows) : Vector{elty}(nrows) + if elty <: Nullable + if eltype(elty) <: CategoricalValue + columns[j] = NullableCategoricalArray{eltype(elty).parameters[1]}(nrows) + else + columns[j] = NullableArray{eltype(elty)}(nrows) + end + else + if elty <: CategoricalValue + columns[j] = CategoricalArray{elty.parameters[1]}(nrows) + else + columns[j] = Vector{elty}(nrows) + end + end end return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end @@ -731,11 +743,7 @@ function hcat!(dt1::DataTable, dt2::AbstractDataTable) return dt1 end -hcat!(dt::DataTable, x::CategoricalArray) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::NullableCategoricalArray) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::NullableVector) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::Vector) = hcat!(dt, DataTable(Any[(x)])) -hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[([x])])) +hcat!(dt::DataTable, x::AbstractVector) = hcat!(dt, DataTable(Any[x])) # hcat! 
for 1-n arguments hcat!(dt::DataTable) = dt diff --git a/test/show.jl b/test/show.jl index abad44c..8279458 100644 --- a/test/show.jl +++ b/test/show.jl @@ -30,6 +30,13 @@ module TestShow dt = DataTable(A = Vector{String}(3)) + A = DataTables.StackedVector(Any[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + show(io, A) + A = DataTables.RepeatedVector([1, 2, 3], 5, 1) + show(io, A) + A = DataTables.RepeatedVector([1, 2, 3], 1, 5) + show(io, A) + #Test show output for REPL and similar dt = DataTable(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()]) io = IOBuffer() From 04cb9eef008c32edee869769b920d1d69bb046ee Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 14:24:59 -0700 Subject: [PATCH 26/43] spacing --- test/show.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/show.jl b/test/show.jl index 8279458..8bbbd78 100644 --- a/test/show.jl +++ b/test/show.jl @@ -30,11 +30,11 @@ module TestShow dt = DataTable(A = Vector{String}(3)) - A = DataTables.StackedVector(Any[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 5, 1) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 1, 5) + A = DataTables.StackedVector(Any[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + show(io, A) + A = DataTables.RepeatedVector([1, 2, 3], 5, 1) + show(io, A) + A = DataTables.RepeatedVector([1, 2, 3], 1, 5) show(io, A) #Test show output for REPL and similar From 5d706857998db1731abe69ca43352f7a9287dbab Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 20:25:54 -0700 Subject: [PATCH 27/43] put old unstack back and stabilize types, ordering --- src/abstractdatatable/join.jl | 15 ++++--- src/abstractdatatable/reshape.jl | 71 ++++++++++++++++++++++++-------- src/datatable/datatable.jl | 6 +-- test/data.jl | 6 +-- test/datatable.jl | 61 ++++++++++++++++++--------- 5 files changed, 110 insertions(+), 49 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 
44e15d3..cbc32cc 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -3,14 +3,17 @@ ## # Like similar, but returns a nullable array -similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableArray(T, dims) +similar_nullable{T}(dv::AbstractVector{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableVector{T}(dims) -similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableArray(eltype(T), dims) +similar_nullable{T<:Nullable}(dv::AbstractVector{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableVector{eltype(T)}(dims) -similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableCategoricalArray(T, dims) +similar_nullable{T,R}(dv::CategoricalVector{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalVector{T}(dims) + +similar_nullable{T,R}(dv::NullableCategoricalVector{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalVector{T}(dims) # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index aab7b5c..2ca3204 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -53,6 +53,11 @@ melt(dt::AbstractDataTable, [id_vars], [measure_vars]; column `:variable` a Vector of Symbols with the `measure_vars` name, and with columns for each of the `id_vars`. +See also `stackdt` and `meltdt` for stacking methods that return a +view into the original DataTable. See `unstack` for converting from +long to wide format. 
+ + ### Examples ```julia @@ -135,6 +140,7 @@ end melt(dt::AbstractDataTable; variable_name::Symbol=:variable, value_name::Symbol=:value) = stack(dt; variable_name=variable_name, value_name=value_name) + ############################################################################## ## ## unstack() @@ -188,17 +194,28 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) # `rowkey` integer indicating which column to place along rows # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values - values = dt[value] - newcols = dt[colkey] - uniquenewcols = unique(newcols) - ncol = length(uniquenewcols) + 1 - columns = Vector{Any}(ncol) - columns[1] = unique(dt[rowkey]) - for (i,coli) in enumerate(2:ncol) - columns[coli] = values[find(newcols .== uniquenewcols[i])] + refkeycol = NullableCategoricalArray(dt[rowkey]) + levels!(refkeycol, unique(dt[rowkey])) + valuecol = dt[value] + keycol = NullableCategoricalArray(dt[colkey]) + levels!(keycol, unique(dt[colkey])) + Nrow = length(refkeycol.pool) + Ncol = length(keycol.pool) + payload = DataTable(Any[similar_nullable(valuecol, Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) + nowarning = true + for k in 1:nrow(dt) + j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) + i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) + if i > 0 && j > 0 + if nowarning && !isnull(payload[j][i]) + warn("Duplicate entries in unstack.") + nowarning = false + end + payload[j][i] = valuecol[k] + end end - colnames = vcat(names(dt)[rowkey], Symbol.(uniquenewcols)) - DataTable(columns, colnames) + col = typeof(similar_nullable(dt[rowkey], 1))(levels(refkeycol)) + insert!(payload, 1, col, _names(dt)[rowkey]) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) @@ -208,20 +225,38 @@ unstack(dt::AbstractDataTable, colkey, value) = unstack(dt, 
index(dt)[colkey], index(dt)[value]) function unstack(dt::AbstractDataTable, colkey::Int, value::Int) - anchor = unique(dt[deleteat!(names(dt), [colkey, value])]) - groups = groupby(dt, names(anchor)) - newcolnames = unique(dt[colkey]) - newcols = DataTable(Any[typeof(dt[value])(size(anchor,1)) for n in newcolnames], Symbol.(newcolnames)) - for (i, g) in enumerate(groups) - for col in newcolnames - newcols[i, Symbol(col)] = g[g[colkey] .== col, value][1] + # group on anything not a key or value: + g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]])) + groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] + rowkey = zeros(Int, size(dt, 1)) + for i in 1:length(groupidxs) + rowkey[groupidxs[i]] = i + end + keycol = NullableCategoricalArray(dt[colkey]) + levels!(keycol, unique(dt[colkey])) + valuecol = dt[value] + dt1 = nullify!(dt[g.idx[g.starts], g.cols]) + Nrow = length(g) + Ncol = length(levels(keycol)) + dt2 = DataTable(Any[similar_nullable(valuecol, Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) + nowarning = true + for k in 1:nrow(dt) + j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) + i = rowkey[k] + if i > 0 && j > 0 + if nowarning && !isnull(dt2[j][i]) + warn("Duplicate entries in unstack at row $k.") + nowarning = false + end + dt2[j][i] = valuecol[k] end end - hcat(anchor, newcols) + hcat(dt1, dt2) end unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) + ############################################################################## ## ## Reshaping using referencing (issue #145) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 57d78a8..5d3e231 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -147,13 +147,13 @@ function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractV elty = column_eltypes[j] if elty <: Nullable if eltype(elty) <: CategoricalValue - columns[j] = 
NullableCategoricalArray{eltype(elty).parameters[1]}(nrows) + columns[j] = NullableCategoricalVector{eltype(elty).parameters[1]}(nrows) else - columns[j] = NullableArray{eltype(elty)}(nrows) + columns[j] = NullableVector{eltype(elty)}(nrows) end else if elty <: CategoricalValue - columns[j] = CategoricalArray{elty.parameters[1]}(nrows) + columns[j] = CategoricalVector{elty.parameters[1]}(nrows) else columns[j] = Vector{elty}(nrows) end diff --git a/test/data.jl b/test/data.jl index ca54f26..d0272e1 100644 --- a/test/data.jl +++ b/test/data.jl @@ -192,9 +192,9 @@ module TestData d1us = unstack(d1s, :id, :variable, :value) d1us2 = unstack(d1s2) d1us3 = unstack(d1s2, :variable, :value) - @test d1us[:a] == d1[:a] - @test d1us2[:d] == d1[:d] - @test d1us2[:3] == d1[:d] + @test isequal(d1us[:a], NullableArray(d1[:a])) + @test isequal(d1us2[:d], NullableArray(d1[:d])) + @test isequal(d1us2[:3], NullableArray(d1[:d])) diff --git a/test/datatable.jl b/test/datatable.jl index 6769733..4adaf24 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -286,25 +286,48 @@ module TestDataTable @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()])) end - dt = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), - Key = ["Mass", "Color", "Mass", "Color"], - Value = ["12 g", "Red", "18 g", "Grey"]) - # Check that reordering levels does not confuse unstack - levels!(dt[1], ["XXX", "Bob", "Batman"]) - #Unstack specifying a row column - dt2 = unstack(dt, :Fish, :Key, :Value) - #Unstack without specifying a row column - dt3 = unstack(dt, :Key, :Value) - #The expected output - dt4 = DataTable(Fish = ["Bob", "Batman"], - Mass = ["12 g", "18 g"], - Color = ["Red", "Grey"] ) - @test isequal(dt2, dt4) - @test isequal(dt3, dt4) - # can't assign Nullable() to a typed column - #Make sure unstack works with NULLs at the start of the value column - # dt[1,:Value] = Nullable() - dt2 = unstack(dt,:Fish, :Key, :Value) + @testset "unstacking 
and nullables" begin + dtA = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), + Key = ["Mass", "Color", "Mass", "Color"], + Value = ["12 g", "Red", "18 g", "Grey"]) + # Check that reordering levels does not confuse unstack + levels!(dtA[1], ["XXX", "Bob", "Batman"]) + # should all be the same, just different column types + dt2A = unstack(dtA, :Fish, :Key, :Value) + dt3A = unstack(dtA, :Key, :Value) + #The expected output + dt4A = DataTable(Fish = NullableCategoricalArray(["Bob", "Batman"]), + Mass = NullableArray(["12 g", "18 g"]), + Color = NullableArray(["Red", "Grey"])) + @test dt2A == dt3A == dt4A + + dtB = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), + Key = CategoricalArray(["Mass", "Color", "Mass", "Color"]), + Value = CategoricalArray(["12 g", "Red", "18 g", "Grey"])) + dt2B = unstack(dtB, :Fish, :Key, :Value) + dt3B = unstack(dtB, :Key, :Value) + # fixme, these are all being reordered by NullableCategoricalArray constructor + dt4B = DataTable(Fish = NullableCategoricalArray(["Batman", "Bob"]), + Color = NullableCategoricalArray(["Grey", "Red"]), + Mass = NullableCategoricalArray(["18 g", "12 g"])) + @test dt2B == dt3B[[2,1], :] == dt4B + + # test multiple entries in unstack error + dt = DataTable(id=[1, 2, 1, 2], variable=["a", "b", "a", "b"], value=[3, 4, 5, 6]) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = Nullable[5, Nullable()], b = Nullable[Nullable(), 6]) + + dt = DataTable(id=1:2, variable=["a", "b"], value=3:4) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = Nullable[3, Nullable()], b = Nullable[Nullable(), 4]) + + dt = DataTable(id=1:2, variable=["a", "b"], value=3:4) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = [3, Nullable()], b = [Nullable(), 
4]) + end dt = DataTable(A = 1:10, B = 'A':'J') @test !(dt[:,:] === dt) From 7859132539b351111ec4888f6dfbc4595092ed17 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 20:43:07 -0700 Subject: [PATCH 28/43] fix bad copy and paste spacing and condense scalar recycling code --- src/abstractdatatable/reshape.jl | 28 ++++++++++++++++++++++++++++ src/datatable/datatable.jl | 8 ++------ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 2ca3204..df4be00 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -269,17 +269,25 @@ unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) """ An AbstractVector{Any} that is a linear, concatenated view into another set of AbstractVectors + NOTE: Not exported. + ### Constructor + ```julia StackedVector(d::AbstractVector...) ``` + ### Arguments + * `d...` : one or more AbstractVectors + ### Examples + ```julia StackedVector(Any[[1,2], [9,10], [11,12]]) # [1,2,9,10,11,12] ``` + """ type StackedVector <: AbstractVector{Any} components::Vector{Any} @@ -319,24 +327,33 @@ CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # """ An AbstractVector that is a view into another AbstractVector with repeated elements + NOTE: Not exported. + ### Constructor + ```julia RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) + ``` + ### Arguments + * `parent` : the AbstractVector that's repeated * `inner` : the numer of times each element is repeated * `outer` : the numer of times the whole vector is repeated after expanded by `inner` `inner` and `outer` have the same meaning as similarly named arguments to `repeat`. 
+ ### Examples + ```julia RepeatedVector([1,2], 3, 1) # [1,1,1,2,2,2] RepeatedVector([1,2], 1, 3) # [1,2,1,2,1,2] RepeatedVector([1,2], 2, 2) # [1,2,1,2,1,2,1,2] ``` + """ type RepeatedVector{T} <: AbstractVector{T} parent::AbstractVector{T} @@ -382,28 +399,37 @@ end A stacked view of a DataTable (long format) Like `stack` and `melt`, but a view is returned rather than data copies. + ```julia stackdt(dt::AbstractDataTable, [measure_vars], [id_vars]; variable_name::Symbol=:variable, value_name::Symbol=:value) meltdt(dt::AbstractDataTable, [id_vars], [measure_vars]; variable_name::Symbol=:variable, value_name::Symbol=:value) ``` + ### Arguments + * `dt` : the wide AbstractDataTable + * `measure_vars` : the columns to be stacked (the measurement variables), a normal column indexing type, like a Symbol, Vector{Symbol}, Int, etc.; for `melt`, defaults to all variables that are not `id_vars` + * `id_vars` : the identifier columns that are repeated during stacking, a normal column indexing type; for `stack` defaults to all variables that are not `measure_vars` + ### Result + * `::DataTable` : the long-format datatable with column `:value` holding the values of the stacked columns (`measure_vars`), with column `:variable` a Vector of Symbols with the `measure_vars` name, and with columns for each of the `id_vars`. + The result is a view because the columns are special AbstractVectors that return indexed views into the original DataTable. 
+ ### Examples ```julia d1 = DataTable(a = repeat([1:3;], inner = [4]), @@ -411,10 +437,12 @@ d1 = DataTable(a = repeat([1:3;], inner = [4]), c = randn(12), d = randn(12), e = map(string, 'a':'l')) + d1s = stackdt(d1, [:c, :d]) d1s2 = stackdt(d1, [:c, :d], [:a]) d1m = meltdt(d1, [:a, :b, :e]) ``` + """ function stackdt(dt::AbstractDataTable, measure_vars::Vector{Int}, id_vars::Vector{Int}; variable_name::Symbol=:variable, diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 5d3e231..06673d3 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -83,7 +83,7 @@ type DataTable <: AbstractDataTable minlen, maxlen = extrema(lengths) if minlen == 0 && maxlen == 0 return new(columns, colindex) - elseif minlen != maxlen + elseif minlen != maxlen || minlen == maxlen == 1 # recycle scalars for i in 1:length(columns) typeof(columns[i]) <: AbstractArray && continue @@ -101,12 +101,8 @@ type DataTable <: AbstractDataTable for (i,c) in enumerate(columns) if isa(c, Range) columns[i] = collect(c) - elseif !isa(c, AbstractVector) - if isa(c, AbstractArray) + elseif !isa(c, AbstractVector) && isa(c, AbstractArray) throw(DimensionMismatch("columns must be 1-dimensional")) - else - columns[i] = [c] - end else columns[i] = c end From 6496acfd09c8769913ea8a11f19e36b37c13dff0 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 23:06:56 -0700 Subject: [PATCH 29/43] update vcat error --- src/abstractdatatable/abstractdatatable.jl | 22 ++++++++++++++++------ test/cat.jl | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 6f95073..ef66f35 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -747,13 +747,23 @@ function Base.vcat(dts::AbstractDataTable...) 
uniqueheaders = unique(allheaders[notempty]) if length(uniqueheaders) == 0 return DataTable() - elseif length(uniqueheaders) > 1 - estring = Vector{String}(length(uniqueheaders)) - for (i,u) in enumerate(uniqueheaders) - indices = string.(find(x -> x == u, allheaders)) - estring[i] = "columns ($(join(u, ", "))) of input(s) ($(join(indices, ", ")))" + end + coldiff = setdiff(union(uniqueheaders...), intersect(uniqueheaders...)) + if length(uniqueheaders) > 1 + if !isempty(coldiff) + headerlengths = length.(uniqueheaders) + minheaderloci = find(headerlengths .== minimum(headerlengths)) + minheaders = uniqueheaders[minheaderloci[1]] + throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))")) + else + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + indices = find(a -> a == u, allheaders) + indices = join(string.(indices), ", ") + estrings[i] = "column order of argument(s) ($indices)" + end + throw(ArgumentError(join(estrings, " != "))) end - throw(ArgumentError(join(estring, " != "))) else header = uniqueheaders[1] dts_to_vcat = dts[notempty] diff --git a/test/cat.jl b/test/cat.jl index a5b41b5..6303db1 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -125,5 +125,20 @@ module TestCat @test_throws ArgumentError vcat(dt1, dt2) dt2 = DataTable(A = 1:3, C = 1:3) @test_throws ArgumentError vcat(dt1, dt2) + dt1 = DataTable(A = 1, B = 1) + dt2 = DataTable(B = 1, A = 1) + @test_throws ArgumentError vcat(dt1, dt2) + @test_throws ArgumentError vcat(dt1, dt1, dt1, dt1, dt2, dt2, dt2, dt2) + dt3 = DataTable(A = 1, B = 1, C = 1) + @test_throws ArgumentError vcat(dt1, dt3) + @test_throws ArgumentError vcat(dt2, dt3) + dt4 = DataTable(A = 1, B = 1, C = 1, D = 1) + @test_throws ArgumentError vcat(dt1, dt4) + @test_throws ArgumentError vcat(dt2, dt4) + @test_throws ArgumentError vcat(dt3, dt4) + dt5 = hcat(dt4, dt4, dt4, dt4) + @test_throws ArgumentError vcat(dt3, 
dt5) + dt5r = names!(copy(dt5), reverse(names(dt5))) + @test_throws ArgumentError vcat(dt5, dt5r) end end From f47810fcf70087932a4029b41fd18d02f3dd5dc0 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 23:30:34 -0700 Subject: [PATCH 30/43] unused function, another test, remove unused variable --- src/abstractdatatable/abstractdatatable.jl | 14 +------------- test/cat.jl | 1 + 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index ef66f35..bc40a3c 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -663,17 +663,6 @@ unique!(dt) # modifies dt """ (unique, unique!) -function nonuniquekey(dt::AbstractDataTable) - # Here's another (probably a lot faster) way to do `nonunique` - # by grouping on all columns. It will fail if columns cannot be - # made into CategoricalVector's. - gd = groupby(dt, _names(dt)) - idx = [1:length(gd.idx)][gd.idx][gd.starts] - res = fill(true, nrow(dt)) - res[idx] = false - res -end - # Count the number of missing values in every column of an AbstractDataTable. function colmissing(dt::AbstractDataTable) # -> Vector{Int} nrows, ncols = size(dt) @@ -751,9 +740,8 @@ function Base.vcat(dts::AbstractDataTable...) 
coldiff = setdiff(union(uniqueheaders...), intersect(uniqueheaders...)) if length(uniqueheaders) > 1 if !isempty(coldiff) - headerlengths = length.(uniqueheaders) + headerlengths = length.(allheaders) minheaderloci = find(headerlengths .== minimum(headerlengths)) - minheaders = uniqueheaders[minheaderloci[1]] throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))")) else estrings = Vector{String}(length(uniqueheaders)) diff --git a/test/cat.jl b/test/cat.jl index 6303db1..9b35649 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -131,6 +131,7 @@ module TestCat @test_throws ArgumentError vcat(dt1, dt1, dt1, dt1, dt2, dt2, dt2, dt2) dt3 = DataTable(A = 1, B = 1, C = 1) @test_throws ArgumentError vcat(dt1, dt3) + @test_throws ArgumentError vcat(dt1, dt1, dt3, dt3) @test_throws ArgumentError vcat(dt2, dt3) dt4 = DataTable(A = 1, B = 1, C = 1, D = 1) @test_throws ArgumentError vcat(dt1, dt4) From 259ceef7e36cecc80a9fbf7b1bd32f6a77162f99 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 00:05:39 -0700 Subject: [PATCH 31/43] revert function removal to appease new code failures? --- src/abstractdatatable/abstractdatatable.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index bc40a3c..ffe4213 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -663,6 +663,17 @@ unique!(dt) # modifies dt """ (unique, unique!) +function nonuniquekey(dt::AbstractDataTable) + # Here's another (probably a lot faster) way to do `nonunique` + # by grouping on all columns. It will fail if columns cannot be + # made into CategoricalVector's. 
+ gd = groupby(dt, _names(dt)) + idx = [1:length(gd.idx)][gd.idx][gd.starts] + res = fill(true, nrow(dt)) + res[idx] = false + res +end + # Count the number of missing values in every column of an AbstractDataTable. function colmissing(dt::AbstractDataTable) # -> Vector{Int} nrows, ncols = size(dt) From 26e87ac2c76bfef3e6f42baa1a960b3ee950fedb Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 00:21:01 -0700 Subject: [PATCH 32/43] fix v0.5 issue --- src/abstractdatatable/abstractdatatable.jl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index ffe4213..8254bb8 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -663,15 +663,15 @@ unique!(dt) # modifies dt """ (unique, unique!) -function nonuniquekey(dt::AbstractDataTable) - # Here's another (probably a lot faster) way to do `nonunique` - # by grouping on all columns. It will fail if columns cannot be - # made into CategoricalVector's. - gd = groupby(dt, _names(dt)) - idx = [1:length(gd.idx)][gd.idx][gd.starts] - res = fill(true, nrow(dt)) - res[idx] = false - res +function nonuniquekey(dt::AbstractDataTable) + # Here's another (probably a lot faster) way to do `nonunique` + # by grouping on all columns. It will fail if columns cannot be + # made into CategoricalVector's. + gd = groupby(dt, _names(dt)) + idx = [1:length(gd.idx)][gd.idx][gd.starts] + res = fill(true, nrow(dt)) + res[idx] = false + res end # Count the number of missing values in every column of an AbstractDataTable. @@ -752,7 +752,8 @@ function Base.vcat(dts::AbstractDataTable...) 
if length(uniqueheaders) > 1 if !isempty(coldiff) headerlengths = length.(allheaders) - minheaderloci = find(headerlengths .== minimum(headerlengths)) + m = minimum(headerlengths) + minheaderloci = find(h -> h == m, headerlengths) throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))")) else estrings = Vector{String}(length(uniqueheaders)) From e0f7982dc9c3647b029ecf8f07db96d90ac7e09d Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 11:46:25 -0700 Subject: [PATCH 33/43] update vcat testing and change similar_nullable constructor call --- src/abstractdatatable/abstractdatatable.jl | 45 ++++++++---- src/abstractdatatable/reshape.jl | 5 +- test/cat.jl | 81 +++++++++++++++++----- 3 files changed, 96 insertions(+), 35 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index ffe4213..195b496 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -663,15 +663,15 @@ unique!(dt) # modifies dt """ (unique, unique!) -function nonuniquekey(dt::AbstractDataTable) - # Here's another (probably a lot faster) way to do `nonunique` - # by grouping on all columns. It will fail if columns cannot be - # made into CategoricalVector's. - gd = groupby(dt, _names(dt)) - idx = [1:length(gd.idx)][gd.idx][gd.starts] - res = fill(true, nrow(dt)) - res[idx] = false - res +function nonuniquekey(dt::AbstractDataTable) + # Here's another (probably a lot faster) way to do `nonunique` + # by grouping on all columns. It will fail if columns cannot be + # made into CategoricalVector's. + gd = groupby(dt, _names(dt)) + idx = [1:length(gd.idx)][gd.idx][gd.starts] + res = fill(true, nrow(dt)) + res[idx] = false + res end # Count the number of missing values in every column of an AbstractDataTable. @@ -748,18 +748,33 @@ function Base.vcat(dts::AbstractDataTable...) 
if length(uniqueheaders) == 0 return DataTable() end - coldiff = setdiff(union(uniqueheaders...), intersect(uniqueheaders...)) if length(uniqueheaders) > 1 + unionunique = union(uniqueheaders...) + coldiff = setdiff(unionunique, intersect(uniqueheaders...)) if !isempty(coldiff) - headerlengths = length.(allheaders) - minheaderloci = find(headerlengths .== minimum(headerlengths)) - throw(ArgumentError("column(s) ($(join(string.(coldiff), ", "))) are missing from argument(s) ($(join(string.(minheaderloci), ", ")))")) + # if any datatables are a full superset of names, skip them + filter!(u -> Set(u) != Set(unionunique), uniqueheaders) + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + matchingloci = find(h -> u == h, allheaders) + headerdiff = filter(x -> !in(x, u), coldiff) + headerdiff = length(headerdiff) > 1 ? + join(string.(headerdiff[1:end-1]), ", ") * " and " * string(headerdiff[end]) : + string(headerdiff[end]) + matchingloci = length(matchingloci) > 1 ? + join(string.(matchingloci[1:end-1]), ", ") * " and " * string(matchingloci[end]) : + string(matchingloci[end]) + estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci" + end + throw(ArgumentError(join(estrings, ", and "))) else estrings = Vector{String}(length(uniqueheaders)) for (i, u) in enumerate(uniqueheaders) indices = find(a -> a == u, allheaders) - indices = join(string.(indices), ", ") - estrings[i] = "column order of argument(s) ($indices)" + indices = length(indices) > 1 ? 
+ join(string.(indices[1:end-1]), ", ") * " and " * string(indices[end]) : + string(indices[end]) + estrings[i] = "column order of argument(s) $indices" end throw(ArgumentError(join(estrings, " != "))) end diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index df4be00..3368d81 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -214,8 +214,9 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) payload[j][i] = valuecol[k] end end - col = typeof(similar_nullable(dt[rowkey], 1))(levels(refkeycol)) - insert!(payload, 1, col, _names(dt)[rowkey]) + levs = levels(refkeycol) + col = similar_nullable(dt[rowkey], length(levs)) + insert!(payload, 1, copy!(col, levs), _names(dt)[rowkey]) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) diff --git a/test/cat.jl b/test/cat.jl index 9b35649..5f9ac7c 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -122,24 +122,69 @@ module TestCat @testset "vcat errors" begin dt1 = DataTable(A = 1:3, B = 1:3) dt2 = DataTable(A = 1:3) - @test_throws ArgumentError vcat(dt1, dt2) - dt2 = DataTable(A = 1:3, C = 1:3) - @test_throws ArgumentError vcat(dt1, dt2) + # right missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # left missing 1 column + err = @test_throws ArgumentError vcat(dt2, dt1) + @test err.value.msg == "column(s) B are missing from argument(s) 1" + # multiple missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" + # argument missing >1columns + dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2" + # >1 arguments missing >1 
columns + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5" + # out of order + dt2 = dt1[reverse(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2" + # left >1 + err = @test_throws ArgumentError vcat(dt1, dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3" + # right >1 + err = @test_throws ArgumentError vcat(dt1, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3" + # left and right >1 + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6" + # >2 groups out of order + srand(1) + dt3 = dt1[shuffle(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10" + # missing columns throws error before out of order columns dt1 = DataTable(A = 1, B = 1) - dt2 = DataTable(B = 1, A = 1) - @test_throws ArgumentError vcat(dt1, dt2) - @test_throws ArgumentError vcat(dt1, dt1, dt1, dt1, dt2, dt2, dt2, dt2) - dt3 = DataTable(A = 1, B = 1, C = 1) - @test_throws ArgumentError vcat(dt1, dt3) - @test_throws ArgumentError vcat(dt1, dt1, dt3, dt3) - @test_throws ArgumentError vcat(dt2, dt3) - dt4 = DataTable(A = 1, B = 1, C = 1, D = 1) - @test_throws ArgumentError vcat(dt1, dt4) - @test_throws ArgumentError vcat(dt2, dt4) - @test_throws ArgumentError vcat(dt3, dt4) - dt5 = hcat(dt4, dt4, dt4, dt4) - @test_throws ArgumentError vcat(dt3, dt5) - dt5r = names!(copy(dt5), reverse(names(dt5))) - @test_throws ArgumentError vcat(dt5, dt5r) + dt2 = DataTable(A = 1) + dt3 
= DataTable(B = 1, A = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # unique columns for both sides + dt1 = DataTable(A = 1, B = 1, C = 1, D = 1) + dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" + dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + # dt4 is a superset of names found in all other datatables and won't be shown in error + dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from 
argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, and column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" end end From b0c29b4a1ddd14752846d664298fefa86ce149d4 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 11:48:58 -0700 Subject: [PATCH 34/43] remove old error message from docstring --- src/abstractdatatable/abstractdatatable.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 195b496..6efeb31 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -721,8 +721,6 @@ julia> dt1 = DataTable(A=1:3, B=1:3); julia> dt2 = DataTable(A=4:6, B=4:6); -julia> dt3 = DataTable(A=7:9, B=7:9, C=7:9); - julia> vcat(dt1, dt2) 6×2 DataTables.DataTable │ Row │ A │ B │ @@ -733,9 +731,6 @@ julia> vcat(dt1, dt2) │ 4 │ 4 │ 4 │ │ 5 │ 5 │ 5 │ │ 6 │ 6 │ 6 │ - -julia> vcat(dt1, dt2, dt3) -ERROR: ArgumentError: columns (A, B) of input(s) (1, 2) != columns (A, B, C) of input(s) (3) ``` """ Base.vcat(dt::AbstractDataTable) = dt From 95a6f314ef1917ebd8e9a6c57959a4d236263a6e Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 11:49:22 -0700 Subject: [PATCH 35/43] and change docstring to doctest --- src/abstractdatatable/abstractdatatable.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 6efeb31..88fce9f 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -716,7 +716,7 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable Vertically concatenate `AbstractDataTables` that have the same column names in the same order. -```julia +```jldoctest julia> dt1 = DataTable(A=1:3, B=1:3); julia> dt2 = DataTable(A=4:6, B=4:6); From 7df712f08cced61ced0ff1c20690db92e0d5292a Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 12:05:38 -0700 Subject: [PATCH 36/43] change similar_nullable back and fix unrelated copy paste space removal --- src/abstractdatatable/join.jl | 16 ++++++++-------- src/abstractdatatable/reshape.jl | 3 +++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index cbc32cc..15f5c2e 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -3,17 +3,17 @@ ## # Like similar, but returns a nullable array -similar_nullable{T}(dv::AbstractVector{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableVector{T}(dims) +similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableArray(T, dims) -similar_nullable{T<:Nullable}(dv::AbstractVector{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableVector{eltype(T)}(dims) +similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableArray(eltype(T), dims) -similar_nullable{T,R}(dv::CategoricalVector{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableCategoricalVector{T}(dims) +similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalArray(T, dims) -similar_nullable{T,R}(dv::NullableCategoricalVector{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = - NullableCategoricalVector{T}(dims) 
+similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalArray(T, dims) # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 3368d81..2b0dadf 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -344,6 +344,7 @@ RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) * `inner` : the numer of times each element is repeated * `outer` : the numer of times the whole vector is repeated after expanded by `inner` + `inner` and `outer` have the same meaning as similarly named arguments to `repeat`. @@ -398,6 +399,7 @@ end """ A stacked view of a DataTable (long format) + Like `stack` and `melt`, but a view is returned rather than data copies. @@ -432,6 +434,7 @@ The result is a view because the columns are special AbstractVectors that return indexed views into the original DataTable. ### Examples + ```julia d1 = DataTable(a = repeat([1:3;], inner = [4]), b = repeat([1:4;], inner = [3]), From 27da644350ca13619088c698d6afbacc2fd9ac52 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 12:39:57 -0700 Subject: [PATCH 37/43] add missing rightperm reordering and properly unify hcat! functions --- src/abstractdatatable/join.jl | 2 +- src/datatable/datatable.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 15f5c2e..a239927 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -86,7 +86,7 @@ function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, copy!(similar_nullable(col, nrow), col[all_orig_left_ixs]) end for (i, col) in enumerate(columns(dtr_noon)) - cols[i+ncleft] = kind == :inner ? col[all_orig_right_ixs] : + cols[i+ncleft] = kind == :inner ? 
col[all_orig_right_ixs][right_perm] : copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] end res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 06673d3..91772a6 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -739,7 +739,7 @@ function hcat!(dt1::DataTable, dt2::AbstractDataTable) return dt1 end -hcat!(dt::DataTable, x::AbstractVector) = hcat!(dt, DataTable(Any[x])) +hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[x])) # hcat! for 1-n arguments hcat!(dt::DataTable) = dt From 5fa8fa031fb98873ff4d907c3f55be5185de63d7 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 12:47:45 -0700 Subject: [PATCH 38/43] accidental spacing changes --- src/abstractdatatable/reshape.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index 2b0dadf..c9f1f82 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -140,7 +140,6 @@ end melt(dt::AbstractDataTable; variable_name::Symbol=:variable, value_name::Symbol=:value) = stack(dt; variable_name=variable_name, value_name=value_name) - ############################################################################## ## ## unstack() From a1d58f93480fa382141cb37c6b419b7540124190 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 13:01:52 -0700 Subject: [PATCH 39/43] forgot one spacing change --- src/abstractdatatable/reshape.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index c9f1f82..f7f774a 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -334,7 +334,6 @@ NOTE: Not exported. 
```julia RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) - ``` ### Arguments From db87443120e0ed493807b0399eb313818ecf9d67 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 13:44:48 -0700 Subject: [PATCH 40/43] change deprecations --- src/deprecated.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 83912d7..6f176a8 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -18,5 +18,5 @@ import Base: keys, values, insert! @deprecate sub(dt::AbstractDataTable, rows) view(dt, rows) -@deprecate stackdf stack -@deprecate meltdf melt +@deprecate stackdf stackdt +@deprecate meltdf meltdt From 9c66a1e1edfe686f1092c04c718e6b972bf7958f Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Sat, 18 Mar 2017 13:49:38 -0700 Subject: [PATCH 41/43] add back extra spaces --- docs/src/man/reshaping_and_pivoting.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 9be632a..1b936e1 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -61,13 +61,13 @@ d = stackdt(iris) This saves memory. To create the view, several AbstractVectors are defined: -`:variable` column -- `EachRepeatedVector` +`:variable` column -- `EachRepeatedVector` This repeats the variables N times where N is the number of rows of the original AbstractDataTable. -`:value` column -- `StackedVector` +`:value` column -- `StackedVector` This is provides a view of the original columns stacked together. -Id columns -- `RepeatedVector` +Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. 
For more details on the storage representation, see: From 887346ba5293c845b39f6f7746c3164ec34f620f Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 20 Mar 2017 11:28:58 -0700 Subject: [PATCH 42/43] bump catarrays version, remove manual resetting of levels in unstack and adjust tests accordingly --- REQUIRE | 2 +- src/abstractdatatable/reshape.jl | 5 +---- test/datatable.jl | 12 +++++------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/REQUIRE b/REQUIRE index 7bb9ed3..b18bc91 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,6 +1,6 @@ julia 0.5 NullableArrays 0.1.0 -CategoricalArrays 0.1.2 +CategoricalArrays 0.1.3 StatsBase 0.11.0 SortingAlgorithms Reexport diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index f7f774a..ed26cd4 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -194,10 +194,8 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values refkeycol = NullableCategoricalArray(dt[rowkey]) - levels!(refkeycol, unique(dt[rowkey])) valuecol = dt[value] keycol = NullableCategoricalArray(dt[colkey]) - levels!(keycol, unique(dt[colkey])) Nrow = length(refkeycol.pool) Ncol = length(keycol.pool) payload = DataTable(Any[similar_nullable(valuecol, Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) @@ -226,14 +224,13 @@ unstack(dt::AbstractDataTable, colkey, value) = function unstack(dt::AbstractDataTable, colkey::Int, value::Int) # group on anything not a key or value: - g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]])) + g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]]), sort=true) groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] rowkey = zeros(Int, size(dt, 1)) for i in 1:length(groupidxs) rowkey[groupidxs[i]] = i end keycol = NullableCategoricalArray(dt[colkey]) - levels!(keycol, 
unique(dt[colkey])) valuecol = dt[value] dt1 = nullify!(dt[g.idx[g.starts], g.cols]) Nrow = length(g) diff --git a/test/datatable.jl b/test/datatable.jl index 4adaf24..b2ee0aa 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -292,25 +292,23 @@ module TestDataTable Value = ["12 g", "Red", "18 g", "Grey"]) # Check that reordering levels does not confuse unstack levels!(dtA[1], ["XXX", "Bob", "Batman"]) - # should all be the same, just different column types + # should all return the same output, just different column types dt2A = unstack(dtA, :Fish, :Key, :Value) dt3A = unstack(dtA, :Key, :Value) - #The expected output dt4A = DataTable(Fish = NullableCategoricalArray(["Bob", "Batman"]), - Mass = NullableArray(["12 g", "18 g"]), - Color = NullableArray(["Red", "Grey"])) - @test dt2A == dt3A == dt4A + Color = NullableArray(["Red", "Grey"]), + Mass = NullableArray(["12 g", "18 g"])) + @test dt2A[[2, 3], :] == dt3A == dt4A dtB = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), Key = CategoricalArray(["Mass", "Color", "Mass", "Color"]), Value = CategoricalArray(["12 g", "Red", "18 g", "Grey"])) dt2B = unstack(dtB, :Fish, :Key, :Value) dt3B = unstack(dtB, :Key, :Value) - # fixme, these are all being reordered by NullableCategoricalArray constructor dt4B = DataTable(Fish = NullableCategoricalArray(["Batman", "Bob"]), Color = NullableCategoricalArray(["Grey", "Red"]), Mass = NullableCategoricalArray(["18 g", "12 g"])) - @test dt2B == dt3B[[2,1], :] == dt4B + @test dt2B == dt3B == dt4B # test multiple entries in unstack error dt = DataTable(id=[1, 2, 1, 2], variable=["a", "b", "a", "b"], value=[3, 4, 5, 6]) From 020c88ed0b116f293f21970ed998ef9556c4ee11 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 24 Mar 2017 12:36:13 -0700 Subject: [PATCH 43/43] only use "and" when joining the last estring --- src/abstractdatatable/abstractdatatable.jl | 3 ++- test/cat.jl | 14 +++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git 
a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 88fce9f..7879014 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -761,7 +761,8 @@ function Base.vcat(dts::AbstractDataTable...) string(matchingloci[end]) estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci" end - throw(ArgumentError(join(estrings, ", and "))) + length(estrings) == 1 ? throw(ArgumentError(estrings[1])) : + throw(ArgumentError(join(estrings[1:end-1], ", ") * ", and " * estrings[end])) else estrings = Vector{String}(length(uniqueheaders)) for (i, u) in enumerate(uniqueheaders) diff --git a/test/cat.jl b/test/cat.jl index 5f9ac7c..ba44d0a 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -171,20 +171,20 @@ module TestCat @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) err = @test_throws ArgumentError vcat(dt1, dt2, dt3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 
1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" # dt4 is a superset of names found in all other datatables and won't be shown in error dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, and column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, and 
column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" end end