Skip to content

Commit

Permalink
Consolidate DataTable constructors and remove autopromotion (#64)
Browse files Browse the repository at this point in the history
Consolidating the constructors minimized the number of places where
automatic promotion could take place. The new constructor recycles scalars:
if a DataTable is created from a mix of scalars and vectors, the scalars
are recycled to the same length as the vectors. This fixes an
outstanding bug where scalar recycling only worked if the scalar
assignments came after the vector assignments of the desired length (see
#882). Tests that
used to assume NullableArray promotion now explicitly use NullableArrays,
and new constructor tests have been added to cover the changes.
  • Loading branch information
cjprybol authored and nalimilan committed Apr 13, 2017
1 parent 32a3a25 commit 3da735b
Show file tree
Hide file tree
Showing 13 changed files with 231 additions and 243 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
language: julia
julia:
- 0.5
- nightly
- 0.6
os:
- linux
- osx
Expand Down
4 changes: 2 additions & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"

branches:
only:
Expand Down
203 changes: 81 additions & 122 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,117 +74,100 @@ type DataFrame <: AbstractDataFrame
colindex::Index

function DataFrame(columns::Vector{Any}, colindex::Index)
ncols = length(columns)
if ncols > 1
nrows = length(columns[1])
equallengths = true
for i in 2:ncols
equallengths &= length(columns[i]) == nrows
if length(columns) == length(colindex) == 0
return new(Vector{Any}(0), Index())
elseif length(columns) != length(colindex)
throw(DimensionMismatch("Number of columns ($(length(columns))) and number of column names ($(length(colindex))) are not equal"))
end
lengths = [isa(col, AbstractArray) ? length(col) : 1 for col in columns]
minlen, maxlen = extrema(lengths)
if minlen == 0 && maxlen == 0
return new(columns, colindex)
elseif minlen != maxlen || minlen == maxlen == 1
# recycle scalars
for i in 1:length(columns)
isa(columns[i], AbstractArray) && continue
columns[i] = fill(columns[i], maxlen)
lengths[i] = maxlen
end
if !equallengths
msg = "All columns in a DataFrame must be the same length"
throw(ArgumentError(msg))
uls = unique(lengths)
if length(uls) != 1
strnames = string.(names(colindex))
estrings = ["column length $u for column(s) " *
join(strnames[lengths .== u], ", ", " and ") for (i, u) in enumerate(uls)]
throw(DimensionMismatch(join(estrings, " is incompatible with ", ", and is incompatible with ")))
end
end
if length(colindex) != ncols
msg = "Columns and column index must be the same length"
throw(ArgumentError(msg))
for (i, c) in enumerate(columns)
if isa(c, Range)
columns[i] = collect(c)
elseif !isa(c, AbstractVector)
throw(DimensionMismatch("columns must be 1-dimensional"))
end
end
new(columns, colindex)
end
end

function DataFrame(; kwargs...)
result = DataFrame(Any[], Index())
for (k, v) in kwargs
result[k] = v
end
return result
colnames = Symbol[k for (k,v) in kwargs]
columns = Any[v for (k,v) in kwargs]
DataFrame(columns, Index(colnames))
end

# Build a DataFrame from any vector of columns and, optionally, their names.
# The columns are converted to a `Vector{Any}` and the names to a
# `Vector{Symbol}` wrapped in an `Index`, then handed to the
# `(Vector{Any}, Index)` constructor. When no names are supplied they are
# produced by `gennames(length(columns))`.
function DataFrame(columns::AbstractVector,
                   cnames::AbstractVector{Symbol} = gennames(length(columns)))
    colvec = convert(Vector{Any}, columns)
    colnames = convert(Vector{Symbol}, cnames)
    return DataFrame(colvec, Index(colnames))
end


# Initialize empty DataFrame objects of arbitrary size
function DataFrame(t::Type, nrows::Integer, ncols::Integer)
columns = Vector{Any}(ncols)
for i in 1:ncols
columns[i] = NullableArray(t, nrows)
end
cnames = gennames(ncols)
return DataFrame(columns, Index(cnames))
end

# Initialize an empty DataFrame with specific eltypes and names
function DataFrame(column_eltypes::Vector, cnames::Vector, nrows::Integer)
p = length(column_eltypes)
columns = Vector{Any}(p)
for j in 1:p
columns[j] = NullableArray(column_eltypes[j], nrows)
function DataFrame{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, nrows::Integer)
numcols = length(column_eltypes)
columns = Vector{Any}(numcols)
for j in 1:numcols
elty = column_eltypes[j]
if elty <: Nullable
if eltype(elty) <: CategoricalValue
columns[j] = NullableCategoricalArray{eltype(elty)}(nrows)
else
columns[j] = NullableVector{eltype(elty)}(nrows)
end
else
if elty <: CategoricalValue
columns[j] = CategoricalVector{elty}(nrows)
else
columns[j] = Vector{elty}(nrows)
end
end
end
return DataFrame(columns, Index(cnames))
return DataFrame(columns, Index(convert(Vector{Symbol}, cnames)))
end

# Initialize an empty DataFrame with specific eltypes and names
# and whether a nominal array should be created
function DataFrame(column_eltypes::Vector{DataType}, cnames::Vector{Symbol},
nominal::Vector{Bool}, nrows::Integer)
p = length(column_eltypes)
columns = Vector{Any}(p)
for j in 1:p
if nominal[j]
columns[j] = NullableCategoricalArray{column_eltypes[j]}(nrows)
else
columns[j] = NullableArray{column_eltypes[j]}(nrows)
end
end
return DataFrame(columns, Index(cnames))
end

# Initialize an empty DataFrame with specific eltypes
function DataFrame(column_eltypes::Vector, nrows::Integer)
p = length(column_eltypes)
columns = Vector{Any}(p)
cnames = gennames(p)
for j in 1:p
columns[j] = NullableArray{column_eltypes[j]}(nrows)
function DataFrame{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
nominal::Vector{Bool}, nrows::Integer)
# upcast Vector{DataType} -> Vector{Type} which can hold CategoricalValues
updated_types = convert(Vector{Type}, column_eltypes)
for i in eachindex(nominal)
nominal[i] || continue
if updated_types[i] <: Nullable
updated_types[i] = Nullable{CategoricalValue{eltype(updated_types[i])}}
else
updated_types[i] = CategoricalValue{updated_types[i]}
end
end
return DataFrame(columns, Index(cnames))
return DataFrame(updated_types, cnames, nrows)
end

# Initialize from a Vector of Associatives (aka list of dicts)
function DataFrame{D <: Associative}(ds::Vector{D})
ks = Set()
for d in ds
union!(ks, keys(d))
end
DataFrame(ds, [ks...])
# Initialize empty DataFrame objects of arbitrary size
function DataFrame(t::Type, nrows::Integer, ncols::Integer)
return DataFrame(fill(t, ncols), nrows)
end

# Initialize from a Vector of Associatives (aka list of dicts)
function DataFrame{D <: Associative}(ds::Vector{D}, ks::Vector)
#get column eltypes
col_eltypes = Type[@compat(Union{}) for _ = 1:length(ks)]
for d in ds
for (i,k) in enumerate(ks)
if haskey(d, k) && !_isnull(d[k])
col_eltypes[i] = promote_type(col_eltypes[i], typeof(d[k]))
end
end
end
col_eltypes[col_eltypes .== @compat(Union{})] = Any

# create empty DataFrame, and fill
df = DataFrame(col_eltypes, ks, length(ds))
for (i,d) in enumerate(ds)
for (j,k) in enumerate(ks)
df[i,j] = get(d, k, Nullable())
end
end

df
# Initialize an empty DataFrame with specific eltypes
function DataFrame{T<:Type}(column_eltypes::AbstractVector{T}, nrows::Integer)
return DataFrame(column_eltypes, gennames(length(column_eltypes)), nrows)
end

##############################################################################
Expand Down Expand Up @@ -363,24 +346,20 @@ function insert_multiple_entries!{T <: Real}(df::DataFrame,
end
end

upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v
upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v)
upgrade_vector(v::AbstractArray) = NullableArray(v)

# Array-valued method of `upgrade_scalar`: arrays are rejected outright, so
# this method always throws an `ArgumentError` and never returns.
function upgrade_scalar(df::DataFrame, v::AbstractArray)
    throw(ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays"))
end
function upgrade_scalar(df::DataFrame, v::Any)
n = (ncol(df) == 0) ? 1 : nrow(df)
NullableArray(fill(v, n))
fill(v, n)
end

# df[SingleColumnIndex] = AbstractVector
function Base.setindex!(df::DataFrame,
v::AbstractVector,
col_ind::ColumnIndex)
insert_single_column!(df, upgrade_vector(v), col_ind)
insert_single_column!(df, v, col_ind)
end

# df[SingleColumnIndex] = Single Item (EXPANDS TO NROW(DT) if NCOL(DT) > 0)
Expand Down Expand Up @@ -417,9 +396,8 @@ end
function Base.setindex!{T <: ColumnIndex}(df::DataFrame,
v::AbstractVector,
col_inds::AbstractVector{T})
dv = upgrade_vector(v)
for col_ind in col_inds
df[col_ind] = dv
df[col_ind] = v
end
return df
end
Expand Down Expand Up @@ -820,7 +798,7 @@ function Base.append!(df1::DataFrame, df2::AbstractDataFrame)
return df1
end

function Base.convert(::Type{DataFrame}, A::Matrix)
function Base.convert(::Type{DataFrame}, A::AbstractMatrix)
n = size(A, 2)
cols = Vector{Any}(n)
for i in 1:n
Expand All @@ -829,35 +807,16 @@ function Base.convert(::Type{DataFrame}, A::Matrix)
return DataFrame(cols, Index(gennames(n)))
end

function _dataframe_from_associative(dnames, d::Associative)
p = length(dnames)
p == 0 && return DataFrame()
columns = Vector{Any}(p)
colnames = Vector{Symbol}(p)
n = length(d[dnames[1]])
for j in 1:p
name = dnames[j]
col = d[name]
if length(col) != n
throw(ArgumentError("All columns in Dict must have the same length"))
end
columns[j] = NullableArray(col)
colnames[j] = Symbol(name)
end
return DataFrame(columns, Index(colnames))
end

function Base.convert(::Type{DataFrame}, d::Associative)
dnames = collect(keys(d))
return _dataframe_from_associative(dnames, d)
end

# A Dict is not sorted or otherwise ordered, and it's nicer to return a
# DataFrame which is ordered in some way
function Base.convert(::Type{DataFrame}, d::Dict)
dnames = collect(keys(d))
sort!(dnames)
return _dataframe_from_associative(dnames, d)
colnames = keys(d)
if isa(d, Dict)
colnames = sort!(collect(keys(d)))
else
colnames = keys(d)
end
colindex = Index(Symbol[k for k in colnames])
columns = Any[d[c] for c in colnames]
DataFrame(columns, colindex)
end


Expand Down
15 changes: 9 additions & 6 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ module TestCat
@test vcat(null_df, null_df) == DataFrame()
@test_throws ArgumentError vcat(null_df, df)
@test_throws ArgumentError vcat(df, null_df)
@test eltypes(vcat(df, df)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
@test eltypes(vcat(df, df)) == Type[Float64, Float64, Int]
@test size(vcat(df, df)) == (size(df,1)*2, size(df,2))
@test eltypes(vcat(df, df, df)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
@test eltypes(vcat(df, df, df)) == Type[Float64,Float64,Int]
@test size(vcat(df, df, df)) == (size(df,1)*3, size(df,2))

alt_df = deepcopy(df)
Expand All @@ -110,12 +110,12 @@ module TestCat
@test names(df4) == names(dfr)
@test isequal(dfr, [df4; df4])

@test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Nullable{Float64}]
@test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == [Nullable{Float64}]
@test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == Type[Float64]
@test eltypes(vcat(DataFrame(a = NullableArray(Int, 1)), DataFrame(a = [2.1]))) == Type[Nullable{Float64}]

# Minimal container type promotion
dfa = DataFrame(a = CategoricalArray([1, 2, 2]))
dfb = DataFrame(a = CategoricalArray([2, 3, 4]))
dfa = DataFrame(a = NullableCategoricalArray([1, 2, 2]))
dfb = DataFrame(a = NullableCategoricalArray([2, 3, 4]))
dfc = DataFrame(a = NullableArray([2, 3, 4]))
dfd = DataFrame(Any[2:4], [:a])
dfab = vcat(dfa, dfb)
Expand Down Expand Up @@ -249,4 +249,7 @@ module TestCat
err = @test_throws ArgumentError vcat(df1, df2, df3, df4, df1, df2, df3, df4, df1, df2, df3, df4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11"
end
x = view(DataFrame(A = NullableArray(1:3)), 2)
y = DataFrame(A = NullableArray(4:5))
@test isequal(vcat(x, y), DataFrame(A = NullableArray([2, 4, 5])))
end
Loading

0 comments on commit 3da735b

Please sign in to comment.