From dff0bc64403d3e8cd343bcb11f593a05e2aec62e Mon Sep 17 00:00:00 2001
From: tshort
Date: Sat, 11 Jan 2014 10:37:49 -0500
Subject: [PATCH] Add an AbstractDataFrame that is a composite type with columns as type members.

---
 src/DataFrames.jl  |   5 ++-
 src/cdataframe.jl  | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 test/cdataframe.jl |  40 +++++++++++++++++++
 3 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 src/cdataframe.jl
 create mode 100644 test/cdataframe.jl

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 81893e4c2b..01c129471f 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -179,7 +179,9 @@ export # reconcile_groups,
        class, # in the S3 sense of "class"
        inherits,
        read_rda,
-       vecbind
+       vecbind,
+       cdataframe,
+       CDataFrame
 
 ##############################################################################
 ##
@@ -191,6 +193,7 @@ include("utils.jl")
 include("index.jl")
 include("namedarray.jl")
 include("dataframe.jl")
+include("cdataframe.jl")
 include("show.jl")
 include("merge.jl")
 include("grouping.jl")
diff --git a/src/cdataframe.jl b/src/cdataframe.jl
new file mode 100644
index 0000000000..9e8e245373
--- /dev/null
+++ b/src/cdataframe.jl
@@ -0,0 +1,119 @@
+# CDataFrame: an AbstractDataFrame whose columns are the fields of a composite
+# type that is generated at run time, so that `df.colname` is a field access.
+abstract CDataFrame <: AbstractDataFrame
+
+expr_typeof(d::DataType) = :($(d.name))
+
+# Convert a type parameter into something that can be spliced into a type
+# expression: types become their name, integers stay as-is, anything else is
+# turned into a symbol.
+symb(x::Type) = x.name.name
+symb(x::Int) = x
+symb(x) = symbol(x)
+
+function expr_typeof(d)
+    # return an expression representing the type of d
+    # This is used to build up the member list in the CDataFrame type
+    t = typeof(d)
+    return :($(t.name.name){$(map(symb, t.parameters)...)})
+end
+
+
+# Build a new concrete subtype of CDataFrame with one typed field per column of
+# `df`, evaluate the type definition, and return an instance holding the columns.
+function cdataframe(df::AbstractDataFrame)
+    # t = symbol("CDataFrame" * string(gensym()))
+    # NOTE(review): only 65536 distinct names are possible here, so two calls can
+    # pick the same type name; confirm whether that is acceptable.
+    t = symbol("CDataFrame" * string(rand(Uint16)))
+    typedef = quote
+        type $(t) <: CDataFrame
+        end
+    end
+    type_exprs = Any[]
+    for i in 1:ncol(df)
+        push!(type_exprs, :($(symbol(colnames(df)[i]))::$(expr_typeof(df[:,i]))))
+    end
+    # Splice the field declarations into the body of the quoted type definition.
+    typedef.args[2].args[3].args = type_exprs
+    # typedef.args[2].args[3].args = Any[symbol(x) for x in colnames(df)]
+    eval(typedef)
+    T = eval(:($t))
+    a = Any[]
+    for colname in colnames(df)
+        push!(a, df[colname])
+    end
+    T(a...)
+end
+
+# A CDataFrame passed without keyword arguments is returned unchanged; keyword
+# arguments are appended as extra columns. This is a single method because
+# keyword arguments do not take part in dispatch: a separate
+# `cdataframe(df::CDataFrame) = df` would be replaced by the keyword method.
+function cdataframe(df::CDataFrame; kwargs...)
+    isempty(kwargs) && return df
+    cdataframe(cbind(DataFrame(df), DataFrame(; kwargs...)))
+end
+
+DataFrame(df::CDataFrame) = DataFrame(Any[df[i] for i in 1:ncol(df)], colnames(df))
+
+# Column names come from the field names of the generated type.
+colnames(df::CDataFrame) = [string(x)::ByteString for x in names(typeof(df))]
+colsymbols(df::CDataFrame) = [x::Symbol for x in names(typeof(df))]
+
+# The row count is the length of the first column (0 when there are no columns).
+nrow(df::CDataFrame) = ncol(df) > 0 ? length(getfield(df, names(typeof(df))[1])) : 0
+ncol(df::CDataFrame) = length(typeof(df).types)
+
+index(df::CDataFrame) = Index(colnames(df))
+
+# df[SingleColumnIndex] => the column stored in the matching field
+function Base.getindex(df::CDataFrame, col_ind::Real)
+    getfield(df, colsymbols(df)[col_ind])
+end
+
+function Base.getindex(df::CDataFrame, col_ind::String)
+    getfield(df, symbol(col_ind))
+end
+
+function Base.getindex(df::CDataFrame, col_ind::Symbol)
+    getfield(df, col_ind)
+end
+
+# df[MultiColumnIndex] => CDataFrame
+# CDataFrame is abstract and this file defines no constructor for it, so the
+# result is built with cdataframe(), like the other multi-column methods below.
+function Base.getindex{T <: ColumnIndex}(df::CDataFrame, col_inds::AbstractVector{T})
+    cdataframe(DataFrame(df)[col_inds])
+end
+
+# df[SingleRowIndex, SingleColumnIndex] => element of the column
+function Base.getindex(df::CDataFrame, row_ind::Real, col_ind::ColumnIndex)
+    df[col_ind][row_ind]
+end
+
+# df[SingleRowIndex, MultiColumnIndex] => (Sub)?DataFrame
+function Base.getindex{T <: ColumnIndex}(df::CDataFrame, row_ind::Real, col_inds::AbstractVector{T})
+    cdataframe(DataFrame(df)[row_ind, col_inds])
+end
+
+# df[MultiRowIndex, SingleColumnIndex] => (Sub)?AbstractDataVector
+function Base.getindex{T <: Real}(df::CDataFrame, row_inds::AbstractVector{T}, col_ind::ColumnIndex)
+    df[col_ind][row_inds]
+end
+
+# df[MultiRowIndex, MultiColumnIndex] => (Sub)?DataFrame
+function Base.getindex{R <: Real, T <: ColumnIndex}(df::CDataFrame, row_inds::AbstractVector{R}, col_inds::AbstractVector{T})
+    cdataframe(DataFrame(df)[row_inds, col_inds])
+end
+
+# two-argument form, two dfs, references only
+function Base.hcat(df1::CDataFrame, df2::CDataFrame)
+    cdataframe(hcat(DataFrame(df1), DataFrame(df2)))
+end
+
+function Base.hcat(df::CDataFrame, x)
+    cdataframe(hcat(DataFrame(df), DataFrame(x)))
+end
+
+Base.similar(df::CDataFrame, dims) = cdataframe(similar(DataFrame(df), dims))
diff --git a/test/cdataframe.jl b/test/cdataframe.jl
new file mode 100644
index 0000000000..4ec38cab99
--- /dev/null
+++ b/test/cdataframe.jl
@@ -0,0 +1,40 @@
+using Base.Test
+using DataFrames
+
+let
+    N = 5000000
+    x1 = rand(N)
+    x2 = rand(N)
+    df = DataFrame({x1, x2})
+    cdf = cdataframe(df)
+
+    function test_sum_1(d)
+        res = 0.0
+        for i = 1:nrow(d)
+            res += d[i,"x1"] * d[i,"x2"]
+        end
+        res
+    end
+
+    function test_sum_2(d)
+        res = 0.0
+        for i = 1:nrow(d)
+            res += d.x1[i] * d.x2[i]
+        end
+        res
+    end
+
+    function test_sum_3(x1,x2)
+        res = 0.0
+        for i = 1:length(x1)
+            res += x1[i] * x2[i]
+        end
+        res
+    end
+
+    @time test_sum_1(df)
+    @time test_sum_1(cdf)
+    @time test_sum_2(cdf)
+    @time test_sum_3(x1, x2)
+
+end