From dff0bc64403d3e8cd343bcb11f593a05e2aec62e Mon Sep 17 00:00:00 2001
From: tshort <tshort.rlists@gmail.com>
Date: Sat, 11 Jan 2014 10:37:49 -0500
Subject: [PATCH] Add an AbstractDataFrame that is a composite type with
 columns as type members.

---
 src/DataFrames.jl  |  5 ++-
 src/cdataframe.jl  | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 test/cdataframe.jl | 40 +++++++++++++++++++
 3 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 src/cdataframe.jl
 create mode 100644 test/cdataframe.jl

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 81893e4c2b..01c129471f 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -179,7 +179,9 @@ export # reconcile_groups,
        class,                              # in the S3 sense of "class"
        inherits,
        read_rda,
-       vecbind
+       vecbind,
+       cdataframe,
+       CDataFrame
 
 ##############################################################################
 ##
@@ -191,6 +193,7 @@ include("utils.jl")
 include("index.jl")
 include("namedarray.jl")
 include("dataframe.jl")
+include("cdataframe.jl")
 include("show.jl")
 include("merge.jl")
 include("grouping.jl")
diff --git a/src/cdataframe.jl b/src/cdataframe.jl
new file mode 100644
index 0000000000..9e8e245373
--- /dev/null
+++ b/src/cdataframe.jl
@@ -0,0 +1,97 @@
+abstract CDataFrame <: AbstractDataFrame  # common supertype of the concrete types generated by cdataframe()
+
+expr_typeof(d::DataType) = :($(d.name))  # when given a type itself: quote its `name` field only (type parameters are not included)
+
+symb(x::Type) = x.name.name  # a type parameter that is itself a type: keep only its bare name
+symb(x::Int) = x  # an integer type parameter is passed through unchanged
+symb(x) = symbol(x)  # anything else is converted with symbol()
+
+function expr_typeof(d)
+    # Expression `Name{P1,P2,...}` for typeof(d), with each type parameter
+    # mapped through symb; cdataframe uses it to declare generated fields.
+    params = map(symb, typeof(d).parameters)
+    return Expr(:curly, typeof(d).name.name, params...)
+end
+
+
+function cdataframe(df::AbstractDataFrame)
+    # Define a new concrete subtype of CDataFrame with one typed field per
+    # column of `df`, then return an instance of it holding those columns.
+    tname = symbol("CDataFrame" * string(rand(Uint16)))  # random numeric suffix
+    typedef = quote
+        type $(tname) <: CDataFrame
+        end
+    end
+    cols = colnames(df)
+    # One `name::Type` declaration per column, in column order.
+    fields = Any[:($(symbol(cols[i]))::$(expr_typeof(df[:,i]))) for i in 1:ncol(df)]
+    # The quote above only supplies the header; this swaps the contents of the
+    # empty type body (typedef.args[2].args[3]) for the generated declarations.
+    typedef.args[2].args[3].args = fields
+    eval(typedef)
+    # Fetch the freshly defined type object by name.
+    T = eval(tname)
+    # Pass the columns, looked up by name, to the new type's constructor.
+    columns = Any[df[c] for c in cols]
+    T(columns...)
+end
+
+# Keyword arguments do not take part in dispatch, so the no-keyword case lives in
+# this same method: without keywords `df` is returned as-is, otherwise the keywords are column-bound as a DataFrame.
+cdataframe(df::CDataFrame; kwargs...) = isempty(kwargs) ? df : cdataframe(cbind(DataFrame(df), DataFrame(; kwargs...)))
+
+DataFrame(df::CDataFrame) = DataFrame(Any[df[i] for i in 1:ncol(df)], colnames(df))  # convert back: columns collected by position, paired with the column names
+
+colnames(df::CDataFrame) = [string(x)::ByteString for x in names(typeof(df))]  # field names of the concrete type, as strings
+colsymbols(df::CDataFrame) = [x::Symbol for x in names(typeof(df))]  # the same field names, as symbols
+
+nrow(df::CDataFrame) = ncol(df) == 0 ? 0 : length(getfield(df, names(typeof(df))[1]))  # length of the first column; 0 when there are no columns
+ncol(df::CDataFrame) = length(typeof(df).types)  # one field per column
+
+index(df::CDataFrame) = Index(colnames(df))  # a new Index is built from the column names on every call
+
+# df[i] with a numeric column position: find the i-th field name via
+# colsymbols, then return that field.
+Base.getindex(df::CDataFrame, col_ind::Real) = getfield(df, colsymbols(df)[col_ind])
+
+# df["name"]: column lookup by string name; the string is converted to a
+# symbol and used as the field name.
+Base.getindex(df::CDataFrame, col_ind::String) = getfield(df, symbol(col_ind))
+
+# df[:name]: column lookup by symbol; the symbol is used directly as the
+# field name.
+Base.getindex(df::CDataFrame, col_ind::Symbol) = getfield(df, col_ind)
+
+# df[MultiColumnIndex] => CDataFrame. The result is built with cdataframe():
+# CDataFrame itself is abstract and no constructor is defined for it in this file.
+Base.getindex{T <: ColumnIndex}(df::CDataFrame, col_inds::AbstractVector{T}) = cdataframe(DataFrame(df)[col_inds])
+
+# df[SingleRowIndex, SingleColumnIndex] => one element: fetch the column,
+# then index it by row.
+Base.getindex(df::CDataFrame, row_ind::Real, col_ind::ColumnIndex) = df[col_ind][row_ind]
+
+# df[SingleRowIndex, MultiColumnIndex] => CDataFrame (via cdataframe)
+# The row/column selection itself is done on a DataFrame conversion of `df`.
+Base.getindex{T <: ColumnIndex}(df::CDataFrame, row_ind::Real, col_inds::AbstractVector{T}) =
+    cdataframe(DataFrame(df)[row_ind, col_inds])
+
+# df[MultiRowIndex, SingleColumnIndex] => the selected rows of one column
+# (whatever indexing that column with `row_inds` produces).
+Base.getindex{T <: Real}(df::CDataFrame, row_inds::AbstractVector{T}, col_ind::ColumnIndex) =
+    df[col_ind][row_inds]
+
+# df[MultiRowIndex, MultiColumnIndex] => CDataFrame (via cdataframe)
+# The row/column selection itself is done on a DataFrame conversion of `df`.
+Base.getindex{R <: Real, T <: ColumnIndex}(df::CDataFrame, row_inds::AbstractVector{R}, col_inds::AbstractVector{T}) =
+    cdataframe(DataFrame(df)[row_inds, col_inds])
+
+# hcat of two CDataFrames: both are converted to DataFrame, concatenated
+# there, and the result is passed to cdataframe().
+Base.hcat(df1::CDataFrame, df2::CDataFrame) =
+    cdataframe(hcat(DataFrame(df1), DataFrame(df2)))
+
+# hcat of a CDataFrame with any other value `x`: both sides go through the
+# DataFrame constructor, are concatenated, and the result is passed to cdataframe().
+Base.hcat(df::CDataFrame, x) = cdataframe(hcat(DataFrame(df), DataFrame(x)))
+
+Base.similar(df::CDataFrame, dims) = cdataframe(similar(DataFrame(df), dims))  # delegate to similar() on the DataFrame conversion, then wrap the result with cdataframe()
diff --git a/test/cdataframe.jl b/test/cdataframe.jl
new file mode 100644
index 0000000000..4ec38cab99
--- /dev/null
+++ b/test/cdataframe.jl
@@ -0,0 +1,40 @@
+using Base.Test
+using DataFrames
+
+let
+    N = 5000000
+    x1, x2 = rand(N), rand(N)
+    df = DataFrame({x1, x2})
+    cdf = cdataframe(df)
+
+    function test_sum_1(d)  # element access through d[row, "column name"]
+        res = 0.0
+        for i = 1:nrow(d)
+            res += d[i,"x1"] * d[i,"x2"]
+        end
+        res
+    end
+
+    function test_sum_2(d)  # element access through the generated fields d.x1 / d.x2
+        res = 0.0
+        for i = 1:nrow(d)
+            res += d.x1[i] * d.x2[i]
+        end
+        res
+    end
+
+    function test_sum_3(x1,x2)  # baseline: the raw vectors, no data frame involved
+        res = 0.0
+        for i = 1:length(x1)
+            res += x1[i] * x2[i]
+        end
+        res
+    end
+
+    @time test_sum_1(df)
+    s_cdf = @time test_sum_1(cdf)
+    s_fld = @time test_sum_2(cdf)
+    s_vec = @time test_sum_3(x1, x2)
+    @test_approx_eq s_cdf s_vec  # both CDataFrame access paths must reproduce the raw-vector sum
+    @test_approx_eq s_fld s_vec
+end