Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Operations for DataArray that skip NA values #354

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,13 @@ export # reconcile_groups,
class, # in the S3 sense of "class"
inherits,
read_rda,
vecbind
vecbind,

# DataArray operations
sum, prod, max, min, mean, cumsum, cumprod,

# Functors that handle NAs
NA2Zero, NA2One, NA2Min, NA2Max, NotNA

##############################################################################
##
Expand Down Expand Up @@ -252,7 +258,8 @@ include("predicates.jl")
include("indexing.jl")
include("extras.jl")
include("RDA.jl")
include("dataframe_blocks.jl")
#include("dataframe_blocks.jl")
include("dataarray_ops.jl")

# TODO: Remove these definitions
nafilter(x...) = error("Function removed. Please use removeNA")
Expand Down
134 changes: 134 additions & 0 deletions src/dataarray_ops.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@

# NA operations for DataArray using NumericExtensions

using NumericExtensions
import NumericExtensions.evaluate, NumericExtensions.result_type
import NumericExtensions.reduced_size

import Base.sum, Base.prod, Base.max, Base.min, Base.cumsum, Base.cumprod

# Type of the `dims` argument taken by the dimension-wise methods in this file:
# either a single dimension index or a pair of dimension indices.
typealias DimSpec Union(Int, (Int, Int))

# Functors that substitute an identity/sentinel value for NA entries.
#
# NA2Zero: binary functor over (value, is-NA flag). When the flag is true the
# value is replaced by the additive identity of its type; otherwise the value
# passes through unchanged.
type NA2Zero <: BinaryFunctor
end

function evaluate(::NA2Zero, x, y::Bool)
    if y
        return zero(typeof(x))
    end
    return x
end

# The result has the element type of the data argument.
function result_type{T<:Number}(::NA2Zero, ::Type{T}, ::Type{Bool})
    return T
end

# NA2One: binary functor over (value, is-NA flag). When the flag is true the
# value is replaced by the multiplicative identity of its type; otherwise the
# value passes through unchanged.
type NA2One <: BinaryFunctor
end

function evaluate(::NA2One, x, y::Bool)
    if y
        return one(typeof(x))
    end
    return x
end

# The result has the element type of the data argument.
function result_type{T<:Number}(::NA2One, ::Type{T}, ::Type{Bool})
    return T
end

# NA2Min: binary functor over (value, is-NA flag). When the flag is true the
# value is replaced by `typemin` of its type; otherwise the value passes
# through unchanged.
type NA2Min <: BinaryFunctor
end

function evaluate(::NA2Min, x, y::Bool)
    if y
        return typemin(typeof(x))
    end
    return x
end

# The result has the element type of the data argument.
function result_type{T<:Number}(::NA2Min, ::Type{T}, ::Type{Bool})
    return T
end

# NA2Max: binary functor over (value, is-NA flag). When the flag is true the
# value is replaced by `typemax` of its type; otherwise the value passes
# through unchanged.
type NA2Max <: BinaryFunctor
end

function evaluate(::NA2Max, x, y::Bool)
    if y
        return typemax(typeof(x))
    end
    return x
end

# The result has the element type of the data argument.
function result_type{T<:Number}(::NA2Max, ::Type{T}, ::Type{Bool})
    return T
end


# NotNA: unary functor over an is-NA flag that returns true when the entry is
# NOT NA. Summing its results gives the number of non-NA entries.
type NotNA <: UnaryFunctor
end

function evaluate(::NotNA, y::Bool)
    return !y
end

function result_type(::NotNA, ::Type{Bool})
    return Bool
end

# Convert the argument (e.g. a BitArray NA mask) to a dense `Array{Bool}` of
# the same dimensionality.
# Can make this a noop when NumericExtensions handles BitArrays.
#
# The argument expression is escaped so that it is resolved in the caller's
# scope (it is typically a local such as `da.na`), and it is bound to a local
# first so that it is evaluated exactly once.
macro BA(x)
    quote
        let mask = $(esc(x))
            convert(Array{Bool,ndims(mask)}, mask)
        end
    end
end

# TODO: Make handling NA depend on a flag
# TODO: Implement median, std, var, mad, norm, skewness, kurtosis
# TODO: Implement inplace versions of functions that don't need to allocate
# space for result.
# Think about median: it is tricky because the NAs actually have to be removed,
# and it is not clear that this is possible with mapreduce.
#

# These are placeholders for now b/c NumericExtensions doesn't work with
# BitArrays.

# Entire Array
# Sum over every entry of `da`, with NA entries contributing zero (via
# NA2Zero). An empty array sums to `zero(T)`.
function sum{T<:Number}(da::DataArray{T})
    if isempty(da)
        return zero(T)
    end
    return mapreduce(NA2Zero(), Add(), da.data, @BA(da.na))
end
# Product over every entry of `da`, with NA entries contributing one (via
# NA2One). An empty array has product `one(T)`.
function prod{T<:Number}(da::DataArray{T})
    if isempty(da)
        return one(T)
    end
    return mapreduce(NA2One(), Multiply(), da.data, @BA(da.na))
end
# Maximum over every entry of `da`. NA entries are replaced by `typemin(T)`
# (via NA2Min) before reducing with Max.
# Throws ArgumentError when `da` is empty.
function max{T<:Number}(da::DataArray{T})
    if isempty(da)
        # Message matches the one used by the dimension-wise method.
        throw(ArgumentError("Empty array not allowed"))
    end
    return mapreduce(NA2Min(), Max(), da.data, @BA(da.na))
end
# Minimum over every entry of `da`. NA entries are replaced by `typemax(T)`
# (via NA2Max) before reducing with Min.
# Throws ArgumentError when `da` is empty.
function min{T<:Number}(da::DataArray{T})
    if isempty(da)
        # Message matches the one used by the dimension-wise method.
        throw(ArgumentError("Empty array not allowed"))
    end
    return mapreduce(NA2Max(), Min(), da.data, @BA(da.na))
end

# Reduce along dimensions
# Sum of `da` along `dims`, with NA entries contributing zero (via NA2Zero).
# For an empty array the result is an array of zeros of the reduced size.
function sum{T<:Number}(da::DataArray{T}, dims::DimSpec)
    if isempty(da)
        return zeros(T, reduced_size(size(da), dims))
    end
    return mapreduce(NA2Zero(), Add(), da.data, @BA(da.na), dims)
end
# Product of `da` along `dims`, with NA entries contributing one (via NA2One).
# For an empty array the result is an array of ones of the reduced size.
function prod{T<:Number}(da::DataArray{T}, dims::DimSpec)
    if isempty(da)
        return ones(T, reduced_size(size(da), dims))
    end
    return mapreduce(NA2One(), Multiply(), da.data, @BA(da.na), dims)
end
# Maximum of `da` along `dims`. NA entries are replaced by `typemin(T)` (via
# NA2Min) before reducing with Max.
# Throws ArgumentError when `da` is empty.
function max{T<:Number}(da::DataArray{T}, dims::DimSpec)
    if isempty(da)
        # Was misspelled `ArguemntError`, which raised an undefined-name error
        # instead of the intended ArgumentError.
        throw(ArgumentError("Empty array not allowed"))
    end
    return mapreduce(NA2Min(), Max(), da.data, @BA(da.na), dims)
end
# Minimum of `da` along `dims`. NA entries are replaced by `typemax(T)` (via
# NA2Max) before reducing with Min.
# Throws ArgumentError when `da` is empty.
function min{T<:Number}(da::DataArray{T}, dims::DimSpec)
    if isempty(da)
        # Was misspelled `ArguementError`, which raised an undefined-name error
        # instead of the intended ArgumentError.
        throw(ArgumentError("Empty array not allowed"))
    end
    return mapreduce(NA2Max(), Min(), da.data, @BA(da.na), dims)
end

# Mean over every entry of `da`, skipping NA entries: the NA-as-zero sum
# divided by the number of non-NA entries. An empty array yields `zero(T)`.
# NOTE(review): when every entry is NA the non-NA count is zero, so this
# divides by zero -- confirm the intended result for that case.
function mean{T<:Number}(da::DataArray{T})
    isempty(da) && return zero(T)
    mask = @BA(da.na)
    total = mapreduce(NA2Zero(), Add(), da.data, mask)
    count = mapreduce(NotNA(), Add(), mask)
    return total ./ count
end
# Mean of `da` along `dims`, skipping NA entries: the NA-as-zero sums divided
# elementwise by the per-slice counts of non-NA entries. An empty array yields
# an array of zeros of the reduced size.
function mean{T<:Number}(da::DataArray{T}, dims::DimSpec)
    isempty(da) && return zeros(T, reduced_size(size(da), dims))
    mask = @BA(da.na)
    totals = mapreduce(NA2Zero(), Add(), da.data, mask, dims)
    counts = mapreduce(NotNA(), Add(), mask, dims)
    return map(Divide(), totals, counts)
end


# Dimensionless version only defined for vectors.
#
# Cumulative sum of a DataArray vector in which NA entries contribute zero to
# the running total (via NA2Zero). The result is a DataArray of the same size
# whose NA mask marks the same positions as the input's.
#
# An empty input yields an empty DataArray (previously a scalar `zero(T)`), so
# the return type does not depend on the input length. The mask is copied so
# that the result does not share its NA mask with `da`.
function cumsum{T<:Number}(da::DataArray{T,1})
    c = DataArray(Array(T, size(da.data)), copy(da.na))
    if !isempty(da)
        mapscan!(c.data, NA2Zero(), Add(), da.data, @BA(da.na))
    end
    return c
end
# Cumulative sum of `da` along `dims`, in which NA entries contribute zero to
# the running total (via NA2Zero). The result is a DataArray of the same size
# as `da` whose NA mask marks the same positions as the input's.
#
# An empty input yields an empty DataArray of the same size (previously a plain
# array of zeros of the reduced size), since a cumulative operation preserves
# the shape of its input. The mask is copied so that the result does not share
# its NA mask with `da`.
function cumsum{T<:Number}(da::DataArray{T}, dims::DimSpec)
    c = DataArray(Array(T, size(da.data)), copy(da.na))
    if !isempty(da)
        mapscan!(c.data, NA2Zero(), Add(), da.data, @BA(da.na), dims)
    end
    return c
end

# Dimensionless version only defined for vectors.
#
# Cumulative product of a DataArray vector in which NA entries contribute one
# to the running product (via NA2One). The result is a DataArray of the same
# size whose NA mask marks the same positions as the input's.
#
# An empty input yields an empty DataArray (previously a scalar `one(T)`), so
# the return type does not depend on the input length. The mask is copied so
# that the result does not share its NA mask with `da`.
function cumprod{T<:Number}(da::DataArray{T,1})
    c = DataArray(Array(T, size(da.data)), copy(da.na))
    if !isempty(da)
        mapscan!(c.data, NA2One(), Multiply(), da.data, @BA(da.na))
    end
    return c
end
# Cumulative product of `da` along `dims`, in which NA entries contribute one
# to the running product (via NA2One). The result is a DataArray of the same
# size as `da` whose NA mask marks the same positions as the input's.
#
# An empty input yields an empty DataArray of the same size (previously a plain
# array of ones of the reduced size), since a cumulative operation preserves
# the shape of its input. The mask is copied so that the result does not share
# its NA mask with `da`.
function cumprod{T<:Number}(da::DataArray{T}, dims::DimSpec)
    c = DataArray(Array(T, size(da.data)), copy(da.na))
    if !isempty(da)
        mapscan!(c.data, NA2One(), Multiply(), da.data, @BA(da.na), dims)
    end
    return c
end


# Basically just copy the var, std, etc. functions from Numeric Extensions as
# they're hand-coded there.



# Inplace versions
6 changes: 4 additions & 2 deletions src/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ array_arithmetic_operators = [:(+), :(.+), :(-), :(.-), :(.*), :(.^)]

bit_operators = [:(&), :(|), :($)]

unary_vector_operators = [:min, :max, :prod, :sum, :mean, :median, :std,
unary_vector_operators = [#:min, :max, :prod, :sum, :mean,
:median, :std,
:var, :mad, :norm, :skewness, :kurtosis]

# TODO: dist, iqr, rle, inverse_rle

pairwise_vector_operators = [:diff, :reldiff, :percent_change]

cumulative_vector_operators = [:cumprod, :cumsum, :cumsum_kbn, :cummin, :cummax]
cumulative_vector_operators = [#:cumprod, :cumsum,
:cumsum_kbn, :cummin, :cummax]

ffts = [:fft]

Expand Down