Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Added generalized sorting routines to DataFrames. #177

Merged
merged 1 commit into from
Feb 8, 2013
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 89 additions & 4 deletions src/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1730,10 +1730,6 @@ function isfinite(df::DataFrame)
return DataFrame(res_columns, colnames(df))
end

# Return the rows of `df` reordered by the values in column `colname`.
# The row order is whatever `order` returns for that column.
function sortby(df::DataFrame, colname::String)
    row_order = order(df[colname])
    return df[row_order, :]
end

# TODO: Use cor_pearson and cov_pearson for DataMatrix to do this
function cor_pearson(df::DataFrame)
numeric_cols = find(map(t -> t <: Number, coltypes(df)))
Expand All @@ -1759,6 +1755,95 @@ function flipud!(df::DataFrame)
return
end


##############################################################################
## Sorting
##############################################################################

import Sort.sort, Sort.sortby, Sort.By,
Sort.sort!, Sort.sortby!,
Sort.Algorithm, Sort.Ordering,
Sort.lt, Sort.Perm, Sort.Forward

# Vector of column selectors accepted by the multi-column `sortby`/`sortby!` methods:
# integers, ASCII/UTF-8 strings, or symbols.
# The integer case is parameterized on `T<:Integer` because parametric types are
# invariant: `AbstractVector{Integer}` matches only vectors whose element type is
# exactly `Integer`, so an ordinary `[1, 2]` (a `Vector{Int}`) would not dispatch.
typealias ColIndexVec{T<:Integer} Union(AbstractVector{T}, AbstractVector{ASCIIString}, AbstractVector{UTF8String}, AbstractVector{Symbol})

# Algorithm used by the DataFrame `sort!`, `sort` and `sortperm` convenience
# methods when the caller does not pass an `Algorithm` explicitly.
const DF_STABLE_SORT = Sort.TimSort()

# Permute indices according to the ordering of the given dataframe columns
#
# An `Ordering` whose `lt` method compares two row indices of `df` by walking
# the columns left to right and applying the matching entry of `ords`.
type DFPerm{O<:Ordering,DF<:AbstractDataFrame} <: Ordering
# Per-column orderings; `ords[i]` is used to compare values taken from column `i`.
ords::AbstractVector{O}
# Data frame whose rows are being compared.
df::DF
end

# Build a `DFPerm` from a vector of per-column orderings.
#
# Arguments:
#   o  - vector of orderings; `o[i]` is applied to column `i` of `df`.
#   df - data frame whose rows are to be compared.
#
# When `o` has fewer entries than `df` has columns, the remaining columns are
# ordered with `Sort.Forward()`. Raises an error when `o` has more entries than
# `df` has columns.
function DFPerm{O<:Ordering,DF<:AbstractDataFrame}(o::AbstractVector{O}, df::DF)
    o_cols = length(o)
    df_cols = ncol(df)
    if o_cols > df_cols
        error("DFPerm: number of column orderings is greater than the number of columns")
    end
    if o_cols < df_cols
        # Padding with Forward() can mix ordering types, so the padded vector is
        # typed as `Ordering` rather than `O`.
        padded = Ordering[i <= o_cols ? o[i] : Sort.Forward() for i = 1:df_cols]
        return DFPerm{Ordering,DF}(padded, df)
    end
    return DFPerm{O,DF}(o, df)
end

# Use the single ordering `o` for every column of `df`.
function DFPerm{O<:Ordering,DF<:AbstractDataFrame}(o::O, df::DF)
    per_column = fill(o, ncol(df))
    return DFPerm{O,DF}(per_column, df)
end

# With no ordering given, every column of `df` is ordered with `Sort.Forward()`.
function DFPerm{DF<:AbstractDataFrame}(df::DF)
    return DFPerm(Sort.Forward(), df)
end

# Lexicographic "less than" on two row indices `a` and `b` of `o.df`.
# Columns are examined left to right with their own ordering; the first column
# in which the two rows differ decides the result. Rows that compare equal in
# every column are not less than one another.
function lt(o::DFPerm, a, b)
    frame = o.df
    for col = 1:ncol(frame)
        ord = o.ords[col]
        left = frame[a, col]
        right = frame[b, col]
        if lt(ord, left, right)
            return true
        elseif lt(ord, right, left)
            return false
        end
    end
    return false
end

# Sort the row indices 1:nrow(df) under an index-based ordering (Perm or DFPerm).
# TODO: move [1:nrow(df)] first if/when julia pull #2179 is applied
function sortperm(df::AbstractDataFrame, a::Algorithm, o::Union(Perm,DFPerm))
    row_indices = [1:nrow(df)]
    return sort!(a, o, row_indices)
end

# Any other ordering is first wrapped in a DFPerm covering all columns of `df`.
function sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering)
    row_ordering = DFPerm(o, df)
    return sortperm(df, a, row_ordering)
end

# Return the rows of `df` indexed by the permutation computed by `sortperm`.
function sort(df::AbstractDataFrame, a::Algorithm, o::Ordering)
    row_order = sortperm(df, a, o)
    return df[row_order, :]
end

# Reorder every column of `df` in place according to the permutation computed by
# `sortperm(df, a, o)`, and return `df`.
function sort!(df::AbstractDataFrame, a::Algorithm, o::Ordering)
    perm = sortperm(df, a, o)
    scratch = similar(perm)
    for column in df.columns
        # A fresh copy of the permutation is handed to permute!! for each column;
        # presumably permute!! overwrites its permutation argument - confirm.
        copy!(scratch, perm)
        permute!!(column, scratch)
    end
    return df
end

# Generate convenience methods for `sort!`, `sort` and `sortperm` on data frames.
# Each one fills in the missing algorithm and/or ordering and forwards to the
# three-argument (df, algorithm, ordering) method of the same function.
for s in {:sort!, :sort, :sortperm}
@eval begin
# Ordering passed as a type: instantiate it and use DF_STABLE_SORT.
$s{O<:Ordering}(df::AbstractDataFrame, ::Type{O}) = $s(df, DF_STABLE_SORT, O())
# Ordering passed as an instance: use DF_STABLE_SORT.
$s (df::AbstractDataFrame, o::Ordering) = $s(df, DF_STABLE_SORT, o)
# No ordering passed: use Sort.Forward().
$s (df::AbstractDataFrame ) = $s(df, Sort.Forward())
end
end

# Generate the `sortby!` and `sortby` methods. Each `sortby*` method builds an
# ordering from its arguments and forwards to the matching `sort!` / `sort`.
for (sb,s) in {(:sortby!, :sort!), (:sortby, :sort)}
@eval begin
# Sort using a function: `by` is wrapped in a `By` ordering.
$sb(df::AbstractDataFrame, by::Function) = $s(df,By(by))

# Sort by a single column. The ordering may be a type (instantiated here), an
# instance, or omitted (Sort.Forward()); it is wrapped in `Perm` together with
# the selected column `df[col]`.
$sb{O<:Ordering}(df::AbstractDataFrame, col::ColumnIndex, ::Type{O}) = $s(df,Perm(O(),df[col]))
$sb (df::AbstractDataFrame, col::ColumnIndex, o::Ordering) = $s(df,Perm(o,df[col]))
$sb (df::AbstractDataFrame, col::ColumnIndex) = $sb(df,col,Sort.Forward())

# Sort by several columns sharing one ordering. The ordering may be a type,
# an instance, or omitted (Sort.Forward()); a DFPerm is built over `df[cols]`.
$sb{O<:Ordering}(df::AbstractDataFrame, cols::ColIndexVec, ::Type{O}) = $s(df,DFPerm(O(),df[cols]))
$sb (df::AbstractDataFrame, cols::ColIndexVec, o::Ordering) = $s(df,DFPerm(o, df[cols]))
$sb (df::AbstractDataFrame, cols::ColIndexVec) = $sb(df,cols,Sort.Forward())

# Sort by several columns with an array of orderings `o`.
# - array of Ordering instances: passed to DFPerm as is;
# - array typed CompositeKind: each entry is called with no arguments and the
#   results are collected into an Ordering array;
# - any other array: re-collected into a CompositeKind array and re-dispatched.
# NOTE(review): `AbstractArray{CompositeKind}` presumably matches only arrays
# whose element type is exactly CompositeKind - the generic method below looks
# like it exists to perform that conversion; confirm.
$sb{O<:Ordering}(df::AbstractDataFrame, cols::ColIndexVec, o::AbstractArray{O}) = $s(df,DFPerm(o, df[cols]))
$sb (df::AbstractDataFrame, cols::ColIndexVec, o::AbstractArray{CompositeKind}) = $s(df,DFPerm(Ordering[O() for O in o], df[cols]))
$sb (df::AbstractDataFrame, cols::ColIndexVec, o::AbstractArray) = $sb(df,cols,CompositeKind[ot for ot in o])
# Array of tuples: `zip(col_ord...)` splits it into a `cols` part and an `o`
# part, which are forwarded as two arrays.
# NOTE(review): `AbstractArray{Tuple}` presumably matches only arrays whose
# element type is exactly `Tuple`; an array literal of concrete tuples may not
# dispatch here - confirm against callers.
$sb (df::AbstractDataFrame, col_ord::AbstractArray{Tuple}) = ((cols,o) = zip(col_ord...); $sb(df, [cols...], [o...]))
end
end

##############################################################################
##
## Iteration: EachRow, EachCol
Expand Down