Skip to content

Commit

Permalink
closes #760 and #771. 'frank' is implemented.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Jan 7, 2015
1 parent 4fc3edb commit 368b356
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 23 deletions.
5 changes: 3 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ export(chmatch, "%chin%", chorder, chgroup)
export(rbindlist)
export(fread)
export(foverlaps)
export(shift)
export(frank)
export(frankv)
export(address)
export(.SD,.N,.I,.GRP,.BY,.EACHI)

Expand Down Expand Up @@ -74,8 +77,6 @@ S3method(head, data.table)
import(stats)
S3method(na.omit, data.table)

export(shift)

# IDateTime support:
export(as.IDate,as.ITime,IDateTime)
export(hour,yday,wday,mday,week,month,quarter,year)
Expand Down
61 changes: 49 additions & 12 deletions R/setkey.R
Original file line number Diff line number Diff line change
Expand Up @@ -330,53 +330,90 @@ CJ <- function(..., sorted = TRUE)
l
}

frankv <- function(x, by=seq_along(x), ties.method=c("average", "first", "random", "max", "min", "dense"), na.last=TRUE) {
frankv <- function(x, cols=seq_along(x), na.last=TRUE, ties.method=c("average", "first", "random", "max", "min", "dense")) {
ties.method = match.arg(ties.method)
na.last = as.logical(na.last)
if (!length(na.last)) stop('length(na.last) = 0')
if (length(na.last) != 1L) {
warning("length(na.last) > 1, only the first element will be used")
na.last = na.last[1L]
}
keep = (na.last == "keep")
na.last = as.logical(na.last)
as_list <- function(x) {
xx = vector("list", 1L)
.Call(Csetlistelt, xx, 1L, x)
xx
}
if (is.atomic(x)) {
if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense")
by = 1L
if (!missing(cols) && !is.null(cols))
stop("x is a single vector, non-NULL 'cols' doesn't make sense")
cols = 1L
x = as_list(x)
} else {
if (is.character(by)) by = chmatch(by, names(x))
by = as.integer(by)
if (!length(cols))
stop("x is a list, 'cols' can not be 0-length")
if (is.character(cols))
cols = chmatch(cols, names(x))
cols = as.integer(cols)
}
x = .Call(Cshallowwrapper, x, seq_along(x)) # shallow copy even if list..
setDT(x)
if (is.na(na.last))
x = na.omit(x, by) # TODO: take care of na.last internally without having to subset data.table
if (is.na(na.last)) {
set(x, j = "..na_prefix..", value = is_na(x, cols))
cols = c(ncol(x), cols)
nas = x[[ncol(x)]]
}
if (ties.method == "random") {
set(x, j="..stats_runif..", value = stats::runif(nrow(x)))
by = c(by, ncol(x))
set(x, i = if (is.na(na.last)) which_(nas, FALSE) else NULL,
j = "..stats_runif..",
value = stats::runif(nrow(x)))
cols = c(cols, ncol(x))
}
xorder = forderv(x, by=by, sort=TRUE, retGrp=TRUE, na.last=na.last)
xorder = forderv(x, by=cols, sort=TRUE, retGrp=TRUE,
na.last=if (identical(na.last, FALSE)) na.last else TRUE)
xstart = attr(xorder, 'starts')
xsorted = FALSE
if (!length(xorder)) {
xsorted = TRUE
xorder = seq_along(x[[1L]])
}
ans = switch(ties.method,
average = , min = , max =, dense =, runlength = {
average = , min = , max =, dense = {
rank = .Call(Cfrank, xorder, xstart, uniqlengths(xstart, length(xorder)), ties.method)
},
first = , random = {
if (xsorted) xorder else forderv(xorder)
}
)
# take care of na.last="keep"
V1 = NULL # for R CMD CHECK warning
if (isTRUE(keep)) {
ans = (setDT(as_list(ans))[which_(nas, TRUE), V1 := NA])[[1L]]
} else if (is.na(na.last)) {
ans = ans[which_(nas, FALSE)]
}
ans
}

frank <- function(x, ..., na.last=TRUE, ties.method=c("average", "first", "random", "max", "min", "dense")) {
cols = substitute(list(...))[-1]
if (length(cols)) {
cols=as.list(cols)
for (i in seq_along(cols)) {
v=as.list(cols[[i]])
if (length(v) > 1 && v[[1L]] == "+") v=v[[-1L]]
else if (length(v) > 1 && v[[1L]] == "-") {
v=v[[-1L]]
}
cols[[i]]=as.character(v)
}
cols=unlist(cols, use.names=FALSE)
} else {
cols=colnames(x)
}
frankv(x, cols=cols, na.last=na.last, ties.method=ties.method)
}

#########################################################################################
# Deprecated ...
#########################################################################################
Expand Down
2 changes: 1 addition & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -5004,7 +5004,7 @@ if ("package:bit64" %in% search()) dt[, DD := as.integer64(DD)]
test_no = 1369.0
for (i in seq_along(dt)) {
col = dt[[i]]
for (j in c(TRUE, FALSE)) {
for (j in list(TRUE, FALSE, "keep")) {
for (k in c("average", "min", "max", "first")) {
if (k == "random") set.seed(45L)
if (class(col) == "integer64") r1 = rank(as.integer(col), ties.method=k, na.last=j)
Expand Down
65 changes: 65 additions & 0 deletions man/frank.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
\name{frank}
\alias{frank}
\alias{frankv}
\alias{rank}
\title{Fast rank}
\description{
Similar to \code{base::rank} but \emph{much faster}. It accepts vectors, lists, data.frames or data.tables as input. In addition to the \code{ties.method} possibilities provided by \code{base::rank}, it also provides \code{ties.method="dense"}.

\code{bit64::integer64} type is also supported.
}

\usage{
frank(x, ..., na.last=TRUE, ties.method=c("average",
"first", "random", "max", "min", "dense"))

frankv(x, cols=seq_along(x), na.last=TRUE,
ties.method=c("average", "first", "random",
"max", "min", "dense"))

}
\arguments{
\item{x}{ A vector, or list with all it's elements identical in length or data.frame or data.table. }
\item{...}{ Only for lists, data.frames and data.tables. The columns to calculate ranks based on. Do not quote column names. If ... is missing, all columns are considered by default. }
\item{cols}{ A character vector of column names (or numbers) of x, to which obtain ranks for. }
\item{na.last}{ Control treatment of \code{NA}s. If \code{TRUE}, missing values in the data are put last; if \code{FALSE}, they are put first; if \code{NA}, they are removed; if \code{"keep"} they are kept with rank \code{NA}. }
\item{ties.method}{ A character string specifying how ties are treated, see \code{Details}. }
}
\details{
To be consistent with other \code{data.table} operations, \code{NA}s are considered identical to other \code{NA}s (and \code{NaN}s to other \code{NaN}s), unlike \code{base::rank}. Therefore, for \code{na.last=TRUE} and \code{na.last=FALSE}, \code{NA}s (and \code{NaN}s) are given identical ranks, unlike \code{\link[base]{rank}}.
\code{frank} is not limited to vectors. It accepts data.tables (and lists and data.frames) as well.
In addition to the \code{ties.method} values possible using base's \code{\link[base]{rank}}, it also provides another additional argument \emph{"dense"} which returns the ranks without any gaps in the ranking. See examples.
}
\value{
A numeric vector of length equal to \code{NROW(x)} (unless \code{na.last = NA}, when missing values are removed). The vector is of integer type unless \code{ties.method = "average"} when it is of double type (irrespective of ties).
}

\examples{
# on vectors
x = c(4, 1, 4, NA, 1, NA, 4)
# NAs are considered identical (unlike base R)
# default is average
frankv(x) # na.last=TRUE
frankv(x, na.last=FALSE)

# ties.method = min
frankv(x, ties.method="min")
# ties.method = dense
frankv(x, ties.method="dense")

# on data.table
DT = data.table(x, y=c(1, 1, 1, 0, NA, 0, 2))
frankv(DT, cols="x") # same as frankv(x) from before
frankv(DT, cols="x", na.last="keep")
frankv(DT, cols="x", ties.method="dense", na.last=NA)
frank(DT, x, ties.method="dense", na.last=NA) # equivalent of above using frank
# on both columns
frankv(DT, ties.method="first", na.last="keep")
frank(DT, ties.method="first", na.last="keep") # equivalent of above using frank
}
\seealso{
\code{\link{data.table}}, \code{\link{setkey}}, \code{\link{setorder}}
}
\keyword{ data }
16 changes: 8 additions & 8 deletions src/frank.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@ SEXP dt_na(SEXP x, SEXP cols) {
SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) {
int i=0, j=0, k=0, n;
int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg);
enum {MEAN, MAX, MIN, DENSE, RUNLENGTH} ties = MEAN;
enum {MEAN, MAX, MIN, DENSE} ties = MEAN; // RUNLENGTH
SEXP ans;

if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "average")) ties = MEAN;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "max")) ties = MAX;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "min")) ties = MIN;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "dense")) ties = DENSE;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "runlength")) ties = RUNLENGTH;
// else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "runlength")) ties = RUNLENGTH;
else error("Internal error: invalid ties.method for frankv(), should have been caught before. Please report to datatable-help");
n = length(xorderArg);
ans = (ties == MEAN) ? PROTECT(allocVector(REALSXP, n)) : PROTECT(allocVector(INTSXP, n));
Expand Down Expand Up @@ -110,12 +110,12 @@ SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) {
k++;
}
break;
case RUNLENGTH :
for (i = 0; i < length(xstartArg); i++) {
k=1;
for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++)
INTEGER(ans)[xorder[j]-1] = k++;
}
// case RUNLENGTH :
// for (i = 0; i < length(xstartArg); i++) {
// k=1;
// for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++)
// INTEGER(ans)[xorder[j]-1] = k++;
// }
break;
}
}
Expand Down

0 comments on commit 368b356

Please sign in to comment.