Skip to content

Commit

Permalink
Implements #760. Fast rank for vectors & lists.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Aug 18, 2014
1 parent 6f0dc75 commit 1ff5e1e
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 1 deletion.
73 changes: 72 additions & 1 deletion R/setkey.R
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,78 @@ CJ = function(..., sorted = TRUE)
l
}


frankv = function(x, na.last=TRUE, order=1L, ties.method=c("average", "first", "random", "max", "min")) {
ties.method = match.arg(ties.method)
na.last = as.logical(na.last)
if (!length(na.last)) stop('length(na.last) = 0')
if (length(na.last) != 1L) {
warning("length(na.last) > 1, only the first element will be used")
na.last = na.last[1L]
}
as_list <- function(x) {
xx = vector("list", 1L)
.Call("Csetlistelt", xx, 1L, x)
xx
}
if (is.atomic(x)) x = as_list(x)
else {
n = vapply(x, length, 0L)
if (any(n<max(n))) stop("All elements in input list x must be of same length")
}
is_na <- function(x) {
xseq = seq_along(x[[1L]])
.Call("Cdt_na", x, xseq, xseq)
}
shallow_list <- function(x) {
lx = length(x); sx = seq_len(lx)
xx = vector("list", lx)
point(xx, sx, x, sx)
}
remove_na <- function(x) {
na = is_na(x)
xx = shallow_list(x)
setDT(xx)
.Call("CsubsetDT", xx, which(!na), seq_along(xx))
}
ties_random <- function(x) {
lx = length(x); sx = seq_len(lx)
xx = vector("list", lx+1L)
point(xx, sx, x, sx)
.Call("Csetlistelt", xx, lx+1L, stats::runif(length(x[[1L]])))
xx
}
if (ties.method == "random") {
# seems inefficient to subset, but have to do to get same results as base
# is okay because it's just for the case where na.last=NA *and* ties="random"
if (is.na(na.last)) x = remove_na(x)
x = ties_random(x)
}
xorder = forderv(x, sort=TRUE, retGrp=TRUE, order=order, na.last=na.last)
xstart = attr(xorder, 'starts')
xsorted = FALSE
if (!length(xorder)) {
xsorted = TRUE
xorder = seq_along(x[[1L]])
}
ans = switch(ties.method,
average = , min = , max = {
rank = .Call("Cfrank", xorder, xstart, uniqlengths(xstart, length(xorder)), ties.method)
if (is.na(na.last) && xorder[1L] == 0L) {
idx = which(rank != 0L)
rank = .Call("CsubsetVector", rank, idx) # xorder[idx], but faster
}
rank
},
first = , random = {
if (is.na(na.last) && xorder[1L] == 0L) {
idx = which(xorder != 0L)
xorder = .Call("CsubsetVector", xorder, idx)
}
if (xsorted) xorder else forderv(xorder)
}
)
ans
}

#########################################################################################
# Deprecated ...
Expand Down
44 changes: 44 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ if (!.devtesting) {
.R.subassignCopiesOthers = data.table:::.R.subassignCopiesOthers
.R.subassignCopiesVecsxp = data.table:::.R.subassignCopiesVecsxp
setdiff_ = data.table:::setdiff_
frankv = data.table:::frankv
}


Expand Down Expand Up @@ -5046,6 +5047,49 @@ test(1368.1, forderv(x), c(2L, 1L, 3L))
x = c(2147483647L, NA_integer_, -2L)
test(1368.2, forderv(x), c(2L, 3L, 1L))

# tests for frankv. testing on vectors alone so that we can compare with base::rank
# One difference is that NAs belong to the same group, unlike base::rank. So are NaNs.
# So, they can't be compared to base::rank, won't be identical except for ties="first", and (ties="random", na.last=NA) - should document this.

# no seed set on purpose
dt = data.table(AA=sample(c(-2:2), 50, TRUE),
BB=sample(c(-2,-1,0,1,2,Inf,-Inf), 50, TRUE),
CC=sample(c(letters[1:5]), 50, TRUE),
DD=sample(c(-2:2), 50, TRUE),
EE=sample(as.logical(c(-2:2)), 50, TRUE))
if ("package:bit64" %in% search()) dt[, DD := as.integer64(DD)]

test_no = 1369.0
for (i in seq_along(dt)) {
col = dt[[i]]
for (j in c(TRUE, FALSE, NA)) {
for (k in c("average", "min", "max", "first")) {
if (k == "random") set.seed(45L)
if (class(col) == "integer64") r1 = rank(as.integer(col), ties.method=k, na.last=j)
else r1 = rank(col, ties.method=k, na.last=j)
if (k == "random") set.seed(45L)
r2 = frankv(col, ties.method=k, na.last=j)

test_no = signif(test_no+.01, 7)
test(test_no, r1, r2)

# decreasing order
if (k == "random") set.seed(45L)
if (class(col) == "integer64") r1 = rank(-as.integer(col), ties.method=k, na.last=j)
else if (class(col) == "character") r1 = rank(-xtfrm(col), ties.method=k, na.last=j)
else r1 = rank(-col, ties.method=k, na.last=j)

if (k == "random") set.seed(45L)
r2 = frankv(col, order=-1L, ties.method=k, na.last=j)

test_no = signif(test_no+.01, 7)
test(test_no, r1, r2)

}
}
}
# maybe add tests on smaller examples with NAs, even though can't compare to base::rank.

##########################


Expand Down
102 changes: 102 additions & 0 deletions src/frank.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#include "data.table.h"
#include <Rdefines.h>
// #include <signal.h> // the debugging machinery + breakpoint aidee
// raise(SIGINT);

extern SEXP char_integer64;

static union {
double d;
unsigned long long ull;
} u;

SEXP dt_na(SEXP x, SEXP xorderArg, SEXP xstartArg) {
int i, j, k, n = length(xstartArg), *xstart = INTEGER(xstartArg), *xorder = INTEGER(xorderArg);
SEXP v, ans, class;
double *dv;

if (!isNewList(x)) error("Internal error: 'x' should be a list. Please report to datatable-help");
ans = PROTECT(allocVector(LGLSXP, n));
for (i=0; i<n; i++) LOGICAL(ans)[i]=0;
for (i=0; i<length(x); i++) {
v = VECTOR_ELT(x, i);
switch (TYPEOF(v)) {
case LGLSXP:
for (j=0; j<n; j++) {
k = xorder[xstart[j]-1]-1;
LOGICAL(ans)[j] |= (LOGICAL(v)[k] == NA_LOGICAL);
}
break;
case INTSXP:
for (j=0; j<n; j++) {
k = xorder[xstart[j]-1]-1;
LOGICAL(ans)[j] |= (INTEGER(v)[k] == NA_INTEGER);
}
break;
case STRSXP:
for (j=0; j<n; j++) {
k = xorder[xstart[j]-1]-1;
LOGICAL(ans)[j] |= (STRING_ELT(v, k) == NA_STRING);
}
break;
case REALSXP:
class = getAttrib(v, R_ClassSymbol);
if (isString(class) && STRING_ELT(class, 0) == char_integer64) {
dv = (double *)REAL(v);
for (j=0; j<n; j++) {
k = xorder[xstart[j]-1]-1;
u.d = dv[k];
LOGICAL(ans)[j] |= ((u.ull ^ 0x8000000000000000) == 0);
}
} else {
for (j=0; j<n; j++) {
k = xorder[xstart[j]-1]-1;
LOGICAL(ans)[j] |= ISNAN(REAL(v)[k]);
}
}
break;
default:
error("Unknown column type '%s'", type2char(TYPEOF(v)));
}
}
UNPROTECT(1);
return(ans);
}

SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) {
int i=0, j=0, n;
int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg);
enum {MEAN, MAX, MIN} ties = MEAN;
SEXP ans;

if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "average")) ties = MEAN;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "max")) ties = MAX;
else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "min")) ties = MIN;
else error("Internal error: invalid ties.method for frankv(), should have been caught before. Please report to datatable-help");
n = length(xorderArg);
ans = (ties == MEAN) ? PROTECT(allocVector(REALSXP, n)) : PROTECT(allocVector(INTSXP, n));
if (n > 0) {
switch (ties) {
case MEAN :
for (i = 0; i < length(xstartArg); i++) {
for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++)
REAL(ans)[xorder[j]-1] = (2*xstart[i]+xlen[i]-1)/2.0;
}
break;
case MAX :
for (i = 0; i < length(xstartArg); i++) {
for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++)
INTEGER(ans)[xorder[j]-1] = xstart[i]+xlen[i]-1;
}
break;
case MIN :
for (i = 0; i < length(xstartArg); i++) {
for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++)
INTEGER(ans)[xorder[j]-1] = xstart[i];
}
break;
}
}
UNPROTECT(1);
return(ans);
}
4 changes: 4 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ SEXP chmatch2();
SEXP subsetDT();
SEXP subsetVector();
SEXP convertNegativeIdx();
SEXP frank();
SEXP dt_na();

// .Externals
SEXP fastmean();
Expand Down Expand Up @@ -100,6 +102,8 @@ R_CallMethodDef callMethods[] = {
{"CsubsetDT", (DL_FUNC) &subsetDT, -1},
{"CsubsetVector", (DL_FUNC) &subsetVector, -1},
{"CconvertNegativeIdx", (DL_FUNC) &convertNegativeIdx, -1},
{"Cfrank", (DL_FUNC) &frank, -1},
{"Cdt_na", (DL_FUNC) &dt_na, -1},
{NULL, NULL, 0}
};

Expand Down

0 comments on commit 1ff5e1e

Please sign in to comment.