Skip to content

Commit

Permalink
Handle duplicate index values in C for window_idx
Browse files Browse the repository at this point in the history
This basically moves the R code into C, with minor changes. The C code
pre-(and over-)allocates the result and loops over the locations in
reverse order. Corwin suggested better long-term solutions (e.g. fix
merge.xts() to handle duplicate index values correctly), but this
this commit addresses the immediate performance regression.

Also add more benchmarks.

See #251.
  • Loading branch information
joshuaulrich committed Jul 30, 2018
1 parent 1d707c5 commit a017f41
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 18 deletions.
14 changes: 1 addition & 13 deletions R/xts.methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,19 +283,7 @@ window_idx <- function(x, index. = NULL, start = NULL, end = NULL)
# We get back upper bound of index as per findInterval
tmp <- base_idx[firstlast]

# Iterate in reverse to grab all matches
# We have to do this to handle duplicate dates in the xts index.
tmp <- rev(tmp)
res <- NULL
for(i in tmp) {
dt <- idx[i]
j <- i
repeat {
res <- c(res, j)
j <- j -1
if(j < 1 || idx[j] != dt) break
}
}
res <- .Call("fill_window_dups_rev", tmp, .index(x), PACKAGE = "xts")
firstlast <- rev(res)
}

Expand Down
37 changes: 32 additions & 5 deletions inst/benchmarks/benchmark.subset.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
stopifnot(require("xts"))
stopifnot(require("microbenchmark"))
# Benchmark [.xts using ISO8601 range on large objects
N <- 2e7
secPerYr <- 86400*365
x <- xts::.xts(1:N, 1.0*seq(secPerYr*20, secPerYr*40, length.out = N))
rng <- "1990/2000"
s <- 86400*365.25
x <- .xts(1:N, 1.0*seq(s*20, s*40, length.out = N), tzone = "UTC")

# warmup, in case there's any JIT
for (i in 1:2) {
x[rng,]
x["1999/2001",]
}

profile <- FALSE
Expand All @@ -20,5 +20,32 @@ if (profile) {
Rprof(NULL)
print(srp <- summaryRprof())
} else {
microbenchmark(x[rng,], times = 10)
cat("Subset using ISO-8601 range\n")
microbenchmark(x["1990",], x["1990/",], x["/2009",],
x["1990/1994",], x["1990/1999",], x["1990/2009",], times = 5)
}

cat("Subset using integer vector\n")
i001 <- seq(1, N, 1)
i005 <- seq(1, N, 5)
i010 <- seq(1, N, 10)
i050 <- seq(1, N, 50)
i100 <- seq(1, N, 100)
microbenchmark(x[i001,], x[i005,], x[i010,], x[i050,], x[i100,], times = 5)

cat("Subset using logical vector\n")
l001 <- l005 <- l010 <- l050 <- l100 <- logical(N)
l001[i001] <- TRUE
l005[i005] <- TRUE
l010[i010] <- TRUE
l050[i050] <- TRUE
l100[i100] <- TRUE
microbenchmark(x[l001,], x[l005,], x[l010,], x[l050,], x[l100,], times = 5)

cat("Subset using date-time vector\n")
t001 <- index(x)[i001]
t005 <- index(x)[i005]
t010 <- index(x)[i010]
t050 <- index(x)[i050]
t100 <- index(x)[i100]
microbenchmark(x[t001,], x[t005,], x[t010,], x[t050,], x[t100,], times = 5)
30 changes: 30 additions & 0 deletions src/binsearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,33 @@ SEXP binsearch(SEXP key, SEXP vec, SEXP start)

return ScalarInteger(lo);
}

SEXP fill_window_dups_rev(SEXP _x, SEXP _index)
{
/* Translate user index (_x) to xts index (_index). '_x' contains the
* upper bound of the location of the user index in the xts index.
* This is necessary to handle duplicate dates in the xts index.
*/
int n_x = length(_x);
double *x = REAL(_x);
double *index = REAL(_index);

SEXP _out = PROTECT(allocVector(INTSXP, length(_index)));
int *out = INTEGER(_out);

/* Loop over locations in _x in reverse order */
int i, xi, j, k = 0;
for (i = n_x; i > 0; i--) {
xi = x[i-1];
j = xi;
do {
out[k++] = j--;
} while (j > 0 && index[xi-1] == index[j-1]);
}

/* truncate so length(_out) = k
* NB: output is in reverse order!
*/
UNPROTECT(1);
return lengthgets(_out, k);
}

0 comments on commit a017f41

Please sign in to comment.