Handle duplicate index values in C for window_idx

This basically moves the R code into C, with minor changes. The C code pre-(and over-)allocates the result and loops over the locations in reverse order. Corwin suggested better long-term solutions (e.g. fix merge.xts() to handle duplicate index values correctly), but this this commit addresses the immediate performance regression. Also add more benchmarks. See #251.
joshuaulrich · Jul 30, 2018 · a017f41 · a017f41
1 parent 1d707c5
commit a017f41
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 18 deletions.
diff --git a/R/xts.methods.R b/R/xts.methods.R
@@ -283,19 +283,7 @@ window_idx <- function(x, index. = NULL, start = NULL, end = NULL)
  # We get back upper bound of index as per findInterval
  tmp <- base_idx[firstlast]
 
- # Iterate in reverse to grab all matches
- # We have to do this to handle duplicate dates in the xts index.
- tmp <- rev(tmp)
- res <- NULL
- for(i in tmp) {
- dt <- idx[i]
- j <- i
- repeat {
- res <- c(res, j)
- j <- j -1
- if(j < 1 || idx[j] != dt) break
- }
- }
+ res <- .Call("fill_window_dups_rev", tmp, .index(x), PACKAGE = "xts")
  firstlast <- rev(res)
  }
 

diff --git a/inst/benchmarks/benchmark.subset.R b/inst/benchmarks/benchmark.subset.R
@@ -1,13 +1,13 @@
+stopifnot(require("xts"))
 stopifnot(require("microbenchmark"))
 # Benchmark [.xts using ISO8601 range on large objects
 N <- 2e7
-secPerYr <- 86400*365
-x <- xts::.xts(1:N, 1.0*seq(secPerYr*20, secPerYr*40, length.out = N))
-rng <- "1990/2000"
+s <- 86400*365.25
+x <- .xts(1:N, 1.0*seq(s*20, s*40, length.out = N), tzone = "UTC")
 
 # warmup, in case there's any JIT
 for (i in 1:2) {
- x[rng,]
+ x["1999/2001",]
 }
 
 profile <- FALSE
@@ -20,5 +20,32 @@ if (profile) {
  Rprof(NULL)
  print(srp <- summaryRprof())
 } else {
- microbenchmark(x[rng,], times = 10)
+ cat("Subset using ISO-8601 range\n")
+ microbenchmark(x["1990",], x["1990/",], x["/2009",],
+ x["1990/1994",], x["1990/1999",], x["1990/2009",], times = 5)
 }
+
+cat("Subset using integer vector\n")
+i001 <- seq(1, N, 1)
+i005 <- seq(1, N, 5)
+i010 <- seq(1, N, 10)
+i050 <- seq(1, N, 50)
+i100 <- seq(1, N, 100)
+microbenchmark(x[i001,], x[i005,], x[i010,], x[i050,], x[i100,], times = 5)
+
+cat("Subset using logical vector\n")
+l001 <- l005 <- l010 <- l050 <- l100 <- logical(N)
+l001[i001] <- TRUE
+l005[i005] <- TRUE
+l010[i010] <- TRUE
+l050[i050] <- TRUE
+l100[i100] <- TRUE
+microbenchmark(x[l001,], x[l005,], x[l010,], x[l050,], x[l100,], times = 5)
+
+cat("Subset using date-time vector\n")
+t001 <- index(x)[i001]
+t005 <- index(x)[i005]
+t010 <- index(x)[i010]
+t050 <- index(x)[i050]
+t100 <- index(x)[i100]
+microbenchmark(x[t001,], x[t005,], x[t010,], x[t050,], x[t100,], times = 5)
diff --git a/src/binsearch.c b/src/binsearch.c
@@ -151,3 +151,33 @@ SEXP binsearch(SEXP key, SEXP vec, SEXP start)
 
  return ScalarInteger(lo);
 }
+
+SEXP fill_window_dups_rev(SEXP _x, SEXP _index)
+{
+ /* Translate user index (_x) to xts index (_index). '_x' contains the
+ * upper bound of the location of the user index in the xts index.
+ * This is necessary to handle duplicate dates in the xts index.
+ */
+ int n_x = length(_x);
+ double *x = REAL(_x);
+ double *index = REAL(_index);
+
+ SEXP _out = PROTECT(allocVector(INTSXP, length(_index)));
+ int *out = INTEGER(_out);
+
+ /* Loop over locations in _x in reverse order */
+ int i, xi, j, k = 0;
+ for (i = n_x; i > 0; i--) {
+ xi = x[i-1];
+ j = xi;
+ do {
+ out[k++] = j--;
+ } while (j > 0 && index[xi-1] == index[j-1]);
+ }
+
+ /* truncate so length(_out) = k
+ * NB: output is in reverse order!
+ */
+ UNPROTECT(1);
+ return lengthgets(_out, k);
+}