Reworked between.c (#3177)

Rdatatable · Dec 1, 2018 · e6913a4 · jangorecki · Apr 22, 2019 · e6913a4
1 parent a65dfd4
commit e6913a4
Show file tree

Hide file tree

Showing 5 changed files with 116 additions and 116 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,8 @@
 
 3. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR.
 
+4. `NA` in `between`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than than `NA`. This is now documented.
+
 #### BUG FIXES
 
 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting.

diff --git a/R/between.R b/R/between.R
@@ -1,11 +1,16 @@
 # is x[i] in between lower[i] and upper[i] ?
 between <- function(x,lower,upper,incbounds=TRUE) {
-  is_strictly_numeric <- function(x) is.numeric(x) && !"integer64" %chin% class(x)
-  if (is_strictly_numeric(x) && is_strictly_numeric(lower) &&
-    is_strictly_numeric(upper) && length(lower) == 1L && length(upper) == 1L) {
-    # faster parallelised version for int/double for most common scenario
+  if (is.logical(x)) stop("between has been x of type logical")
+  if (is.logical(lower)) lower = as.integer(lower)   # typically NA (which is logical type)
+  if (is.logical(upper)) upper = as.integer(upper)   # typically NA (which is logical type)
+  is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64")
+  if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) {
+    # faster parallelised version for int/double.
+    # Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x).
+    # length(upper) can be 1 or length(x) independently of lower
     .Call(Cbetween, x, lower, upper, incbounds)
   } else {
+    # now just for character input. TODO: support character between in Cbetween and remove this branch
     if(incbounds) x>=lower & x<=upper
     else x>lower & x<upper
   }

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9542,7 +9542,6 @@ test(1693.7, month(t), 8L)
 test(1693.8, quarter(t), 3L)
 test(1693.9, year(t), 2016L)
 
-
 # fix for #1740 - sub-assigning NAs for factors
 dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9)
 ans = data.table(x = 1:5, y = factor(c(NA,"a","b","a", NA)), z = 5:9)
@@ -9551,24 +9550,34 @@ test(1694.0, dt[y=="", y := NA], ans)
 # more tests for between()
 x = c(NaN, NA, 1, 5, -Inf, Inf)
 test(1695.1, x %between% c(3, 7), c(NA, NA, FALSE, TRUE, FALSE, FALSE))
-test(1695.2, x %between% c(NA, 7), c(NA, NA, NA, NA, NA, FALSE))
-test(1695.3, x %between% c(3, NA), c(NA, NA, FALSE, NA, FALSE, NA))
-test(1695.4, x %between% c(NA, NA), rep(NA, 6L))
-
+test(1695.2, x %between% c(NA, 7), c(NA, NA, TRUE, TRUE, TRUE, FALSE))
+test(1695.3, x %between% c(3, NA), c(NA, NA, FALSE, TRUE, FALSE, TRUE))
+test(1695.4, x %between% c(NA, NA), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
+test(1695.5, x %between% c(NA_real_, NA_real_), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
+test(1695.6, x %between% list(c(1,2,3,4,5,6), 10),  c(NA, NA, FALSE, TRUE, FALSE, FALSE))
 x = c(NA, 1L, 5L)
-test(1695.5, x %between% c(3, 7), c(NA, FALSE, TRUE))
-test(1695.6, x %between% c(NA, 7), c(NA, NA, NA))
-test(1695.7, x %between% c(3, NA), c(NA, FALSE, NA))
-test(1695.8, x %between% c(NA, NA), rep(NA, 3L))
-
+test(1695.7, x %between% c(3, 7), c(NA, FALSE, TRUE))
+test(1695.8, x %between% c(NA, 7), c(NA, TRUE, TRUE))
+test(1695.9, x %between% c(3, NA), c(NA, FALSE, TRUE))
+test(1695.11, x %between% c(NA, NA), c(NA, TRUE, TRUE))
 x = rep(NA_integer_, 3)
-test(1695.9, x %between% c(3, 7), rep(NA, 3L))
-test(1695.10, x %between% c(NA, 7), rep(NA, 3L))
-test(1695.11, x %between% c(3, NA), rep(NA, 3L))
-test(1695.12, x %between% c(NA, NA), rep(NA, 3L))
-
+test(1695.12, x %between% c(3, 7), rep(NA, 3L))
+test(1695.13, x %between% c(NA, 7), rep(NA, 3L))
+test(1695.14, x %between% c(3, NA), rep(NA, 3L))
+test(1695.15, x %between% c(NA, NA), rep(NA, 3L))
 x = integer(0)
-test(1695.13, x %between% c(3, 7), logical(0))
+test(1695.16, x %between% c(3, 7), logical(0))
+test(1695.17, TRUE %between% c(3, 7), error="between has been x of type logical")
+x = c("foo","bar","paz")
+test(1695.18, between(x, "bag", "fog"), c(FALSE, TRUE, FALSE))
+test(1695.19, between(x, c("b","f","a"), "q"), c(TRUE, FALSE, TRUE))
+test(1695.20, between(x, c("foo","baq","bar"), "paz", incbounds=TRUE), c(TRUE, TRUE, TRUE))
+test(1695.21, between(x, c("foo","baq","bar"), "paz", incbounds=FALSE), c(FALSE, TRUE, FALSE))
+x = c(3.14, 3.20, -42, Inf)
+test(1695.22, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf)), c(TRUE,FALSE,TRUE,TRUE))
+test(1695.23, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf), incbounds=FALSE), c(FALSE,FALSE,TRUE,FALSE))
+test(1695.24, between(x, c(3,4,-60), 99), error="length(lower) (3) must be either 1 or length(x) (4)")
+test(1695.25, between(x, c(3,4,-60,5), c(99,98)), error="length(upper) (2) must be either 1 or length(x) (4)")
 
 # test for #1819, verbose message for bmerge
 old_opt = getOption("datatable.verbose")

diff --git a/man/between.Rd b/man/between.Rd
@@ -8,7 +8,8 @@
 Intended for use in \code{i} in \code{[.data.table}.
 
 \code{between} is equivalent to \code{x >= lower & x <= upper} when
-\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}.
+\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}. With a caveat that
+\code{NA} in \code{lower} or \code{upper} are taken as a missing bound and return \code{TRUE} not \code{NA}.
 
 \code{inrange} checks whether each value in \code{x} is in between any of
 the intervals provided in \code{lower,upper}.
@@ -24,28 +25,24 @@ x \%inrange\% y
 \item{x}{ Any orderable vector, i.e., those with relevant methods for
 \code{`<=`}, such as \code{numeric}, \code{character}, \code{Date}, etc. in
 case of \code{between} and a numeric vector in case of \code{inrange}.}
-\item{lower}{ Lower range bound.}
-\item{upper}{ Upper range bound.}
+\item{lower}{ Lower range bound. Either length 1 or same length as \code{x}.}
+\item{upper}{ Upper range bound. Either length 1 or same length as \code{x}.}
 \item{y}{ A length-2 \code{vector} or \code{list}, with \code{y[[1]]}
 interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.}
 \item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper].
 \code{FALSE} means exclusive bounds, i.e., (lower,upper).
-
 It is set to \code{TRUE} by default for infix notations.}
 }
 \details{
 
-From \code{v1.9.8+}, \code{between} is vectorised. \code{lower} and
-\code{upper} are recycled to \code{length(x)} if necessary.
-
-\emph{non-equi} joins were recently implemented in \code{v1.9.8}. It extends
+\emph{non-equi} joins were implemented in \code{v1.9.8}. They extend
 binary search based joins in \code{data.table} to other binary operators
 including \code{>=, <=, >, <}. \code{inrange} makes use of this new
 functionality and performs a range join.
 
 }
 \value{
-Logical vector as the same length as \code{x} with value \code{TRUE} for those
+Logical vector the same length as \code{x} with value \code{TRUE} for those
 that lie within the specified range.
 }
 \note{ Current implementation does not make use of ordered keys for

diff --git a/src/between.c b/src/between.c
@@ -1,111 +1,98 @@
 #include "data.table.h"
-#include <Rdefines.h>
-
-static double l=0.0, u=0.0;
-
-static Rboolean int_upper_closed(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] <= u ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean int_upper_open(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] < u ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean int_lower_closed(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] >= l ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean int_lower_open(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] > l ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean int_both_closed(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER ? NA_LOGICAL : ((double)INTEGER(x)[i] >= l && (double)INTEGER(x)[i] <= u));
-}
-
-static Rboolean int_both_open(SEXP x, R_len_t i) {
-  return (INTEGER(x)[i] == NA_INTEGER ? NA_LOGICAL : ((double)INTEGER(x)[i] > l && (double)INTEGER(x)[i] < u));
-}
-
-static Rboolean double_upper_closed(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) || REAL(x)[i] <= u ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean double_upper_open(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) || REAL(x)[i] < u ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean double_lower_closed(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) || REAL(x)[i] >= l ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean double_lower_open(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) || REAL(x)[i] > l ? NA_LOGICAL : FALSE);
-}
-
-static Rboolean double_both_closed(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) ? NA_LOGICAL : (REAL(x)[i] >= l && REAL(x)[i] <= u));
-}
-
-static Rboolean double_both_open(SEXP x, R_len_t i) {
-  return (ISNAN(REAL(x)[i]) ? NA_LOGICAL : (REAL(x)[i] > l && REAL(x)[i] < u));
-}
 
 SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) {
 
-  R_len_t i, nx = length(x), nl = length(lower), nu = length(upper);
-  l = 0.0; u = 0.0;
-  SEXP ans;
-  Rboolean (*flower)(), (*fupper)(), (*fboth)();
+  R_len_t nx = length(x), nl = length(lower), nu = length(upper);
   if (!nx || !nl || !nu)
     return (allocVector(LGLSXP, 0));
   if (nl != 1 && nl != nx)
     error("length(lower) (%d) must be either 1 or length(x) (%d)", nl, nx);
   if (nu != 1 && nu != nx)
     error("length(upper) (%d) must be either 1 or length(x) (%d)", nu, nx);
   if (!isLogical(bounds) || LOGICAL(bounds)[0] == NA_LOGICAL)
-    error("incbounds must be logical TRUE/FALSE.");
+    error("incbounds must be logical TRUE/FALSE.");  // # nocov
 
   int nprotect = 0;
-  if (ALTREP(x))      { x     =  PROTECT(duplicate(x));      nprotect++; }
-  if (ALTREP(lower))  { lower =  PROTECT(duplicate(lower));  nprotect++; }
-  if (ALTREP(upper))  { upper =  PROTECT(duplicate(upper));  nprotect++; }
-  if (ALTREP(bounds)) { bounds = PROTECT(duplicate(bounds)); nprotect++; }
-
-  // no support for int64 yet (only handling most common cases)
-  // coerce to also get NA values properly
-  lower = PROTECT(coerceVector(lower, REALSXP)); l = REAL(lower)[0];
-  upper = PROTECT(coerceVector(upper, REALSXP)); u = REAL(upper)[0];
-  ans   = PROTECT(allocVector(LGLSXP, nx));
-  nprotect += 3;
-
-  if (LOGICAL(bounds)[0]) {
-    fupper = isInteger(x) ? &int_upper_closed : &double_upper_closed;
-    flower = isInteger(x) ? &int_lower_closed : &double_lower_closed;
-    fboth  = isInteger(x) ? &int_both_closed  : &double_both_closed;
-  } else {
-    fupper = isInteger(x) ? &int_upper_open : &double_upper_open;
-    flower = isInteger(x) ? &int_lower_open : &double_lower_open;
-    fboth  = isInteger(x) ? &int_both_open  : &double_both_open;
+  bool integer=true;
+  if (isReal(x) || isReal(lower) || isReal(upper)) {
+    if (inherits(x,"integer64") || inherits(lower,"integer64") || inherits(upper,"integer64")) {
+      error("Internal error: one or more of x, lower and upper is type integer64 but this should have been caught by between() at R level.");  // # nocov
+    }
+    integer=false;
+    lower = PROTECT(coerceVector(lower, REALSXP));  // these coerces will convert NA appropriately
+    upper = PROTECT(coerceVector(upper, REALSXP));
+    x     = PROTECT(coerceVector(x, REALSXP));
+    nprotect += 3;
   }
-
-  if ( ISNAN(REAL(lower)[0]) ) {
-    if ( ISNAN(REAL(upper)[0]) ) {
+  // TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX
+
+  const bool recycleLow = LENGTH(lower)==1;
+  const bool recycleUpp = LENGTH(upper)==1;
+  const bool open = !LOGICAL(bounds)[0];
+  SEXP ans = PROTECT(allocVector(LGLSXP, nx)); nprotect++;
+  int *restrict ansp = LOGICAL(ans);
+  if (integer) {
+    const int *lp = INTEGER(lower);
+    const int *up = INTEGER(upper);
+    const int *xp = INTEGER(x);
+    if (recycleLow && recycleUpp) {
+      const int l = lp[0] + open;  // +open so we can always use >= and <=.  NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too)
+      const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open;
       #pragma omp parallel for num_threads(getDTthreads())
-      for (i=0; i<nx; i++) LOGICAL(ans)[i] = NA_LOGICAL;
-    } else {
+      for (int i=0; i<nx; i++) {
+        int elem = xp[i];
+        ansp[i] = elem==NA_INTEGER ? NA_LOGICAL : (l<=elem && elem<=u);
+      }
+    }
+    else {
+      const int lowMask = recycleLow ? 0 : INT_MAX;
+      const int uppMask = recycleUpp ? 0 : INT_MAX;
       #pragma omp parallel for num_threads(getDTthreads())
-      for (i=0; i<nx; i++) LOGICAL(ans)[i] = fupper(x, i);
+      for (int i=0; i<nx; i++) {
+        int elem = xp[i];
+        int l = lp[i&lowMask] +open;
+        int u = up[i&uppMask];
+        u = (u==NA_INTEGER) ? INT_MAX : u-open;
+        ansp[i] = elem==NA_INTEGER ? NA_LOGICAL : (l<=elem && elem<=u);
+      }
     }
   } else {
-    if ( ISNAN(REAL(upper)[0]) ) {
-      #pragma omp parallel for num_threads(getDTthreads())
-      for (i=0; i<nx; i++) LOGICAL(ans)[i] = flower(x, i);
-    } else {
+    // type real
+    const double *lp = REAL(lower);
+    const double *up = REAL(upper);
+    const double *xp = REAL(x);
+    if (recycleLow && recycleUpp) {
+      const double l = isnan(lp[0]) ? -INFINITY : lp[0];
+      const double u = isnan(up[0]) ?  INFINITY : up[0];
+      if (open) {
+        #pragma omp parallel for num_threads(getDTthreads())
+        for (int i=0; i<nx; i++) {
+          double elem = xp[i];
+          ansp[i] = isnan(elem) ? NA_LOGICAL : (l<elem && elem<u);
+        }
+      } else {
+        #pragma omp parallel for num_threads(getDTthreads())
+        for (int i=0; i<nx; i++) {
+          double elem = xp[i];
+          ansp[i] = isnan(elem) ? NA_LOGICAL : (l<=elem && elem<=u);
+        }
+      }
+    }
+    else {
+      const int lowMask = recycleLow ? 0 : INT_MAX;
+      const int uppMask = recycleUpp ? 0 : INT_MAX;
       #pragma omp parallel for num_threads(getDTthreads())
-      for (i=0; i<nx; i++) LOGICAL(ans)[i] = fboth(x, i);
+      for (int i=0; i<nx; i++) {
+        double elem = xp[i];
+        double l = lp[i&lowMask];
+        double u = up[i&uppMask];
+        if (isnan(l)) l=-INFINITY;
+        if (isnan(u)) u= INFINITY;
+        ansp[i] = isnan(elem) ? NA_LOGICAL : (open ? l<elem && elem<u : l<=elem && elem<=u);
+      }
     }
   }
   UNPROTECT(nprotect);
   return(ans);
 }
+