Skip to content

Commit

Permalink
Reworked between.c (#3177)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle authored Dec 1, 2018
1 parent a65dfd4 commit e6913a4
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 116 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

3. `setnames()` gains `skip_absent` to skip names in `old` that aren't present, [#3030](https://github.com/Rdatatable/data.table/issues/3030). By default `FALSE` so that it is still an error, as before, to attempt to change a column name that is not present. Thanks to @MusTheDataGuy for the suggestion and the PR.

4. `NA` in `between`'s `lower` and `upper` are now taken as missing bounds and return `TRUE` rather than than `NA`. This is now documented.

#### BUG FIXES

1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting.
Expand Down
13 changes: 9 additions & 4 deletions R/between.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# is x[i] in between lower[i] and upper[i] ?
between <- function(x,lower,upper,incbounds=TRUE) {
is_strictly_numeric <- function(x) is.numeric(x) && !"integer64" %chin% class(x)
if (is_strictly_numeric(x) && is_strictly_numeric(lower) &&
is_strictly_numeric(upper) && length(lower) == 1L && length(upper) == 1L) {
# faster parallelised version for int/double for most common scenario
if (is.logical(x)) stop("between has been x of type logical")
if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type)
if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type)
is_strictly_numeric <- function(x) is.numeric(x) && !inherits(x, "integer64")
if (is_strictly_numeric(x) && is_strictly_numeric(lower) && is_strictly_numeric(upper)) {
# faster parallelised version for int/double.
# Cbetween supports length(lower)==1 (recycled) and (from v1.12.0) length(lower)==length(x).
# length(upper) can be 1 or length(x) independently of lower
.Call(Cbetween, x, lower, upper, incbounds)
} else {
# now just for character input. TODO: support character between in Cbetween and remove this branch
if(incbounds) x>=lower & x<=upper
else x>lower & x<upper
}
Expand Down
41 changes: 25 additions & 16 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9542,7 +9542,6 @@ test(1693.7, month(t), 8L)
test(1693.8, quarter(t), 3L)
test(1693.9, year(t), 2016L)


# fix for #1740 - sub-assigning NAs for factors
dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9)
ans = data.table(x = 1:5, y = factor(c(NA,"a","b","a", NA)), z = 5:9)
Expand All @@ -9551,24 +9550,34 @@ test(1694.0, dt[y=="", y := NA], ans)
# more tests for between()
x = c(NaN, NA, 1, 5, -Inf, Inf)
test(1695.1, x %between% c(3, 7), c(NA, NA, FALSE, TRUE, FALSE, FALSE))
test(1695.2, x %between% c(NA, 7), c(NA, NA, NA, NA, NA, FALSE))
test(1695.3, x %between% c(3, NA), c(NA, NA, FALSE, NA, FALSE, NA))
test(1695.4, x %between% c(NA, NA), rep(NA, 6L))

test(1695.2, x %between% c(NA, 7), c(NA, NA, TRUE, TRUE, TRUE, FALSE))
test(1695.3, x %between% c(3, NA), c(NA, NA, FALSE, TRUE, FALSE, TRUE))
test(1695.4, x %between% c(NA, NA), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
test(1695.5, x %between% c(NA_real_, NA_real_), c(NA, NA, TRUE, TRUE, TRUE,TRUE))
test(1695.6, x %between% list(c(1,2,3,4,5,6), 10), c(NA, NA, FALSE, TRUE, FALSE, FALSE))
x = c(NA, 1L, 5L)
test(1695.5, x %between% c(3, 7), c(NA, FALSE, TRUE))
test(1695.6, x %between% c(NA, 7), c(NA, NA, NA))
test(1695.7, x %between% c(3, NA), c(NA, FALSE, NA))
test(1695.8, x %between% c(NA, NA), rep(NA, 3L))

test(1695.7, x %between% c(3, 7), c(NA, FALSE, TRUE))
test(1695.8, x %between% c(NA, 7), c(NA, TRUE, TRUE))
test(1695.9, x %between% c(3, NA), c(NA, FALSE, TRUE))
test(1695.11, x %between% c(NA, NA), c(NA, TRUE, TRUE))
x = rep(NA_integer_, 3)
test(1695.9, x %between% c(3, 7), rep(NA, 3L))
test(1695.10, x %between% c(NA, 7), rep(NA, 3L))
test(1695.11, x %between% c(3, NA), rep(NA, 3L))
test(1695.12, x %between% c(NA, NA), rep(NA, 3L))

test(1695.12, x %between% c(3, 7), rep(NA, 3L))
test(1695.13, x %between% c(NA, 7), rep(NA, 3L))
test(1695.14, x %between% c(3, NA), rep(NA, 3L))
test(1695.15, x %between% c(NA, NA), rep(NA, 3L))
x = integer(0)
test(1695.13, x %between% c(3, 7), logical(0))
test(1695.16, x %between% c(3, 7), logical(0))
test(1695.17, TRUE %between% c(3, 7), error="between has been x of type logical")
x = c("foo","bar","paz")
test(1695.18, between(x, "bag", "fog"), c(FALSE, TRUE, FALSE))
test(1695.19, between(x, c("b","f","a"), "q"), c(TRUE, FALSE, TRUE))
test(1695.20, between(x, c("foo","baq","bar"), "paz", incbounds=TRUE), c(TRUE, TRUE, TRUE))
test(1695.21, between(x, c("foo","baq","bar"), "paz", incbounds=FALSE), c(FALSE, TRUE, FALSE))
x = c(3.14, 3.20, -42, Inf)
test(1695.22, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf)), c(TRUE,FALSE,TRUE,TRUE))
test(1695.23, between(x, c(3,4,-60,5), c(3.14,10,-30,Inf), incbounds=FALSE), c(FALSE,FALSE,TRUE,FALSE))
test(1695.24, between(x, c(3,4,-60), 99), error="length(lower) (3) must be either 1 or length(x) (4)")
test(1695.25, between(x, c(3,4,-60,5), c(99,98)), error="length(upper) (2) must be either 1 or length(x) (4)")

# test for #1819, verbose message for bmerge
old_opt = getOption("datatable.verbose")
Expand Down
15 changes: 6 additions & 9 deletions man/between.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
Intended for use in \code{i} in \code{[.data.table}.

\code{between} is equivalent to \code{x >= lower & x <= upper} when
\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}.
\code{incbounds=TRUE}, or \code{x > lower & y < upper} when \code{FALSE}. With a caveat that
\code{NA} in \code{lower} or \code{upper} are taken as a missing bound and return \code{TRUE} not \code{NA}.

\code{inrange} checks whether each value in \code{x} is in between any of
the intervals provided in \code{lower,upper}.
Expand All @@ -24,28 +25,24 @@ x \%inrange\% y
\item{x}{ Any orderable vector, i.e., those with relevant methods for
\code{`<=`}, such as \code{numeric}, \code{character}, \code{Date}, etc. in
case of \code{between} and a numeric vector in case of \code{inrange}.}
\item{lower}{ Lower range bound.}
\item{upper}{ Upper range bound.}
\item{lower}{ Lower range bound. Either length 1 or same length as \code{x}.}
\item{upper}{ Upper range bound. Either length 1 or same length as \code{x}.}
\item{y}{ A length-2 \code{vector} or \code{list}, with \code{y[[1]]}
interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.}
\item{incbounds}{ \code{TRUE} means inclusive bounds, i.e., [lower,upper].
\code{FALSE} means exclusive bounds, i.e., (lower,upper).

It is set to \code{TRUE} by default for infix notations.}
}
\details{

From \code{v1.9.8+}, \code{between} is vectorised. \code{lower} and
\code{upper} are recycled to \code{length(x)} if necessary.

\emph{non-equi} joins were recently implemented in \code{v1.9.8}. It extends
\emph{non-equi} joins were implemented in \code{v1.9.8}. They extend
binary search based joins in \code{data.table} to other binary operators
including \code{>=, <=, >, <}. \code{inrange} makes use of this new
functionality and performs a range join.

}
\value{
Logical vector as the same length as \code{x} with value \code{TRUE} for those
Logical vector the same length as \code{x} with value \code{TRUE} for those
that lie within the specified range.
}
\note{ Current implementation does not make use of ordered keys for
Expand Down
161 changes: 74 additions & 87 deletions src/between.c
Original file line number Diff line number Diff line change
@@ -1,111 +1,98 @@
#include "data.table.h"
#include <Rdefines.h>

static double l=0.0, u=0.0;

static Rboolean int_upper_closed(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] <= u ? NA_LOGICAL : FALSE);
}

static Rboolean int_upper_open(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] < u ? NA_LOGICAL : FALSE);
}

static Rboolean int_lower_closed(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] >= l ? NA_LOGICAL : FALSE);
}

static Rboolean int_lower_open(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER || (double)INTEGER(x)[i] > l ? NA_LOGICAL : FALSE);
}

static Rboolean int_both_closed(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER ? NA_LOGICAL : ((double)INTEGER(x)[i] >= l && (double)INTEGER(x)[i] <= u));
}

static Rboolean int_both_open(SEXP x, R_len_t i) {
return (INTEGER(x)[i] == NA_INTEGER ? NA_LOGICAL : ((double)INTEGER(x)[i] > l && (double)INTEGER(x)[i] < u));
}

static Rboolean double_upper_closed(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) || REAL(x)[i] <= u ? NA_LOGICAL : FALSE);
}

static Rboolean double_upper_open(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) || REAL(x)[i] < u ? NA_LOGICAL : FALSE);
}

static Rboolean double_lower_closed(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) || REAL(x)[i] >= l ? NA_LOGICAL : FALSE);
}

static Rboolean double_lower_open(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) || REAL(x)[i] > l ? NA_LOGICAL : FALSE);
}

static Rboolean double_both_closed(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) ? NA_LOGICAL : (REAL(x)[i] >= l && REAL(x)[i] <= u));
}

static Rboolean double_both_open(SEXP x, R_len_t i) {
return (ISNAN(REAL(x)[i]) ? NA_LOGICAL : (REAL(x)[i] > l && REAL(x)[i] < u));
}

SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP bounds) {

R_len_t i, nx = length(x), nl = length(lower), nu = length(upper);
l = 0.0; u = 0.0;
SEXP ans;
Rboolean (*flower)(), (*fupper)(), (*fboth)();
R_len_t nx = length(x), nl = length(lower), nu = length(upper);
if (!nx || !nl || !nu)
return (allocVector(LGLSXP, 0));
if (nl != 1 && nl != nx)
error("length(lower) (%d) must be either 1 or length(x) (%d)", nl, nx);
if (nu != 1 && nu != nx)
error("length(upper) (%d) must be either 1 or length(x) (%d)", nu, nx);
if (!isLogical(bounds) || LOGICAL(bounds)[0] == NA_LOGICAL)
error("incbounds must be logical TRUE/FALSE.");
error("incbounds must be logical TRUE/FALSE."); // # nocov

This comment has been minimized.

Copy link
@jangorecki

jangorecki Apr 22, 2019

Member

this is not an internal error, so should be tested instead of nocov, will amend when resolving #3516


int nprotect = 0;
if (ALTREP(x)) { x = PROTECT(duplicate(x)); nprotect++; }
if (ALTREP(lower)) { lower = PROTECT(duplicate(lower)); nprotect++; }
if (ALTREP(upper)) { upper = PROTECT(duplicate(upper)); nprotect++; }
if (ALTREP(bounds)) { bounds = PROTECT(duplicate(bounds)); nprotect++; }

// no support for int64 yet (only handling most common cases)
// coerce to also get NA values properly
lower = PROTECT(coerceVector(lower, REALSXP)); l = REAL(lower)[0];
upper = PROTECT(coerceVector(upper, REALSXP)); u = REAL(upper)[0];
ans = PROTECT(allocVector(LGLSXP, nx));
nprotect += 3;

if (LOGICAL(bounds)[0]) {
fupper = isInteger(x) ? &int_upper_closed : &double_upper_closed;
flower = isInteger(x) ? &int_lower_closed : &double_lower_closed;
fboth = isInteger(x) ? &int_both_closed : &double_both_closed;
} else {
fupper = isInteger(x) ? &int_upper_open : &double_upper_open;
flower = isInteger(x) ? &int_lower_open : &double_lower_open;
fboth = isInteger(x) ? &int_both_open : &double_both_open;
bool integer=true;
if (isReal(x) || isReal(lower) || isReal(upper)) {
if (inherits(x,"integer64") || inherits(lower,"integer64") || inherits(upper,"integer64")) {
error("Internal error: one or more of x, lower and upper is type integer64 but this should have been caught by between() at R level."); // # nocov
}
integer=false;
lower = PROTECT(coerceVector(lower, REALSXP)); // these coerces will convert NA appropriately
upper = PROTECT(coerceVector(upper, REALSXP));
x = PROTECT(coerceVector(x, REALSXP));
nprotect += 3;
}

if ( ISNAN(REAL(lower)[0]) ) {
if ( ISNAN(REAL(upper)[0]) ) {
// TODO: sweep through lower and upper ensuring lower<=upper (inc bounds) and no lower>upper or lower==INT_MAX

const bool recycleLow = LENGTH(lower)==1;
const bool recycleUpp = LENGTH(upper)==1;
const bool open = !LOGICAL(bounds)[0];
SEXP ans = PROTECT(allocVector(LGLSXP, nx)); nprotect++;
int *restrict ansp = LOGICAL(ans);
if (integer) {
const int *lp = INTEGER(lower);
const int *up = INTEGER(upper);
const int *xp = INTEGER(x);
if (recycleLow && recycleUpp) {
const int l = lp[0] + open; // +open so we can always use >= and <=. NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too)
const int u = up[0]==NA_INTEGER ? INT_MAX : up[0] - open;
#pragma omp parallel for num_threads(getDTthreads())
for (i=0; i<nx; i++) LOGICAL(ans)[i] = NA_LOGICAL;
} else {
for (int i=0; i<nx; i++) {
int elem = xp[i];
ansp[i] = elem==NA_INTEGER ? NA_LOGICAL : (l<=elem && elem<=u);
}
}
else {
const int lowMask = recycleLow ? 0 : INT_MAX;
const int uppMask = recycleUpp ? 0 : INT_MAX;
#pragma omp parallel for num_threads(getDTthreads())
for (i=0; i<nx; i++) LOGICAL(ans)[i] = fupper(x, i);
for (int i=0; i<nx; i++) {
int elem = xp[i];
int l = lp[i&lowMask] +open;
int u = up[i&uppMask];
u = (u==NA_INTEGER) ? INT_MAX : u-open;
ansp[i] = elem==NA_INTEGER ? NA_LOGICAL : (l<=elem && elem<=u);
}
}
} else {
if ( ISNAN(REAL(upper)[0]) ) {
#pragma omp parallel for num_threads(getDTthreads())
for (i=0; i<nx; i++) LOGICAL(ans)[i] = flower(x, i);
} else {
// type real
const double *lp = REAL(lower);
const double *up = REAL(upper);
const double *xp = REAL(x);
if (recycleLow && recycleUpp) {
const double l = isnan(lp[0]) ? -INFINITY : lp[0];
const double u = isnan(up[0]) ? INFINITY : up[0];
if (open) {
#pragma omp parallel for num_threads(getDTthreads())
for (int i=0; i<nx; i++) {
double elem = xp[i];
ansp[i] = isnan(elem) ? NA_LOGICAL : (l<elem && elem<u);
}
} else {
#pragma omp parallel for num_threads(getDTthreads())
for (int i=0; i<nx; i++) {
double elem = xp[i];
ansp[i] = isnan(elem) ? NA_LOGICAL : (l<=elem && elem<=u);
}
}
}
else {
const int lowMask = recycleLow ? 0 : INT_MAX;
const int uppMask = recycleUpp ? 0 : INT_MAX;
#pragma omp parallel for num_threads(getDTthreads())
for (i=0; i<nx; i++) LOGICAL(ans)[i] = fboth(x, i);
for (int i=0; i<nx; i++) {
double elem = xp[i];
double l = lp[i&lowMask];
double u = up[i&uppMask];
if (isnan(l)) l=-INFINITY;
if (isnan(u)) u= INFINITY;
ansp[i] = isnan(elem) ? NA_LOGICAL : (open ? l<elem && elem<u : l<=elem && elem<=u);
}
}
}
UNPROTECT(nprotect);
return(ans);
}

0 comments on commit e6913a4

Please sign in to comment.