From 612691ac1e3b808bc6c5a9945f69db44f374ef13 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 5 Mar 2019 16:44:13 +0530 Subject: [PATCH 1/3] join retain key, roll branch only when non missing roll arg, closes #3441 --- NEWS.md | 2 ++ R/data.table.R | 8 +++++++- inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 24ecb1a09..04d4c468f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 10. Quoted expression having `:=` and dot alias in RHS now works as expected. Thanks to @franknarf1 for raising up issue on [StackOverflow](https://stackoverflow.com/questions/41228076/using-data-tables-shortcut-in-quoted-expressions) and @jangorecki for PR. +11. Key is properly removed in an edge case not handled before, when joining a *keyed* table to a 2+ length vector wrapped in list when only single row match. Thanks to @symbalex for reporting and @franknarf1 for tiny reproducible example. Closes [#3441](https://github.com/Rdatatable/data.table/issues/3441). + #### NOTES 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. diff --git a/R/data.table.R b/R/data.table.R index 2052d1c92..40aa970ad 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1333,7 +1333,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { ## check key on i as well! ichk = is.data.table(i) && haskey(i) && identical(head(key(i), length(leftcols)), names(i)[leftcols]) # i has the correct key, #3061 - if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE + if (keylen && ( + ichk || is.logical(i) || ( + .Call(CisOrderedSubset, irows, nrow(x)) && !missingroll && ( # !missingroll fix for #3441 + (roll == FALSE) || length(irows) == 1L + ) + ) + )) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE setattr(ans,"sorted",head(key(x),keylen)) } setattr(ans, "class", class(x)) # fix for #5296 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 52701f0ed..922eff2d6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13618,6 +13618,14 @@ x = 5L test(1998.1, as.IDate(x), output = '1970-01-06') test(1998.2, class(x), 'integer') +# Joining a keyed table on a non-keyed table is not working sometimes #3441 +dx = data.table(id = "A", key = "id") +di = list(c("D", "A")) +test(1999.1, key(dx[di]), NULL) +dx = data.table(id = 1L, key = "id") +di = list(z=c(2L, 1L)) +test(1999.2, key(dx[di]), NULL) + ################################### # Add new tests above this line # From b6cb0f5e68c8af0a7432ad3626279c5695dd8c55 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 5 Mar 2019 17:23:48 +0530 Subject: [PATCH 2/3] detect NAs in isOrderedSubset, closes #3441 --- R/data.table.R | 8 +------- src/forder.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 40aa970ad..2052d1c92 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1333,13 +1333,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { ## check key on i as well! ichk = is.data.table(i) && haskey(i) && identical(head(key(i), length(leftcols)), names(i)[leftcols]) # i has the correct key, #3061 - if (keylen && ( - ichk || is.logical(i) || ( - .Call(CisOrderedSubset, irows, nrow(x)) && !missingroll && ( # !missingroll fix for #3441 - (roll == FALSE) || length(irows) == 1L - ) - ) - )) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE + if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE setattr(ans,"sorted",head(key(x),keylen)) } setattr(ans, "class", class(x)) # fix for #5296 diff --git a/src/forder.c b/src/forder.c index 8a53e82c2..3782ce398 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1248,17 +1248,21 @@ SEXP isOrderedSubset(SEXP x, SEXP nrow) // specialized for use in [.data.table only // Ignores 0s but heeds NAs and any out-of-range (which result in NA) { - int i=0, last, elem; if (!length(x)) return(ScalarLogical(TRUE)); if (!isInteger(x)) error("x has non-0 length but isn't an integer vector"); if (!isInteger(nrow) || LENGTH(nrow)!=1 || INTEGER(nrow)[0]<0) error("nrow must be integer vector length 1 and >=0"); if (LENGTH(x)<=1) return(ScalarLogical(TRUE)); - while (i INTEGER(nrow)[0]) return(ScalarLogical(FALSE)); From 87b9233165f61a646a9cfe12ea18695fe80a90e8 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 5 Mar 2019 15:52:57 -0800 Subject: [PATCH 3/3] tidy --- NEWS.md | 2 +- inst/tests/tests.Rraw | 12 ++++++------ src/forder.c | 29 +++++++++++------------------ 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index 04d4c468f..610f678e3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,7 +32,7 @@ 10. Quoted expression having `:=` and dot alias in RHS now works as expected. Thanks to @franknarf1 for raising up issue on [StackOverflow](https://stackoverflow.com/questions/41228076/using-data-tables-shortcut-in-quoted-expressions) and @jangorecki for PR. -11. Key is properly removed in an edge case not handled before, when joining a *keyed* table to a 2+ length vector wrapped in list when only single row match. Thanks to @symbalex for reporting and @franknarf1 for tiny reproducible example. Closes [#3441](https://github.com/Rdatatable/data.table/issues/3441). +11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 922eff2d6..67f5d22a4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13454,7 +13454,7 @@ test(1986.4, DT[, as.list(colMeans(.SD)), by=gear], cbind(DT[,"gear"],DT[,-"gear # tests for #2949, #1974 and #1369 - dcast not able to handle functions referred to by a variable dt = data.table( x=sample(5,20,TRUE), - y=sample(2,20,TRUE), + y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L @@ -13466,7 +13466,7 @@ myFun1 <- function(data, vars) { myFun2 <- function(data, vars) { myFuns <- list(f1=sum, first=function(x) x[1L]) dcast.data.table(data, "x + y ~ z", value.var=vars, fun.aggregate=myFuns) -} +} funs = list(sum, mean) vars = list("d1", "d2") @@ -13508,7 +13508,7 @@ DT2 <- data.table( flight=sample(codes[1200:2000], 1e4, TRUE) ) DT2[, c("start", "end") := .(pmin(start, end), pmax(start, end))] -# just testing if the code runs without segfault .. +# just testing if the code runs without segfault .. test(1989.1, nrow(DT1[DT2, on=.(date <= end, date >= start, flight==flight)]) > 0L, TRUE) # fix for #2202, dcast needs to rank NA with na.last=FALSE in frankv within dcast @@ -13537,9 +13537,9 @@ DT_y <- data.table(y1 = c(1,10), y2 = c(9,50), yval = c("y_a","y_b"), key = c("y test(1992.1, foverlaps(DT_x, DT_y), error="All rows with NA values") # foverlaps POSIXct checks #1143 + another check... -xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L), +xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L), hour = c(14L, 8L, 8L), min = c(30L, 0L, 30L), val = c(0.5, 0.3, 0.4), - Date = structure(c(1167229800, 1167206400, 1167121800), + Date = structure(c(1167229800, 1167206400, 1167121800), class = c("POSIXct", "POSIXt"), tzone = "UTC")) ## "UTC" time zone setDT(xp)[, `:=`(start = Date - 1800L, end = Date + 1800L)] tt <- as.POSIXct(c("2006/12/27 14:23:59", "2006/12/27 16:47:59", "2006/12/27 19:12:00"), format = "%Y/%m/%d %T", tz = "Asia/Jerusalem") ## different time zone @@ -13618,7 +13618,7 @@ x = 5L test(1998.1, as.IDate(x), output = '1970-01-06') test(1998.2, class(x), 'integer') -# Joining a keyed table on a non-keyed table is not working sometimes #3441 +# a single NA at the beginning with no other nomatch would cause incorrect key, #3441 dx = data.table(id = "A", key = "id") di = list(c("D", "A")) test(1999.1, key(dx[di]), NULL) diff --git a/src/forder.c b/src/forder.c index 3782ce398..18dc5a4c1 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1244,27 +1244,20 @@ SEXP fsorted(SEXP x) return ScalarLogical(i==n); } -SEXP isOrderedSubset(SEXP x, SEXP nrow) +SEXP isOrderedSubset(SEXP x, SEXP nrowArg) // specialized for use in [.data.table only // Ignores 0s but heeds NAs and any out-of-range (which result in NA) { - if (!length(x)) return(ScalarLogical(TRUE)); - if (!isInteger(x)) error("x has non-0 length but isn't an integer vector"); - if (!isInteger(nrow) || LENGTH(nrow)!=1 || INTEGER(nrow)[0]<0) error("nrow must be integer vector length 1 and >=0"); - if (LENGTH(x)<=1) return(ScalarLogical(TRUE)); - int* ix = INTEGER(x); - int i; - for (i=0; i INTEGER(nrow)[0]) + if (!isNull(x) && !isInteger(x)) error("x must be either NULL or an integer vector"); + if (length(x)<=1) return(ScalarLogical(TRUE)); // a single NA when length(x)==1 is ordered (e.g. tests 128 & 130) otherwise anyNA => FALSE + if (!isInteger(nrowArg) || LENGTH(nrowArg)!=1) error("nrow must be integer vector length 1"); + const int nrow = INTEGER(nrowArg)[0]; + if (nrow<0) error("nrow==%d but must be >=0", nrow); + const int *xd = INTEGER(x), xlen=LENGTH(x); + for (int i=0, last=INT_MIN; inrow) return(ScalarLogical(FALSE)); last = elem; }