From 612691ac1e3b808bc6c5a9945f69db44f374ef13 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 5 Mar 2019 16:44:13 +0530
Subject: [PATCH 1/3] join retain key, roll branch only when non missing roll
 arg, closes #3441

---
 NEWS.md               | 2 ++
 R/data.table.R        | 8 +++++++-
 inst/tests/tests.Rraw | 8 ++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 24ecb1a09..04d4c468f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -32,6 +32,8 @@
 
 10. Quoted expression having `:=` and dot alias in RHS now works as expected. Thanks to @franknarf1 for raising up issue on [StackOverflow](https://stackoverflow.com/questions/41228076/using-data-tables-shortcut-in-quoted-expressions) and @jangorecki for PR.
 
+11. Key is properly removed in an edge case not handled before, when joining a *keyed* table to a 2+ length vector wrapped in list when only single row match. Thanks to @symbalex for reporting and @franknarf1 for tiny reproducible example. Closes [#3441](https://github.com/Rdatatable/data.table/issues/3441).
+
 #### NOTES
 
 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background.
diff --git a/R/data.table.R b/R/data.table.R
index 2052d1c92..40aa970ad 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1333,7 +1333,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
           ## check key on i as well!
           ichk = is.data.table(i) && haskey(i) &&
                  identical(head(key(i), length(leftcols)), names(i)[leftcols]) # i has the correct key, #3061
-          if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
+          if (keylen && (
+            ichk || is.logical(i) || (
+              .Call(CisOrderedSubset, irows, nrow(x)) && !missingroll && ( # !missingroll fix for #3441
+                (roll == FALSE) || length(irows) == 1L
+                )
+              )
+            )) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
             setattr(ans,"sorted",head(key(x),keylen))
         }
         setattr(ans, "class", class(x)) # fix for #5296
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 52701f0ed..922eff2d6 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13618,6 +13618,14 @@ x = 5L
 test(1998.1, as.IDate(x), output = '1970-01-06')
 test(1998.2, class(x), 'integer')
 
+# Joining a keyed table on a non-keyed table is not working sometimes #3441
+dx = data.table(id = "A", key = "id")
+di = list(c("D", "A"))
+test(1999.1, key(dx[di]), NULL)
+dx = data.table(id = 1L, key = "id")
+di = list(z=c(2L, 1L))
+test(1999.2, key(dx[di]), NULL)
+
 
 ###################################
 #  Add new tests above this line  #

From b6cb0f5e68c8af0a7432ad3626279c5695dd8c55 Mon Sep 17 00:00:00 2001
From: jangorecki <j.gorecki@wit.edu.pl>
Date: Tue, 5 Mar 2019 17:23:48 +0530
Subject: [PATCH 2/3] detect NAs in isOrderedSubset, closes #3441

---
 R/data.table.R |  8 +-------
 src/forder.c   | 12 ++++++++----
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 40aa970ad..2052d1c92 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1333,13 +1333,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
           ## check key on i as well!
           ichk = is.data.table(i) && haskey(i) &&
                  identical(head(key(i), length(leftcols)), names(i)[leftcols]) # i has the correct key, #3061
-          if (keylen && (
-            ichk || is.logical(i) || (
-              .Call(CisOrderedSubset, irows, nrow(x)) && !missingroll && ( # !missingroll fix for #3441
-                (roll == FALSE) || length(irows) == 1L
-                )
-              )
-            )) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
+          if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
             setattr(ans,"sorted",head(key(x),keylen))
         }
         setattr(ans, "class", class(x)) # fix for #5296
diff --git a/src/forder.c b/src/forder.c
index 8a53e82c2..3782ce398 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1248,17 +1248,21 @@ SEXP isOrderedSubset(SEXP x, SEXP nrow)
 // specialized for use in [.data.table only
 // Ignores 0s but heeds NAs and any out-of-range (which result in NA)
 {
-  int i=0, last, elem;
   if (!length(x)) return(ScalarLogical(TRUE));
   if (!isInteger(x)) error("x has non-0 length but isn't an integer vector");
   if (!isInteger(nrow) || LENGTH(nrow)!=1 || INTEGER(nrow)[0]<0) error("nrow must be integer vector length 1 and >=0");
   if (LENGTH(x)<=1) return(ScalarLogical(TRUE));
-  while (i<LENGTH(x) && INTEGER(x)[i]==0) i++;
+  int* ix = INTEGER(x);
+  int i;
+  for (i=0; i<LENGTH(x); i++) {
+    if (ix[i]==NA_INTEGER) return(ScalarLogical(FALSE)); // fix for #3441
+    if (ix[i]!=0) break;
+  }
   if (i==LENGTH(x)) return(ScalarLogical(TRUE));
-  last = INTEGER(x)[i];  // the first non-0
+  int last = ix[i];  // the first non-0
   i++;
   for (; i<LENGTH(x); i++) {
-    elem = INTEGER(x)[i];
+    int elem = ix[i];
     if (elem == 0) continue;
     if (elem < last || elem < 0 || elem > INTEGER(nrow)[0])
       return(ScalarLogical(FALSE));

From 87b9233165f61a646a9cfe12ea18695fe80a90e8 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 5 Mar 2019 15:52:57 -0800
Subject: [PATCH 3/3] tidy

---
 NEWS.md               |  2 +-
 inst/tests/tests.Rraw | 12 ++++++------
 src/forder.c          | 29 +++++++++++------------------
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 04d4c468f..610f678e3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -32,7 +32,7 @@
 
 10. Quoted expression having `:=` and dot alias in RHS now works as expected. Thanks to @franknarf1 for raising up issue on [StackOverflow](https://stackoverflow.com/questions/41228076/using-data-tables-shortcut-in-quoted-expressions) and @jangorecki for PR.
 
-11. Key is properly removed in an edge case not handled before, when joining a *keyed* table to a 2+ length vector wrapped in list when only single row match. Thanks to @symbalex for reporting and @franknarf1 for tiny reproducible example. Closes [#3441](https://github.com/Rdatatable/data.table/issues/3441).
+11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause.
 
 #### NOTES
 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 922eff2d6..67f5d22a4 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13454,7 +13454,7 @@ test(1986.4, DT[, as.list(colMeans(.SD)), by=gear], cbind(DT[,"gear"],DT[,-"gear
 # tests for #2949, #1974 and #1369 - dcast not able to handle functions referred to by a variable
 dt = data.table(
   x=sample(5,20,TRUE),
-  y=sample(2,20,TRUE), 
+  y=sample(2,20,TRUE),
   z=sample(letters[1:2], 20,TRUE),
   d1 = runif(20),
   d2=1L
@@ -13466,7 +13466,7 @@ myFun1 <- function(data, vars) {
 myFun2 <- function(data, vars) {
   myFuns <- list(f1=sum, first=function(x) x[1L])
   dcast.data.table(data, "x + y ~ z", value.var=vars, fun.aggregate=myFuns)
-} 
+}
 funs = list(sum, mean)
 vars = list("d1", "d2")
 
@@ -13508,7 +13508,7 @@ DT2 <- data.table(
   flight=sample(codes[1200:2000], 1e4, TRUE)
 )
 DT2[, c("start", "end") := .(pmin(start, end), pmax(start, end))]
-# just testing if the code runs without segfault .. 
+# just testing if the code runs without segfault ..
 test(1989.1, nrow(DT1[DT2, on=.(date <= end, date >= start, flight==flight)]) > 0L, TRUE)
 
 # fix for #2202, dcast needs to rank NA with na.last=FALSE in frankv within dcast
@@ -13537,9 +13537,9 @@ DT_y <- data.table(y1 = c(1,10), y2 = c(9,50), yval = c("y_a","y_b"), key = c("y
 test(1992.1, foverlaps(DT_x, DT_y), error="All rows with NA values")
 
 # foverlaps POSIXct checks #1143 + another check...
-xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L), 
+xp <- data.frame(year = c(2006L, 2006L, 2006L), day = c(361L, 361L, 360L),
                  hour = c(14L, 8L, 8L), min = c(30L, 0L, 30L), val = c(0.5, 0.3, 0.4),
-                 Date = structure(c(1167229800, 1167206400, 1167121800), 
+                 Date = structure(c(1167229800, 1167206400, 1167121800),
                           class = c("POSIXct", "POSIXt"), tzone = "UTC")) ## "UTC" time zone
 setDT(xp)[, `:=`(start = Date - 1800L, end = Date + 1800L)]
 tt <- as.POSIXct(c("2006/12/27 14:23:59", "2006/12/27 16:47:59", "2006/12/27 19:12:00"), format = "%Y/%m/%d %T", tz = "Asia/Jerusalem") ## different time zone
@@ -13618,7 +13618,7 @@ x = 5L
 test(1998.1, as.IDate(x), output = '1970-01-06')
 test(1998.2, class(x), 'integer')
 
-# Joining a keyed table on a non-keyed table is not working sometimes #3441
+# a single NA at the beginning with no other nomatch would cause incorrect key, #3441
 dx = data.table(id = "A", key = "id")
 di = list(c("D", "A"))
 test(1999.1, key(dx[di]), NULL)
diff --git a/src/forder.c b/src/forder.c
index 3782ce398..18dc5a4c1 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1244,27 +1244,20 @@ SEXP fsorted(SEXP x)
   return ScalarLogical(i==n);
 }
 
-SEXP isOrderedSubset(SEXP x, SEXP nrow)
+SEXP isOrderedSubset(SEXP x, SEXP nrowArg)
 // specialized for use in [.data.table only
 // Ignores 0s but heeds NAs and any out-of-range (which result in NA)
 {
-  if (!length(x)) return(ScalarLogical(TRUE));
-  if (!isInteger(x)) error("x has non-0 length but isn't an integer vector");
-  if (!isInteger(nrow) || LENGTH(nrow)!=1 || INTEGER(nrow)[0]<0) error("nrow must be integer vector length 1 and >=0");
-  if (LENGTH(x)<=1) return(ScalarLogical(TRUE));
-  int* ix = INTEGER(x);
-  int i;
-  for (i=0; i<LENGTH(x); i++) {
-    if (ix[i]==NA_INTEGER) return(ScalarLogical(FALSE)); // fix for #3441
-    if (ix[i]!=0) break;
-  }
-  if (i==LENGTH(x)) return(ScalarLogical(TRUE));
-  int last = ix[i];  // the first non-0
-  i++;
-  for (; i<LENGTH(x); i++) {
-    int elem = ix[i];
-    if (elem == 0) continue;
-    if (elem < last || elem < 0 || elem > INTEGER(nrow)[0])
+  if (!isNull(x) && !isInteger(x)) error("x must be either NULL or an integer vector");
+  if (length(x)<=1) return(ScalarLogical(TRUE));  // a single NA when length(x)==1 is ordered (e.g. tests 128 & 130) otherwise anyNA => FALSE
+  if (!isInteger(nrowArg) || LENGTH(nrowArg)!=1) error("nrow must be integer vector length 1");
+  const int nrow = INTEGER(nrowArg)[0];
+  if (nrow<0) error("nrow==%d but must be >=0", nrow);
+  const int *xd = INTEGER(x), xlen=LENGTH(x);
+  for (int i=0, last=INT_MIN; i<xlen; ++i) {
+    int elem = xd[i];
+    if (elem==0) continue;
+    if (elem<last || elem<0/*includes NA==INT_MIN*/ || elem>nrow)
       return(ScalarLogical(FALSE));
     last = elem;
   }