From 3c487ed60e21aeaad49877f533ba78245bd23149 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 4 Apr 2020 01:43:53 +0100 Subject: [PATCH 1/8] int i and nomatch arg, closes #3109 --- NEWS.md | 11 +++++ R/data.table.R | 6 ++- inst/tests/tests.Rraw | 15 +++++++ src/subset.c | 97 +++++++++++++++++++++++++++++++------------ 4 files changed, 101 insertions(+), 28 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71fd76aa6..9662351a7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -81,6 +81,17 @@ unit = "s") 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR. +15. It is now possible to use `nomatch=NULL` argument and integer `i` to specify rows to subset but return only those rows which exists in data.table, closes [#3109](https://github.com/Rdatatable/data.table/issues/3109), [#3666](https://github.com/Rdatatable/data.table/issues/3666). Thanks @mllg and @hadley for feature request. + +```r +DT = data.table(x = 1:4) +DT[c(1L, NA_integer_, 3L, 5L), nomatch = 0L] +# x +# +#1: 1 +#2: 3 +``` + ## BUG FIXES 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085). diff --git a/R/data.table.R b/R/data.table.R index 9292ee940..933f30a52 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -567,8 +567,10 @@ replace_dot_alias = function(e) { else stop("i evaluates to a logical vector length ", length(i), " but there are ", nrow(x), " rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.") } else { irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) - irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning) - # simplifies logic from here on: can assume positive subscripts (no zeros) + irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), + is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning) + identical(nomatch,0L)) # nomatchArg, keeps only shuffle #3109 + # simplifies logic from here on: can assume positive subscripts (no zeros), for a nomatch=0L also no NAs # maintains Arun's fix for #2697 (test 1042) # efficient in C with more detailed helpful messages when user mixes positives and negatives # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8..a26e26c36 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16846,3 +16846,18 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# Respect `nomatch = 0L` for integer i #3109, also #3666 +dt = data.table(x = 1:4) +test(2139.01, dt[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # test the default nomatch +test(2139.02, dt[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) +test(2139.03, dt[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) +test(2139.04, dt[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx +test(2139.05, dt[c(1:4,-1L), nomatch=NULL], error="i argument must not be negative") +test(2139.06, dt[c(1:4,NA_integer_,-1L), nomatch=NULL], error="i argument must not be negative") +test(2139.07, dt[NA, nomatch=NULL], data.table(x=integer())) +test(2139.08, dt[0L, nomatch=NULL], data.table(x=integer())) +test(2139.09, dt[0:1, nomatch=NULL], data.table(x=1L)) +test(2139.10, dt[-1L, nomatch=NULL], error="i argument must not be negative") +test(2139.11, data.table()[1L, nomatch=NULL], data.table()) +test(2139.12, data.table()[-1L, nomatch=NULL], error="i argument must not be negative") diff --git a/src/subset.c b/src/subset.c index d9fea2800..bde0033db 100644 --- a/src/subset.c +++ b/src/subset.c @@ -106,51 +106,79 @@ static const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSu return NULL; } -SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) +SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatchArg) { // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time) // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway) // allowOverMaxArg is false when := (test 1024), otherwise true for selecting + // nomatchArg is true (nomatch==0L) when we want to skip nomatch'ing rows, it also excludes NAs #3109 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE")); // # nocov - int max = INTEGER(maxArg)[0], n=LENGTH(idx); + if (!isLogical(nomatchArg) || LENGTH(nomatchArg)!=1 || LOGICAL(nomatchArg)[0]==NA_LOGICAL) error(_("Internal error: nomatchArg must be TRUE/FALSE")); // # nocov + int max = INTEGER(maxArg)[0], n=LENGTH(idx), nomatch = LOGICAL(nomatchArg)[0]; if (max<0) error(_("Internal error. max is %d, must be >= 0."), max); // # nocov includes NA which will print as INT_MIN int *idxp = INTEGER(idx); bool stop = false; - #pragma omp parallel for num_threads(getDTthreads()) - for (int i=0; imax) stop=true; + if (!nomatch) { + #pragma omp parallel for num_threads(getDTthreads()) + for (int i=0; imax) + stop = true; + } + } else { // nomatch + #pragma omp parallel for num_threads(getDTthreads()) + for (int i=0; imax) + stop = true; + } } - if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max] + if (!stop) + return(idx); // most common case to return early: no 0, no negative; all idx either NA (for !nomatch) or in range [1-max] // --------- - // else massage the input to a standard idx where all items are either NA or in range [1,max] ... + // else massage the input to a standard idx where all items are either NA (for !nomatch) or in range [1,max] ... - int countNeg=0, countZero=0, countNA=0, firstOverMax=0; + int countNeg=0, countZero=0, countNA=0, countOverMax=0, firstOverMax=0; for (int i=0; imax && firstOverMax==0) firstOverMax=i+1; + if (elem==NA_INTEGER) + countNA++; + else if (elem<0) + countNeg++; + else if (elem==0) + countZero++; + else if (elem>max) { + countOverMax++; + if (firstOverMax==0) + firstOverMax=i+1; + } } if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) { error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max); } + if (countNeg && nomatch) { + error(_("i argument must not be negative when used together with nomatch=NULL")); + } int countPos = n-countNeg-countZero-countNA; if (countPos && countNeg) { int i=0, firstNeg=0, firstPos=0; while (i0) firstPos=i+1; - if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1; + if (firstPos==0 && elem>0) + firstPos=i+1; + if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) + firstNeg=i+1; i++; } error(_("Item %d of i is %d and item %d is %d. Cannot mix positives and negatives."), firstNeg, idxp[firstNeg-1], firstPos, idxp[firstPos-1]); @@ -159,40 +187,56 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) int i=0, firstNeg=0, firstNA=0; while (imax) + continue; + ansp[ansi++] = elem; + } + } else if (countNeg==0) { // just zeros to remove, or >max to convert to NA ans = PROTECT(allocVector(INTSXP, n - countZero)); int *ansp = INTEGER(ans); for (int i=0, ansi=0; imax ? NA_INTEGER : elem; } } else { // idx is all negative without any NA but perhaps some zeros bool *keep = (bool *)R_alloc(max, sizeof(bool)); // 4 times less memory that INTSXP in src/main/subscript.c - for (int i=0; imax) { countBeyond++; - if (firstBeyond==0) firstBeyond=i+1; + if (firstBeyond==0) + firstBeyond=i+1; continue; } if (!keep[elem-1]) { countDup++; - if (firstDup==0) firstDup=i+1; + if (firstDup==0) + firstDup=i+1; } else { keep[elem-1] = false; countRemoved++; @@ -206,7 +250,8 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) ans = PROTECT(allocVector(INTSXP, ansn)); int *ansp = INTEGER(ans); for (int i=0, ansi=0; i Date: Sat, 4 Apr 2020 02:09:11 +0100 Subject: [PATCH 2/8] afix test 798, notjoin and negative subset --- inst/tests/tests.Rraw | 19 ++++++++++--------- src/subset.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a26e26c36..b0f1abb1f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2217,7 +2217,7 @@ test(794, DT[!"b"], DT[c("a","c")]) test(795, DT[!0], DT) test(796, DT[!NULL], DT[NULL]) test(797, DT[!integer()], DT) -test(798, DT[!-1], DT[1]) +test(798, DT[!-1], DT[1]) # this passed nomatch==0 to convertNegAndZeroIdx #3109 test(799, DT[--1], DT[1]) myi = c("a","c") test(800, DT[!myi], DT["b"]) @@ -16853,11 +16853,12 @@ test(2139.01, dt[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integ test(2139.02, dt[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) test(2139.03, dt[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) test(2139.04, dt[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx -test(2139.05, dt[c(1:4,-1L), nomatch=NULL], error="i argument must not be negative") -test(2139.06, dt[c(1:4,NA_integer_,-1L), nomatch=NULL], error="i argument must not be negative") -test(2139.07, dt[NA, nomatch=NULL], data.table(x=integer())) -test(2139.08, dt[0L, nomatch=NULL], data.table(x=integer())) -test(2139.09, dt[0:1, nomatch=NULL], data.table(x=1L)) -test(2139.10, dt[-1L, nomatch=NULL], error="i argument must not be negative") -test(2139.11, data.table()[1L, nomatch=NULL], data.table()) -test(2139.12, data.table()[-1L, nomatch=NULL], error="i argument must not be negative") +test(2139.05, dt[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2139.06, dt[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2139.07, dt[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA") +test(2139.08, dt[NA, nomatch=NULL], data.table(x=integer())) +test(2139.09, dt[0L, nomatch=NULL], data.table(x=integer())) +test(2139.10, dt[0:1, nomatch=NULL], data.table(x=1L)) +test(2139.11, dt[-1L, nomatch=NULL], data.table(x=2:4)) +test(2139.12, data.table()[1L, nomatch=NULL], data.table()) +test(2139.13, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") diff --git a/src/subset.c b/src/subset.c index bde0033db..f9366b47b 100644 --- a/src/subset.c +++ b/src/subset.c @@ -166,9 +166,6 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) { error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max); } - if (countNeg && nomatch) { - error(_("i argument must not be negative when used together with nomatch=NULL")); - } int countPos = n-countNeg-countZero-countNA; if (countPos && countNeg) { @@ -197,24 +194,26 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch } SEXP ans; - if (nomatch) { // also countNeg==0 checked before - ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); - int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax) - continue; - ansp[ansi++] = elem; - } - } else if (countNeg==0) { - // just zeros to remove, or >max to convert to NA - ans = PROTECT(allocVector(INTSXP, n - countZero)); - int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + if (countNeg==0) { + if (nomatch) { + ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax) + continue; + ansp[ansi++] = elem; + } + } else { + // just zeros to remove, or >max to convert to NA + ans = PROTECT(allocVector(INTSXP, n - countZero)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + } } } else { // idx is all negative without any NA but perhaps some zeros From 8879054889b1194028b487ad605298a84d59d216 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 4 Apr 2020 02:12:56 +0100 Subject: [PATCH 3/8] use proper nomatch value --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9662351a7..4bf31abe8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -85,7 +85,7 @@ unit = "s") ```r DT = data.table(x = 1:4) -DT[c(1L, NA_integer_, 3L, 5L), nomatch = 0L] +DT[c(1L, NA_integer_, 3L, 5L), nomatch = NULL] # x # #1: 1 From 8c7ff0a27fc71ec513ca8730732f62316e3dba63 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 4 Apr 2020 11:16:07 +0100 Subject: [PATCH 4/8] fix missing codecov --- inst/tests/tests.Rraw | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0f1abb1f..a605dc83e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16862,3 +16862,6 @@ test(2139.10, dt[0:1, nomatch=NULL], data.table(x=1L)) test(2139.11, dt[-1L, nomatch=NULL], data.table(x=2:4)) test(2139.12, data.table()[1L, nomatch=NULL], data.table()) test(2139.13, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") +test(2139.14, dt[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch +test(2139.15, dt[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) + From 7434ed6a1ede1c08be543cfe354c6be9559ec912 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sat, 21 Aug 2021 13:47:45 -0600 Subject: [PATCH 5/8] undo line splitting so I can see what logic changes are; pending C same-line-branch codecov https://github.com/Rdatatable/data.table/pull/4301#issuecomment-903161605 --- src/subset.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/subset.c b/src/subset.c index 576428a3c..b601eb5e7 100644 --- a/src/subset.c +++ b/src/subset.c @@ -160,8 +160,7 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch stop = true; } } - if (!stop) - return(idx); // most common case to return early: no 0, no negative; all idx either NA (for !nomatch) or in range [1-max] + if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA (for !nomatch) or in range [1-max] // --------- // else massage the input to a standard idx where all items are either NA (for !nomatch) or in range [1,max] ... @@ -169,12 +168,9 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch int countNeg=0, countZero=0, countNA=0, countOverMax=0, firstOverMax=0; for (int i=0; imax) { countOverMax++; if (firstOverMax==0) @@ -190,10 +186,8 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch int i=0, firstNeg=0, firstPos=0; while (i0) - firstPos=i+1; - if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) - firstNeg=i+1; + if (firstPos==0 && elem>0) firstPos=i+1; + if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1; i++; } error(_("Item %d of i is %d and item %d is %d. Cannot mix positives and negatives."), firstNeg, idxp[firstNeg-1], firstPos, idxp[firstPos-1]); @@ -202,10 +196,8 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch int i=0, firstNeg=0, firstNA=0; while (imax) { countBeyond++; - if (firstBeyond==0) - firstBeyond=i+1; + if (firstBeyond==0) firstBeyond=i+1; continue; } if (!keep[elem-1]) { countDup++; - if (firstDup==0) - firstDup=i+1; + if (firstDup==0) firstDup=i+1; } else { keep[elem-1] = false; countRemoved++; @@ -267,8 +255,7 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch ans = PROTECT(allocVector(INTSXP, ansn)); int *ansp = INTEGER(ans); for (int i=0, ansi=0; i Date: Sat, 21 Aug 2021 23:12:01 -0600 Subject: [PATCH 6/8] reduce new code, rename nomatch to allowNA --- R/data.table.R | 2 +- src/subset.c | 73 +++++++++++++++++++------------------------------- 2 files changed, 29 insertions(+), 46 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 3f90abbcb..adc7198f9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -621,7 +621,7 @@ replace_dot_alias = function(e) { irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning) - identical(nomatch,0L)) # nomatchArg, keeps only shuffle #3109 + is.na(nomatch)) # allowNA=false when nomatch=NULL (converted to 0L at the top currently) #3109 #3666 # simplifies logic from here on: can assume positive subscripts (no zeros), for a nomatch=0L also no NAs # maintains Arun's fix for #2697 (test 1042) # efficient in C with more detailed helpful messages when user mixes positives and negatives diff --git a/src/subset.c b/src/subset.c index b601eb5e7..a04fcc9ec 100644 --- a/src/subset.c +++ b/src/subset.c @@ -124,58 +124,42 @@ const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_ou return NULL; } -SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatchArg) +SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP allowNAArg) { // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time) // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway) // allowOverMaxArg is false when := (test 1024), otherwise true for selecting - // nomatchArg is true (nomatch==0L) when we want to skip nomatch'ing rows, it also excludes NAs #3109 + // allowNAArg is false when nomatch=NULL #3109 #3666 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE")); // # nocov - if (!isLogical(nomatchArg) || LENGTH(nomatchArg)!=1 || LOGICAL(nomatchArg)[0]==NA_LOGICAL) error(_("Internal error: nomatchArg must be TRUE/FALSE")); // # nocov - int max = INTEGER(maxArg)[0], n=LENGTH(idx), nomatch = LOGICAL(nomatchArg)[0]; + const int max = INTEGER(maxArg)[0], n=LENGTH(idx); if (max<0) error(_("Internal error. max is %d, must be >= 0."), max); // # nocov includes NA which will print as INT_MIN - int *idxp = INTEGER(idx); + if (!isLogical(allowNAArg) || LENGTH(allowNAArg)!=1 || LOGICAL(allowNAArg)[0]==NA_LOGICAL) error(_("Internal error: allowNAArg must be TRUE/FALSE")); // # nocov + const bool allowNA = LOGICAL(allowNAArg)[0]; + const int *idxp = INTEGER(idx); bool stop = false; - if (!nomatch) { - #pragma omp parallel for num_threads(getDTthreads(n, true)) - for (int i=0; imax) - stop = true; - } - } else { - #pragma omp parallel for num_threads(getDTthreads(n, true)) - for (int i=0; imax) - stop = true; - } + #pragma omp parallel for num_threads(getDTthreads(n, true)) + for (int i=0; imax) stop=true; } - if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA (for !nomatch) or in range [1-max] + if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA (if allowNA) or in range [1-max] // --------- - // else massage the input to a standard idx where all items are either NA (for !nomatch) or in range [1,max] ... + // else massage the input to a standard idx where all items are either in range [1,max], or NA (if allowNA) - int countNeg=0, countZero=0, countNA=0, countOverMax=0, firstOverMax=0; - for (int i=0; imax) { - countOverMax++; - if (firstOverMax==0) - firstOverMax=i+1; - } + else if (elem>max && ++countOverMax && firstOverMax==0) firstOverMax=i+1; } if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) { error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max); @@ -205,24 +189,23 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP nomatch SEXP ans; if (countNeg==0) { - if (nomatch) { - ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); + if (allowNA) { + // remove zeros, convert >max to NA + ans = PROTECT(allocVector(INTSXP, n-countZero)); int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax) - continue; - ansp[ansi++] = elem; + if (elem==0) continue; + ansp[ansi++] = elem>max ? NA_INTEGER : elem; } } else { - // just zeros to remove, or >max to convert to NA - ans = PROTECT(allocVector(INTSXP, n - countZero)); + // remove zeros, NA and >max + ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + if (elem<1 || elem>max) continue; + ansp[ansi++] = elem; } } } else { @@ -296,7 +279,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md bool anyNA=false, orderedSubset=true; // true for when rows==null (meaning all rows) if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) { SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++; - rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE), ScalarLogical(FALSE))); nprotect++; + rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE), ScalarLogical(TRUE))); nprotect++; const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset); if (err!=NULL) error(err); } From 4ffe3d0a96dd7af80229ff74446275e683580800 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 22 Aug 2021 13:06:49 -0600 Subject: [PATCH 7/8] reworked news item, warning for nomatch=0 just in this case which was ignored before, convert nomatch=0 to NULL instead of NULL to 0 in internals --- NEWS.md | 21 ++++++++++++------ R/data.table.R | 37 +++++++++++++++---------------- inst/tests/tests.Rraw | 48 +++++++++++++++++++++++++---------------- src/bmerge.c | 3 +-- src/nqrecreateindices.c | 6 +++--- 5 files changed, 67 insertions(+), 48 deletions(-) diff --git a/NEWS.md b/NEWS.md index e82ba16ca..f0581ccb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -115,15 +115,22 @@ mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` -23. It is now possible to use `nomatch=NULL` argument and integer `i` to specify rows to subset but return only those rows which exists in data.table, closes [#3109](https://github.com/Rdatatable/data.table/issues/3109), [#3666](https://github.com/Rdatatable/data.table/issues/3666). Thanks @mllg and @hadley for feature request. +23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. ```R - DT = data.table(x = 1:4) - DT[c(1L, NA_integer_, 3L, 5L), nomatch = NULL] - # x - # - #1: 1 - #2: 3 + DT = data.table(A=1:3) + DT[c(1L, NA, 3L, 5L)] # default nomatch=NA + # A + # + # 1: 1 + # 2: NA + # 3: 3 + # 4: NA + DT[c(1L, NA, 3L, 5L), nomatch=NULL] + # A + # + # 1: 1 + # 2: 3 ``` ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index e8bf7ca69..10a138129 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -221,12 +221,12 @@ replace_dot_alias = function(e) { # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. .unsafe.opt() #3585 missingnomatch = missing(nomatch) - if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857 - if (!is.na(nomatch) && nomatch!=0L) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") - nomatch = as.integer(nomatch) + nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 + if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 + if (!is.na(nomatch) && !is.null(nomatch)) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) - if (!is.na(nomatch) && is.na(which)) stopf("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") + if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE @@ -313,7 +313,7 @@ replace_dot_alias = function(e) { stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") if (!missingnomatch) { warningf("nomatch isn't relevant together with :=, ignoring nomatch") - nomatch=0L + nomatch=NULL } } } @@ -369,7 +369,7 @@ replace_dot_alias = function(e) { if (isub %iscall% "!") { notjoin = TRUE if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); - nomatch = 0L + nomatch = NULL isub = isub[[2L]] # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!" if (isub %iscall% "(" && !is.name(isub[[2L]])) @@ -389,7 +389,7 @@ replace_dot_alias = function(e) { on = o$on ## the following two are ignored if i is not a data.table. ## Since we are converting i to data.table, it is important to set them properly. - nomatch = 0L + nomatch = NULL mult = "all" } else if (!is.name(isub)) { @@ -509,8 +509,7 @@ replace_dot_alias = function(e) { len__ = ans$lens allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix - # length of input nomatch (single 0 or NA) is 1 in both cases. - # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N + # When no match, len__ is 0 for nomatch=NULL and 1 for nomatch=NA, so len__ isn't .N # If using secondary key of x, f__ will refer to xo if (is.na(which)) { w = if (notjoin) f__!=0L else is.na(f__) @@ -533,7 +532,7 @@ replace_dot_alias = function(e) { # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid # unnecessary construction of logical vectors - if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] + if (is.null(nomatch) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) stopf("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov @@ -553,10 +552,10 @@ replace_dot_alias = function(e) { } else { if (!byjoin) { #1287 and #1271 irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic - if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows + if (is.null(nomatch)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows } # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"? - # TODO: how about when nomatch=0L, can we avoid allocating then as well? + # TODO: how about when nomatch=NULL, can we avoid allocating then as well? } if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? @@ -619,10 +618,12 @@ replace_dot_alias = function(e) { else stopf("i evaluates to a logical vector length %d but there are %d rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.", length(i), nrow(x)) } else { irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) + if (nomatch0) warning("Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)") + # warning only for this case where nomatch was ignored before v1.14.2; #3109 irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning) - is.na(nomatch)) # allowNA=false when nomatch=NULL (converted to 0L at the top currently) #3109 #3666 - # simplifies logic from here on: can assume positive subscripts (no zeros), for a nomatch=0L also no NAs + !is.null(nomatch)) # allowNA=false when nomatch=NULL #3109, true when nomatch=NA #3666 + # simplifies logic from here on: can assume positive subscripts (no zeros), for nomatch=NULL also no NAs # maintains Arun's fix for #2697 (test 1042) # efficient in C with more detailed helpful messages when user mixes positives and negatives # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives @@ -630,7 +631,7 @@ replace_dot_alias = function(e) { } } if (notjoin) { - if (byjoin || !is.integer(irows) || is.na(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov + if (byjoin || !is.integer(irows) || !is.null(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) @@ -845,7 +846,7 @@ replace_dot_alias = function(e) { # in 1.8.3, but this failed test 876. # TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match). # TO DO: Make xss directly, rather than recursive call. - if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset + if (is.null(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) xss = `[.data.table`(x,irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends) @@ -1292,7 +1293,7 @@ replace_dot_alias = function(e) { } ans = vector("list", length(ansvars)) ii = rep.int(indices__, len__) # following #1991 fix - # TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=0 should drop rows in i that have no match] + # TODO: if (allLen1 && allGrp1 && (!is.null(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=NULL should drop rows in i that have no match] # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. # That R level branch was taken out in PR #3213 @@ -1723,7 +1724,7 @@ replace_dot_alias = function(e) { } dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with - # nomatch=0L even now.. but not switching it on yet, will deal it separately. + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { if (!length(ansvars) && !use.I) { GForce = FALSE diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7f3bbb3dc..4522fc693 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2293,7 +2293,8 @@ test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6)) test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6)) test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L) -test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.1, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.2, DT[c("b","foo","c"),which=NA,nomatch=NULL], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") # New notj for column names and positions when with=FALSE, #1384 DT = data.table(a=1:3,b=4:6,c=7:9) @@ -17979,21 +17980,32 @@ test(2209.3, fread('A|B,C\n1|+,4\n2|-,5\n3|-,6\n', dec=','), data.table(A=1:3, ' test(2209.4, fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=','), data.table(A=1:2, B=c(0.4,0.5), C=c("a","b"))) # ok before test(2209.5, fread('A|B,C\n1|0,4\n2|0,5\n', dec=','), data.table(A=1:2, "B,C"=c(0.4,0.5))) -# respect `nomatch = 0L` for integer i, #3109 #3666 -dt = data.table(x = 1:4) -test(2210.01, dt[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # test the default nomatch -test(2210.02, dt[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) -test(2210.03, dt[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) -test(2210.04, dt[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx -test(2210.05, dt[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives") -test(2210.06, dt[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives") -test(2210.07, dt[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA") -test(2210.08, dt[NA, nomatch=NULL], data.table(x=integer())) -test(2210.09, dt[0L, nomatch=NULL], data.table(x=integer())) -test(2210.10, dt[0:1, nomatch=NULL], data.table(x=1L)) -test(2210.11, dt[-1L, nomatch=NULL], data.table(x=2:4)) -test(2210.12, data.table()[1L, nomatch=NULL], data.table()) -test(2210.13, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") -test(2210.14, dt[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch -test(2210.15, dt[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) +# respect `nomatch=NULL` for integer i, #3109 #3666 +DT = data.table(x = 1:4) +test(2210.01, DT[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # default nomatch=NA +test(2210.02, DT[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) +test(2210.03, DT[c(1L, 5L, NA_integer_), nomatch=0], data.table(x = 1L), warning="Please use nomatch=NULL") +test(2210.04, DT[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) +test(2210.05, DT[c(1L, 5L, NA_integer_), x, nomatch=0], 1L, warning="Please use nomatch=NULL") +test(2210.06, DT[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx +test(2210.07, DT[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.08, DT[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.09, DT[c(1:4,NA_integer_,-1L), nomatch=0], error="Cannot mix positives and negatives", warning="Please use nomatch=NULL") +test(2210.10, DT[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA") +test(2210.11, DT[c(-1L,NA_integer_), nomatch=0], error="Cannot mix negatives and NA", warning="Please use nomatch=NULL") +test(2210.12, DT[NA, nomatch=NULL], data.table(x=integer())) +test(2210.13, DT[NA, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.14, DT[0L, nomatch=NULL], data.table(x=integer())) +test(2210.15, DT[0L, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.16, DT[0:1, nomatch=NULL], data.table(x=1L)) +test(2210.17, DT[0:1, nomatch=0], data.table(x=1L), warning="Please use nomatch=NULL") +test(2210.18, DT[-1L, nomatch=NULL], data.table(x=2:4)) +test(2210.19, DT[-1L, nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") +test(2210.20, data.table()[1L, nomatch=NULL], data.table()) +test(2210.21, data.table()[1L, nomatch=0], data.table(), warning="Please use nomatch=NULL") +test(2210.22, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") +test(2210.23, data.table()[-1L, nomatch=0], data.table(), warning=c("Please use nomatch=NULL","there are only 0 rows")) +test(2210.24, DT[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch +test(2210.25, DT[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) +test(2210.26, DT[-c(1L,0L), nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") diff --git a/src/bmerge.c b/src/bmerge.c index 83c5b167d..fac7ee281 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -85,8 +85,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP error(_("rollends must be a length 2 logical vector")); rollends = LOGICAL(rollendsArg); - // nomatch arg - nomatch = INTEGER(nomatchArg)[0]; + nomatch = isNull(nomatchArg) ? 0 : INTEGER(nomatchArg)[0]; // mult arg if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; diff --git a/src/nqrecreateindices.c b/src/nqrecreateindices.c index a07ce38c7..a750c093e 100644 --- a/src/nqrecreateindices.c +++ b/src/nqrecreateindices.c @@ -14,7 +14,7 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch) const int *iindices = INTEGER(indices); const int *ilen = INTEGER(len); const int *ixo = INTEGER(xo); - const int *inomatch = INTEGER(nomatch); + const int inomatch = isNull(nomatch) ? 0 : INTEGER(nomatch)[0]; int *inewstarts = INTEGER(newstarts); for (int i=0; i= xn) { // NA_integer_ = INT_MIN is checked in init.c - // j >= xn needed for special nomatch=0L case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) - inewstarts[i] = inomatch[0]; + // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) + inewstarts[i] = inomatch; j++; // newlen will be 1 for xo=NA and 0 for xo=0 .. but we need to increment by 1 for both } else { inewstarts[i] = tmp+1; From 442e0d35a6ee0b486da0aa3702a55e9952e02fc4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 22 Aug 2021 13:19:27 -0600 Subject: [PATCH 8/8] remove new comment on unchanged test that I don't understand (nomatch wasn't passed to convertNegAndZeroIdx at all) --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4522fc693..d477b13e7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2277,7 +2277,7 @@ test(794, DT[!"b"], DT[c("a","c")]) test(795, DT[!0], DT) test(796, DT[!NULL], DT[NULL]) test(797, DT[!integer()], DT) -test(798, DT[!-1], DT[1]) # this passed nomatch==0 to convertNegAndZeroIdx #3109 +test(798, DT[!-1], DT[1]) test(799, DT[--1], DT[1]) myi = c("a","c") test(800, DT[!myi], DT["b"])