diff --git a/NEWS.md b/NEWS.md index 451532fca..f0581ccb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -114,6 +114,24 @@ ```R mtcars |> DT(mpg>20, .(mean_hp=mean(hp)), by=cyl) ``` + +23. `DT[i, nomatch=NULL]` where `i` contains row numbers now excludes `NA` and any outside the range [1,nrow], [#3109](https://github.com/Rdatatable/data.table/issues/3109) [#3666](https://github.com/Rdatatable/data.table/issues/3666). Before, `NA` rows were returned always for such values; i.e. `nomatch=0|NULL` was ignored. Thanks Michel Lang and Hadley Wickham for the requests, and Jan Gorecki for the PR. Using `nomatch=0` in this case when `i` is row numbers generates the warning `Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)`. + + ```R + DT = data.table(A=1:3) + DT[c(1L, NA, 3L, 5L)] # default nomatch=NA + # A + # + # 1: 1 + # 2: NA + # 3: 3 + # 4: NA + DT[c(1L, NA, 3L, 5L), nomatch=NULL] + # A + # + # 1: 1 + # 2: 3 + ``` ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index f13a1c7ea..10a138129 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -221,12 +221,12 @@ replace_dot_alias = function(e) { # TO DO (document/faq/example). Removed for now ... if ((roll || rolltolast) && missing(mult)) mult="last" # for when there is exact match to mult. This does not control cases where the roll is mult, that is always the last one. .unsafe.opt() #3585 missingnomatch = missing(nomatch) - if (is.null(nomatch)) nomatch = 0L # allow nomatch=NULL API already now, part of: https://github.com/Rdatatable/data.table/issues/857 - if (!is.na(nomatch) && nomatch!=0L) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL)") - nomatch = as.integer(nomatch) + nomatch0 = identical(nomatch,0) || identical(nomatch,0L) # for warning with row-numbers in i; #4353 + if (nomatch0) nomatch=NULL # retain nomatch=0 backwards compatibility; #857 + if (!is.na(nomatch) && !is.null(nomatch)) stopf("nomatch= must be either NA or NULL (or 0 for backwards compatibility which is the same as NULL but please use NULL)") if (!is.logical(which) || length(which)>1L) stopf("which= must be a logical vector length 1. Either FALSE, TRUE or NA.") if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) - if (!is.na(nomatch) && is.na(which)) stopf("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.") + if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE @@ -313,7 +313,7 @@ replace_dot_alias = function(e) { stopf(":= with keyby is only possible when i is not supplied since you can't setkey on a subset of rows. Either change keyby to by or remove i") if (!missingnomatch) { warningf("nomatch isn't relevant together with :=, ignoring nomatch") - nomatch=0L + nomatch=NULL } } } @@ -369,7 +369,7 @@ replace_dot_alias = function(e) { if (isub %iscall% "!") { notjoin = TRUE if (!missingnomatch) stopf("not-join '!' prefix is present on i but nomatch is provided. Please remove nomatch."); - nomatch = 0L + nomatch = NULL isub = isub[[2L]] # #932 related so that !(v1 == 1) becomes v1 == 1 instead of (v1 == 1) after removing "!" if (isub %iscall% "(" && !is.name(isub[[2L]])) @@ -389,7 +389,7 @@ replace_dot_alias = function(e) { on = o$on ## the following two are ignored if i is not a data.table. ## Since we are converting i to data.table, it is important to set them properly. - nomatch = 0L + nomatch = NULL mult = "all" } else if (!is.name(isub)) { @@ -509,8 +509,7 @@ replace_dot_alias = function(e) { len__ = ans$lens allGrp1 = all(ops==1L) # was previously 'ans$allGrp1'. Fixing #1991. TODO: Revisit about allGrp1 possibility for speedups in certain cases when I find some time. indices__ = if (length(ans$indices)) ans$indices else seq_along(f__) # also for #1991 fix - # length of input nomatch (single 0 or NA) is 1 in both cases. - # When no match, len__ is 0 for nomatch=0 and 1 for nomatch=NA, so len__ isn't .N + # When no match, len__ is 0 for nomatch=NULL and 1 for nomatch=NA, so len__ isn't .N # If using secondary key of x, f__ will refer to xo if (is.na(which)) { w = if (notjoin) f__!=0L else is.na(f__) @@ -533,7 +532,7 @@ replace_dot_alias = function(e) { # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid # unnecessary construction of logical vectors - if (identical(nomatch, 0L) && allLen1) irows = irows[irows != 0L] + if (is.null(nomatch) && allLen1) irows = irows[irows != 0L] } else { if (length(xo) && missing(on)) stopf("Internal error. Cannot by=.EACHI when joining to an index, yet") # nocov @@ -553,10 +552,10 @@ replace_dot_alias = function(e) { } else { if (!byjoin) { #1287 and #1271 irows = f__ # len__ is set to 1 as well, no need for 'pmin' logic - if (identical(nomatch,0L)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows + if (is.null(nomatch)) irows = irows[len__>0L] # 0s are len 0, so this removes -1 irows } # TODO: when nomatch=NA, len__ need not be allocated / set at all for mult="first"/"last"? - # TODO: how about when nomatch=0L, can we avoid allocating then as well? + # TODO: how about when nomatch=NULL, can we avoid allocating then as well? } if (length(xo) && length(irows)) { irows = xo[irows] # TO DO: fsort here? @@ -619,8 +618,12 @@ replace_dot_alias = function(e) { else stopf("i evaluates to a logical vector length %d but there are %d rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.", length(i), nrow(x)) } else { irows = as.integer(i) # e.g. DT[c(1,3)] and DT[c(-1,-3)] ok but not DT[c(1,-3)] (caught as error) - irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), is.null(jsub) || root!=":=") # last argument is allowOverMax (NA when selecting, error when assigning) - # simplifies logic from here on: can assume positive subscripts (no zeros) + if (nomatch0) warning("Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 (Jan 2019)") + # warning only for this case where nomatch was ignored before v1.14.2; #3109 + irows = .Call(CconvertNegAndZeroIdx, irows, nrow(x), + is.null(jsub) || root!=":=", # allowOverMax (NA when selecting, error when assigning) + !is.null(nomatch)) # allowNA=false when nomatch=NULL #3109, true when nomatch=NA #3666 + # simplifies logic from here on: can assume positive subscripts (no zeros), for nomatch=NULL also no NAs # maintains Arun's fix for #2697 (test 1042) # efficient in C with more detailed helpful messages when user mixes positives and negatives # falls through quickly (no R level allocs) if all items are within range [1,max] with no zeros or negatives @@ -628,7 +631,7 @@ replace_dot_alias = function(e) { } } if (notjoin) { - if (byjoin || !is.integer(irows) || is.na(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov + if (byjoin || !is.integer(irows) || !is.null(nomatch)) stopf("Internal error: notjoin but byjoin or !integer or nomatch==NA") # nocov irows = irows[irows!=0L] if (verbose) {last.started.at=proc.time();catf("Inverting irows for notjoin done in ... ");flush.console()} i = irows = if (length(irows)) seq_len(nrow(x))[-irows] else NULL # NULL meaning all rows i.e. seq_len(nrow(x)) @@ -843,7 +846,7 @@ replace_dot_alias = function(e) { # in 1.8.3, but this failed test 876. # TO DO: Add a test like X[i,sum(v),by=i.x2], or where by includes a join column (both where some i don't match). # TO DO: Make xss directly, rather than recursive call. - if (!is.na(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset + if (is.null(nomatch)) irows = irows[irows!=0L] # TO DO: can be removed now we have CisSortedSubset if (length(allbyvars)) { ############### TO DO TO DO TO DO ############### if (verbose) catf("i clause present and columns used in by detected, only these subset: %s\n", brackify(allbyvars)) xss = `[.data.table`(x,irows,allbyvars,with=FALSE,nomatch=nomatch,mult=mult,roll=roll,rollends=rollends) @@ -1290,7 +1293,7 @@ replace_dot_alias = function(e) { } ans = vector("list", length(ansvars)) ii = rep.int(indices__, len__) # following #1991 fix - # TODO: if (allLen1 && allGrp1 && (is.na(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=0 should drop rows in i that have no match] + # TODO: if (allLen1 && allGrp1 && (!is.null(nomatch) || !any(f__==0L))) then ii will be 1:nrow(i) [nomatch=NULL should drop rows in i that have no match] # But rather than that complex logic here at R level to catch that and do a shallow copy for efficiency, just do the check inside CsubsetDT # to see if it passed 1:nrow(x) and then CsubsetDT should do the shallow copy safely and centrally. # That R level branch was taken out in PR #3213 @@ -1721,7 +1724,7 @@ replace_dot_alias = function(e) { } dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with - # nomatch=0L even now.. but not switching it on yet, will deal it separately. + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { if (!length(ansvars) && !use.I) { GForce = FALSE diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6626e4f0a..d477b13e7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2293,7 +2293,8 @@ test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6)) test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6)) test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L) -test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.1, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") +test(811.2, DT[c("b","foo","c"),which=NA,nomatch=NULL], error="which=NA with nomatch=0|NULL would always return an empty vector[.] Please change or remove either which or nomatch") # New notj for column names and positions when with=FALSE, #1384 DT = data.table(a=1:3,b=4:6,c=7:9) @@ -17979,3 +17980,32 @@ test(2209.3, fread('A|B,C\n1|+,4\n2|-,5\n3|-,6\n', dec=','), data.table(A=1:3, ' test(2209.4, fread('A|B|C\n1|0,4|a\n2|0,5|b\n', dec=','), data.table(A=1:2, B=c(0.4,0.5), C=c("a","b"))) # ok before test(2209.5, fread('A|B,C\n1|0,4\n2|0,5\n', dec=','), data.table(A=1:2, "B,C"=c(0.4,0.5))) +# respect `nomatch=NULL` for integer i, #3109 #3666 +DT = data.table(x = 1:4) +test(2210.01, DT[c(1L, 5L, NA_integer_)], data.table(x=c(1L,NA_integer_,NA_integer_))) # default nomatch=NA +test(2210.02, DT[c(1L, 5L, NA_integer_), nomatch=NULL], data.table(x = 1L)) +test(2210.03, DT[c(1L, 5L, NA_integer_), nomatch=0], data.table(x = 1L), warning="Please use nomatch=NULL") +test(2210.04, DT[c(1L, 5L, NA_integer_), x, nomatch=NULL], 1L) +test(2210.05, DT[c(1L, 5L, NA_integer_), x, nomatch=0], 1L, warning="Please use nomatch=NULL") +test(2210.06, DT[c(1:4,1:4), nomatch=NULL], data.table(x=c(1:4,1:4))) # early stopping convertNegAndZeroIdx +test(2210.07, DT[c(1:4,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.08, DT[c(1:4,NA_integer_,-1L), nomatch=NULL], error="Cannot mix positives and negatives") +test(2210.09, DT[c(1:4,NA_integer_,-1L), nomatch=0], error="Cannot mix positives and negatives", warning="Please use nomatch=NULL") +test(2210.10, DT[c(-1L,NA_integer_), nomatch=NULL], error="Cannot mix negatives and NA") +test(2210.11, DT[c(-1L,NA_integer_), nomatch=0], error="Cannot mix negatives and NA", warning="Please use nomatch=NULL") +test(2210.12, DT[NA, nomatch=NULL], data.table(x=integer())) +test(2210.13, DT[NA, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.14, DT[0L, nomatch=NULL], data.table(x=integer())) +test(2210.15, DT[0L, nomatch=0], data.table(x=integer()), warning="Please use nomatch=NULL") +test(2210.16, DT[0:1, nomatch=NULL], data.table(x=1L)) +test(2210.17, DT[0:1, nomatch=0], data.table(x=1L), warning="Please use nomatch=NULL") +test(2210.18, DT[-1L, nomatch=NULL], data.table(x=2:4)) +test(2210.19, DT[-1L, nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") +test(2210.20, data.table()[1L, nomatch=NULL], data.table()) +test(2210.21, data.table()[1L, nomatch=0], data.table(), warning="Please use nomatch=NULL") +test(2210.22, data.table()[-1L, nomatch=NULL], data.table(), warning="there are only 0 rows") +test(2210.23, data.table()[-1L, nomatch=0], data.table(), warning=c("Please use nomatch=NULL","there are only 0 rows")) +test(2210.24, DT[-c(1L,0L)], data.table(x=2:4)) # codecov gap, not related to nomatch +test(2210.25, DT[-c(1L,0L), nomatch=NULL], data.table(x=2:4)) +test(2210.26, DT[-c(1L,0L), nomatch=0], data.table(x=2:4), warning="Please use nomatch=NULL") + diff --git a/src/bmerge.c b/src/bmerge.c index 83c5b167d..fac7ee281 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -85,8 +85,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP error(_("rollends must be a length 2 logical vector")); rollends = LOGICAL(rollendsArg); - // nomatch arg - nomatch = INTEGER(nomatchArg)[0]; + nomatch = isNull(nomatchArg) ? 0 : INTEGER(nomatchArg)[0]; // mult arg if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; diff --git a/src/nqrecreateindices.c b/src/nqrecreateindices.c index a07ce38c7..a750c093e 100644 --- a/src/nqrecreateindices.c +++ b/src/nqrecreateindices.c @@ -14,7 +14,7 @@ SEXP nqRecreateIndices(SEXP xo, SEXP len, SEXP indices, SEXP nArg, SEXP nomatch) const int *iindices = INTEGER(indices); const int *ilen = INTEGER(len); const int *ixo = INTEGER(xo); - const int *inomatch = INTEGER(nomatch); + const int inomatch = isNull(nomatch) ? 0 : INTEGER(nomatch)[0]; int *inewstarts = INTEGER(newstarts); for (int i=0; i= xn) { // NA_integer_ = INT_MIN is checked in init.c - // j >= xn needed for special nomatch=0L case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) - inewstarts[i] = inomatch[0]; + // j >= xn needed for special nomatch=NULL case, see issue#4388 (due to xo[irows] from R removing '0' value in xo) + inewstarts[i] = inomatch; j++; // newlen will be 1 for xo=NA and 0 for xo=0 .. but we need to increment by 1 for both } else { inewstarts[i] = tmp+1; diff --git a/src/subset.c b/src/subset.c index 2e62b6ec9..a04fcc9ec 100644 --- a/src/subset.c +++ b/src/subset.c @@ -124,39 +124,42 @@ const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_ou return NULL; } -SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) +SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax, SEXP allowNAArg) { // called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros // + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time) // + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway) // allowOverMaxArg is false when := (test 1024), otherwise true for selecting + // allowNAArg is false when nomatch=NULL #3109 #3666 if (!isInteger(idx)) error(_("Internal error. 'idx' is type '%s' not 'integer'"), type2char(TYPEOF(idx))); // # nocov if (!isInteger(maxArg) || length(maxArg)!=1) error(_("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton"), type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error(_("Internal error: allowOverMax must be TRUE/FALSE")); // # nocov - int max = INTEGER(maxArg)[0], n=LENGTH(idx); + const int max = INTEGER(maxArg)[0], n=LENGTH(idx); if (max<0) error(_("Internal error. max is %d, must be >= 0."), max); // # nocov includes NA which will print as INT_MIN - int *idxp = INTEGER(idx); + if (!isLogical(allowNAArg) || LENGTH(allowNAArg)!=1 || LOGICAL(allowNAArg)[0]==NA_LOGICAL) error(_("Internal error: allowNAArg must be TRUE/FALSE")); // # nocov + const bool allowNA = LOGICAL(allowNAArg)[0]; + const int *idxp = INTEGER(idx); bool stop = false; #pragma omp parallel for num_threads(getDTthreads(n, true)) - for (int i=0; imax) stop=true; + if ((elem<1 && (elem!=NA_INTEGER || !allowNA)) || elem>max) stop=true; } - if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max] + if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA (if allowNA) or in range [1-max] // --------- - // else massage the input to a standard idx where all items are either NA or in range [1,max] ... + // else massage the input to a standard idx where all items are either in range [1,max], or NA (if allowNA) - int countNeg=0, countZero=0, countNA=0, firstOverMax=0; - for (int i=0; imax && firstOverMax==0) firstOverMax=i+1; + else if (elem>max && ++countOverMax && firstOverMax==0) firstOverMax=i+1; } if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) { error(_("i[%d] is %d which is out of range [1,nrow=%d]"), firstOverMax, idxp[firstOverMax-1], max); @@ -186,13 +189,24 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax) SEXP ans; if (countNeg==0) { - // just zeros to remove, or >max to convert to NA - ans = PROTECT(allocVector(INTSXP, n - countZero)); - int *ansp = INTEGER(ans); - for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + if (allowNA) { + // remove zeros, convert >max to NA + ans = PROTECT(allocVector(INTSXP, n-countZero)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax ? NA_INTEGER : elem; + } + } else { + // remove zeros, NA and >max + ans = PROTECT(allocVector(INTSXP, n-countZero-countNA-countOverMax)); + int *ansp = INTEGER(ans); + for (int i=0, ansi=0; imax) continue; + ansp[ansi++] = elem; + } } } else { // idx is all negative without any NA but perhaps some zeros @@ -265,7 +279,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md bool anyNA=false, orderedSubset=true; // true for when rows==null (meaning all rows) if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) { SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++; - rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++; + rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE), ScalarLogical(TRUE))); nprotect++; const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset); if (err!=NULL) error(err); }