From 1b96cf78ab7212856b676426060ae0a4091418de Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Wed, 3 Oct 2018 15:24:46 +0200 Subject: [PATCH 01/10] Fixed bug in 'on' --- R/data.table.R | 90 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 2fd122518..f00bc3d85 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -498,23 +498,81 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { on = eval(onsub, parent.frame(2L), parent.frame(2L)) if (!is.character(on)) stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") - this_op = regmatches(on, gregexpr(pat, on)) - idx = (vapply(this_op, length, 0L) == 0L) - this_op[idx] = "==" - this_op = unlist(this_op, use.names=FALSE) - idx_op = match(this_op, ops, nomatch=0L) - if (any(idx_op %in% c(0L, 6L))) - stop("Invalid operators ", paste(this_op[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") - if (is.null(names(on))) { - on[idx] = if (isnull_inames) paste(on[idx], paste0("V", seq_len(sum(idx))), sep="==") else paste(on[idx], on[idx], sep="==") - } else { - on[idx] = paste(names(on)[idx], on[idx], sep="==") + ## extract the operators and potential variable names from 'on'. + ## split at backticks to take care about variable names like `col1<=`. + pieces <- strsplit(on, "(?=[`])", perl = TRUE) + xCols <- character(length(on)) + ## if 'on' is named, the names are the xCols for sure + if(!is.null(names(on))){ + xCols <- names(on) + } + iCols <- character(length(on)) + operators <- character(length(on)) + ## loop over the elements and extract operators and column names. + for(i in seq_along(pieces)){ + thisCols <- character(0) + thisOperators <- character(0) + j <- 1 + while(j <= length(pieces[[i]])){ + if(pieces[[i]][j] == "`"){ + ## start of a variable name with backtick. + thisCols <- c(thisCols, pieces[[i]][j+1]) + j <- j+3 # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.` + } else { + ## no backtick + ## search for operators + thisOperators <- c(thisOperators, + unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), + use.names = FALSE)) + ## search for column names + thisCols <- c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1]])) + ## there can be empty string column names because of trimws, remove them + thisCols <- thisCols[thisCols != ""] + j <- j+1 + } + } + if (length(thisOperators) == 0) { + ## if no operator is given, it must be == + operators[i] <- "==" + } else if (length(thisOperators) == 1) { + operators[i] <- thisOperators + } else { + ## multiple operators found in one 'on' part. Something is wrong. + stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.") + } + if (length(thisCols) == 2){ + ## two column names found, first is xCol, second is iCol for sure + xCols[i] <- thisCols[1] + iCols[i] <- thisCols[2] + } else if (length(thisCols) == 1){ + ## a single column name found. Can mean different things + if(xCols[i] != ""){ + ## xCol is given by names(on). thisCols must be iCol + iCols[i] <- thisCols[1] + } else if (isnull_inames){ + ## i has no names. It will be given the names V1, V2, ... automatically. + ## The single column name is the x column. It will match to the ith column in i. + xCols[i] <- thisCols[1] + iCols[i] <- paste0("V", i) + } else { + ## i has names and one single column name is given by on. + ## This means that xCol and iCol have the same name. + xCols[i] <- thisCols[1] + iCols[i] <- thisCols[1] + } + } else if (length(thisCols) == 0){ + stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.") + } else { + stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.") + } } - split = tstrsplit(on, paste0("[ ]*", pat, "[ ]*")) - on = setattr(split[[2L]], 'names', split[[1L]]) - if (length(empty_idx <- which(names(on) == ""))) - names(on)[empty_idx] = on[empty_idx] - list(on = on, ops = idx_op) + idx_op = match(operators, ops, nomatch=0L) + if (any(idx_op %in% c(0L, 6L))) + stop("Invalid operators ", paste(operators[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") + ## the final on will contain the xCol as name, the iCol as value + on <- iCols + names(on) <- xCols + return(list(on = on, ops = idx_op)) } on_ops = parse_on(substitute(on)) on = on_ops[[1L]] From b32fc34e0a13351f8a509b6ec4aaf9c5668e5e67 Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Wed, 3 Oct 2018 15:28:07 +0200 Subject: [PATCH 02/10] Added tests. --- inst/tests/tests.Rraw | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d46967a22..fdcb0b0f1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12278,7 +12278,24 @@ DT = data.table(A=1:5) test(1947.1, DT[A<0, c('A','B'):=.(NULL, A)], error="When deleting columns, i should not be provided") test(1947.2, DT, data.table(A=1:5)) - +## tests for #2931 +DT <- data.table(id = 1:3, `counts(a>=0)` = 1:3, sameName = 1:3) +i <- data.table(idi = 1:3, ` weirdName>=` = 1:3, sameName = 1:3) +## test white spaces around operator +test(1948.1, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"]) +test(1948.2, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"]) +test(1948.3, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"]) +## test column names containing operators +test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.6, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id==idi"]) +## mixed example +test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) ################################### # Add new tests above this line # From 1c9907f214cd2546069f98cc90166c086c08909d Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Thu, 4 Oct 2018 09:32:19 +0200 Subject: [PATCH 03/10] Removed tests before merge. --- inst/tests/tests.Rraw | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fdcb0b0f1..d46967a22 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12278,24 +12278,7 @@ DT = data.table(A=1:5) test(1947.1, DT[A<0, c('A','B'):=.(NULL, A)], error="When deleting columns, i should not be provided") test(1947.2, DT, data.table(A=1:5)) -## tests for #2931 -DT <- data.table(id = 1:3, `counts(a>=0)` = 1:3, sameName = 1:3) -i <- data.table(idi = 1:3, ` weirdName>=` = 1:3, sameName = 1:3) -## test white spaces around operator -test(1948.1, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"]) -test(1948.2, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"]) -test(1948.3, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"]) -## test column names containing operators -test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), - DT[i, on = "id>=idi"]) -test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), - DT[i, on = "id>=idi"]) -test(1948.6, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), - DT[i, on = "id>=idi"]) -test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), - DT[i, on = "id==idi"]) -## mixed example -test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) + ################################### # Add new tests above this line # From 6b4a5f89963e5579680a34cb842cca54165296ae Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Thu, 4 Oct 2018 09:33:49 +0200 Subject: [PATCH 04/10] Reintroduced tests before push. --- inst/tests/tests.Rraw | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5156af431..bdb302791 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12278,6 +12278,24 @@ DT = data.table(A=1:5) test(1947.1, DT[A<0, c('A','B'):=.(NULL, A)], error="When deleting columns, i should not be provided") test(1947.2, DT, data.table(A=1:5)) +## tests for #2931 +DT <- data.table(id = 1:3, `counts(a>=0)` = 1:3, sameName = 1:3) +i <- data.table(idi = 1:3, ` weirdName>=` = 1:3, sameName = 1:3) +## test white spaces around operator +test(1948.1, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"]) +test(1948.2, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"]) +test(1948.3, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"]) +## test column names containing operators +test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.6, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id>=idi"]) +test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), + DT[i, on = "id==idi"]) +## mixed example +test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) ################################### From fda588922f71338e08bb4a9e81fdf0ea87866925 Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Thu, 4 Oct 2018 09:40:12 +0200 Subject: [PATCH 05/10] Added NEWS item. --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index c95079158..c77d4aba8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,8 @@ 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. +2. Variable names in backticks which included a comparison operator, e.g. `a<=colB`, caused an error when specifying this column name in the 'on' clause during a join [#3092](https://github.com/Rdatatable/data.table/issues/3092). This has been fixed. Additionally, the 'on' clause now supports white spaces around operators, e.g. on = "colA == colB". Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. + #### NOTES 1. When data.table first loads it now checks the DLL's MD5. This is to detect installation issues on Windows when you upgrade and i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. Thanks to Gabor Csardi for the suggestion to use `tools::checkMD5sums()`. From e8ac366256a001eaf17af01fa7bb046acc261120 Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Thu, 4 Oct 2018 21:03:24 +0200 Subject: [PATCH 06/10] Separated parse_on from [ for readability and structure. --- R/data.table.R | 193 ++++++++++++++++++++++++++----------------------- 1 file changed, 101 insertions(+), 92 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index dfcdf41a3..a36ea6dd9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -484,98 +484,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { } if (!missing(on)) { # on = .() is now possible, #1257 - parse_on <- function(onsub) { - ops = c("==", "<=", "<", ">=", ">", "!=") - pat = paste0("(", ops, ")", collapse="|") - if (is.call(onsub) && onsub[[1L]] == "eval") { - onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L)) - if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]] - } - if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) { - spat = paste0("[ ]+(", pat, ")[ ]+") - onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L))) - onsub = as.call(c(quote(c), onsub)) - } - on = eval(onsub, parent.frame(2L), parent.frame(2L)) - if (!is.character(on)) - stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") - ## extract the operators and potential variable names from 'on'. - ## split at backticks to take care about variable names like `col1<=`. - pieces <- strsplit(on, "(?=[`])", perl = TRUE) - xCols <- character(length(on)) - ## if 'on' is named, the names are the xCols for sure - if(!is.null(names(on))){ - xCols <- names(on) - } - iCols <- character(length(on)) - operators <- character(length(on)) - ## loop over the elements and extract operators and column names. - for(i in seq_along(pieces)){ - thisCols <- character(0) - thisOperators <- character(0) - j <- 1 - while(j <= length(pieces[[i]])){ - if(pieces[[i]][j] == "`"){ - ## start of a variable name with backtick. - thisCols <- c(thisCols, pieces[[i]][j+1]) - j <- j+3 # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.` - } else { - ## no backtick - ## search for operators - thisOperators <- c(thisOperators, - unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), - use.names = FALSE)) - ## search for column names - thisCols <- c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1]])) - ## there can be empty string column names because of trimws, remove them - thisCols <- thisCols[thisCols != ""] - j <- j+1 - } - } - if (length(thisOperators) == 0) { - ## if no operator is given, it must be == - operators[i] <- "==" - } else if (length(thisOperators) == 1) { - operators[i] <- thisOperators - } else { - ## multiple operators found in one 'on' part. Something is wrong. - stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.") - } - if (length(thisCols) == 2){ - ## two column names found, first is xCol, second is iCol for sure - xCols[i] <- thisCols[1] - iCols[i] <- thisCols[2] - } else if (length(thisCols) == 1){ - ## a single column name found. Can mean different things - if(xCols[i] != ""){ - ## xCol is given by names(on). thisCols must be iCol - iCols[i] <- thisCols[1] - } else if (isnull_inames){ - ## i has no names. It will be given the names V1, V2, ... automatically. - ## The single column name is the x column. It will match to the ith column in i. - xCols[i] <- thisCols[1] - iCols[i] <- paste0("V", i) - } else { - ## i has names and one single column name is given by on. - ## This means that xCol and iCol have the same name. - xCols[i] <- thisCols[1] - iCols[i] <- thisCols[1] - } - } else if (length(thisCols) == 0){ - stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.") - } else { - stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.") - } - } - idx_op = match(operators, ops, nomatch=0L) - if (any(idx_op %in% c(0L, 6L))) - stop("Invalid operators ", paste(operators[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") - ## the final on will contain the xCol as name, the iCol as value - on <- iCols - names(on) <- xCols - return(list(on = on, ops = idx_op)) - } - on_ops = parse_on(substitute(on)) + on_ops = parse_on(substitute(on), isnull_inames) on = on_ops[[1L]] ops = on_ops[[2L]] # TODO: collect all '==' ops first to speeden up Cnestedid @@ -3110,3 +3019,103 @@ isReallyReal <- function(x) { ) ) } + + +parse_on <- function(onsub, isnull_inames) { + ## helper that takes the 'on' string(s) and extracts comparison operators and column names from it. + #' @param onsub the substituted on + #' @param isnull_inames bool; TRUE if i has no names. + #' @return List with two entries: + #' 'on' : character vector providing the column names for the join. + #' Names correspond to columns in x, entries correspond to columns in i + #' 'ops': integer vector. Gives the indices of the operators that connect the columns in x and i. + ops = c("==", "<=", "<", ">=", ">", "!=") + pat = paste0("(", ops, ")", collapse="|") + if (is.call(onsub) && onsub[[1L]] == "eval") { + onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L)) + if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]] + } + if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) { + spat = paste0("[ ]+(", pat, ")[ ]+") + onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L))) + onsub = as.call(c(quote(c), onsub)) + } + on = eval(onsub, parent.frame(2L), parent.frame(2L)) + if (!is.character(on)) + stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") + ## extract the operators and potential variable names from 'on'. + ## split at backticks to take care about variable names like `col1<=`. + pieces <- strsplit(on, "(?=[`])", perl = TRUE) + xCols <- character(length(on)) + ## if 'on' is named, the names are the xCols for sure + if(!is.null(names(on))){ + xCols <- names(on) + } + iCols <- character(length(on)) + operators <- character(length(on)) + ## loop over the elements and extract operators and column names. + for(i in seq_along(pieces)){ + thisCols <- character(0) + thisOperators <- character(0) + j <- 1 + while(j <= length(pieces[[i]])){ + if(pieces[[i]][j] == "`"){ + ## start of a variable name with backtick. + thisCols <- c(thisCols, pieces[[i]][j+1]) + j <- j+3 # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.` + } else { + ## no backtick + ## search for operators + thisOperators <- c(thisOperators, + unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), + use.names = FALSE)) + ## search for column names + thisCols <- c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1]])) + ## there can be empty string column names because of trimws, remove them + thisCols <- thisCols[thisCols != ""] + j <- j+1 + } + } + if (length(thisOperators) == 0) { + ## if no operator is given, it must be == + operators[i] <- "==" + } else if (length(thisOperators) == 1) { + operators[i] <- thisOperators + } else { + ## multiple operators found in one 'on' part. Something is wrong. + stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.") + } + if (length(thisCols) == 2){ + ## two column names found, first is xCol, second is iCol for sure + xCols[i] <- thisCols[1] + iCols[i] <- thisCols[2] + } else if (length(thisCols) == 1){ + ## a single column name found. Can mean different things + if(xCols[i] != ""){ + ## xCol is given by names(on). thisCols must be iCol + iCols[i] <- thisCols[1] + } else if (isnull_inames){ + ## i has no names. It will be given the names V1, V2, ... automatically. + ## The single column name is the x column. It will match to the ith column in i. + xCols[i] <- thisCols[1] + iCols[i] <- paste0("V", i) + } else { + ## i has names and one single column name is given by on. + ## This means that xCol and iCol have the same name. + xCols[i] <- thisCols[1] + iCols[i] <- thisCols[1] + } + } else if (length(thisCols) == 0){ + stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.") + } else { + stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.") + } + } + idx_op = match(operators, ops, nomatch=0L) + if (any(idx_op %in% c(0L, 6L))) + stop("Invalid operators ", paste(operators[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") + ## the final on will contain the xCol as name, the iCol as value + on <- iCols + names(on) <- xCols + return(list(on = on, ops = idx_op)) +} \ No newline at end of file From 5eb5d7706b06f7a28971659e456310a0d86853b8 Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Thu, 4 Oct 2018 21:53:46 +0200 Subject: [PATCH 07/10] Added tests for rare errors to increase code coverage. --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bdb302791..1f9afb29e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12296,6 +12296,10 @@ test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","cou DT[i, on = "id==idi"]) ## mixed example test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) +## testing for errors +test(1948.9, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.") +test(1948.11, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.") +test(1948.12, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.") ################################### From 9c9a9d17e13c9e94b75338c0b683077cc955b980 Mon Sep 17 00:00:00 2001 From: MarkusBonsch Date: Sat, 6 Oct 2018 23:17:41 +0200 Subject: [PATCH 08/10] Better test coverage. Renamed 'parse_on' to '.parse_on' to mark it as strictly internal. --- R/data.table.R | 6 +++--- inst/tests/tests.Rraw | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e219dc69b..a6718c470 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -487,7 +487,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { } if (!missing(on)) { # on = .() is now possible, #1257 - on_ops = parse_on(substitute(on), isnull_inames) + on_ops = .parse_on(substitute(on), isnull_inames) on = on_ops[[1L]] ops = on_ops[[2L]] # TODO: collect all '==' ops first to speeden up Cnestedid @@ -3024,7 +3024,7 @@ isReallyReal <- function(x) { } -parse_on <- function(onsub, isnull_inames) { +.parse_on <- function(onsub, isnull_inames) { ## helper that takes the 'on' string(s) and extracts comparison operators and column names from it. #' @param onsub the substituted on #' @param isnull_inames bool; TRUE if i has no names. @@ -3116,7 +3116,7 @@ parse_on <- function(onsub, isnull_inames) { } idx_op = match(operators, ops, nomatch=0L) if (any(idx_op %in% c(0L, 6L))) - stop("Invalid operators ", paste(operators[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") + stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".") ## the final on will contain the xCol as name, the iCol as value on <- iCols names(on) <- xCols diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 47e5dd21a..af8393b4d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12296,10 +12296,14 @@ test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","cou DT[i, on = "id==idi"]) ## mixed example test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) +## testing 'eval' in on clause +test(1948.9, DT[i, on = eval(eval("id<=idi"))], DT[i, on = "id<=idi"]) ## testing for errors -test(1948.9, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.") -test(1948.11, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.") -test(1948.12, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.") +test(1948.11, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.") +test(1948.12, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.") +test(1948.13, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.") +test(1948.14, DT[i, on = "id != idi"], error = "Invalid operators !=. Only allowed operators are ==<=<>=>.") +test(1948.15, DT[i, on = 1L], error = "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.") # helpful error when on= is provided but not i, rather than silently ignoring on= test(1949.1, DT[,,on=A], error="When i and j are both missing, no other argument should be used.") From 68c007cc5cb59a17428b24133c7f4cbfa9343e98 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 6 Oct 2018 23:18:32 -0700 Subject: [PATCH 09/10] attempt at covr workaround --- R/data.table.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a6718c470..1abb10c07 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3029,14 +3029,14 @@ isReallyReal <- function(x) { #' @param onsub the substituted on #' @param isnull_inames bool; TRUE if i has no names. #' @return List with two entries: - #' 'on' : character vector providing the column names for the join. + #' 'on' : character vector providing the column names for the join. #' Names correspond to columns in x, entries correspond to columns in i #' 'ops': integer vector. Gives the indices of the operators that connect the columns in x and i. ops = c("==", "<=", "<", ">=", ">", "!=") pat = paste0("(", ops, ")", collapse="|") if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L)) - if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]] + if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = onsub[[2L]] } } if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) { spat = paste0("[ ]+(", pat, ")[ ]+") @@ -3063,14 +3063,14 @@ isReallyReal <- function(x) { j <- 1 while(j <= length(pieces[[i]])){ if(pieces[[i]][j] == "`"){ - ## start of a variable name with backtick. + ## start of a variable name with backtick. thisCols <- c(thisCols, pieces[[i]][j+1]) j <- j+3 # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.` } else { ## no backtick ## search for operators - thisOperators <- c(thisOperators, - unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), + thisOperators <- c(thisOperators, + unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])), use.names = FALSE)) ## search for column names thisCols <- c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1]])) @@ -3103,11 +3103,11 @@ isReallyReal <- function(x) { xCols[i] <- thisCols[1] iCols[i] <- paste0("V", i) } else { - ## i has names and one single column name is given by on. + ## i has names and one single column name is given by on. ## This means that xCol and iCol have the same name. xCols[i] <- thisCols[1] iCols[i] <- thisCols[1] - } + } } else if (length(thisCols) == 0){ stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.") } else { @@ -3121,4 +3121,4 @@ isReallyReal <- function(x) { on <- iCols names(on) <- xCols return(list(on = on, ops = idx_op)) -} \ No newline at end of file +} From d4c20aa1dd9807a4f7b484325dec7771e8a6ff7d Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 6 Oct 2018 23:49:35 -0700 Subject: [PATCH 10/10] news item tweak --- NEWS.md | 2 +- inst/tests/tests.Rraw | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3abd68661..b0e57ad56 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,7 +13,7 @@ 1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting. -2. Variable names in backticks which included a comparison operator, e.g. `a<=colB`, caused an error when specifying this column name in the 'on' clause during a join [#3092](https://github.com/Rdatatable/data.table/issues/3092). This has been fixed. Additionally, the 'on' clause now supports white spaces around operators, e.g. on = "colA == colB". Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. +2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing. #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 40517fe28..f4fa813ba 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12286,13 +12286,13 @@ test(1948.1, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"]) test(1948.2, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"]) test(1948.3, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"]) ## test column names containing operators -test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), +test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), DT[i, on = "id>=idi"]) -test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), +test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), DT[i, on = "id>=idi"]) test(1948.6, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), DT[i, on = "id>=idi"]) -test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), +test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")), DT[i, on = "id==idi"]) ## mixed example test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")]) @@ -12319,14 +12319,14 @@ if (test_bit64) { # allow nomatch=NULL to work same as nomatch=0L, #857 d1 = data.table(a=1:3, b=2:4) d2 = data.table(a=2:4, b=3:5) -test(1950.1, d1[d2, on="a", nomatch=NULL], d1[d2, on="a", nomatch=0L]) -test(1950.2, d1[d2, on="b", nomatch=NULL], d1[d2, on="b", nomatch=0L]) -test(1950.3, d1[d2, on=c("a","b"), nomatch=NULL], d1[d2, on=c("a","b"), nomatch=0L]) -test(1950.4, d1[d2, nomatch=3], error="nomatch= must be either NA or NULL .or 0 for backwards compatibility") +test(1951.1, d1[d2, on="a", nomatch=NULL], d1[d2, on="a", nomatch=0L]) +test(1951.2, d1[d2, on="b", nomatch=NULL], d1[d2, on="b", nomatch=0L]) +test(1951.3, d1[d2, on=c("a","b"), nomatch=NULL], d1[d2, on=c("a","b"), nomatch=0L]) +test(1951.4, d1[d2, nomatch=3], error="nomatch= must be either NA or NULL .or 0 for backwards compatibility") # coverage of which= checks -test(1951.1, d1[a==2, which=3], error="which= must be a logical vector length 1. Either FALSE, TRUE or NA.") -test(1951.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied") +test(1952.1, d1[a==2, which=3], error="which= must be a logical vector length 1. Either FALSE, TRUE or NA.") +test(1952.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied") ###################################