From 0273b1288a780aff9648dd8524a06c2c354b3d23 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 1 Nov 2021 21:45:20 -0700 Subject: [PATCH 001/106] expect factor(1) when measure=list --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6382a13a8..a0f5ebfe4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17735,6 +17735,8 @@ test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.71, melt(DT.wide, measure.vars = list("a2")), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.72, melt(DT.wide, measure.vars = c("a2")), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues From 8fa8c8fdc91e5dfe9ae3d5a591fd4fa4d8004c38 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 1 Nov 2021 22:48:14 -0700 Subject: [PATCH 002/106] melt checks if measure.vars is list --- R/data.table.R | 9 +++++++-- R/fmelt.R | 1 + inst/tests/tests.Rraw | 8 +++++--- src/fmelt.c | 12 +++++++++--- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e020ea3e3..8f6ffa568 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1011,8 +1011,13 @@ replace_dot_alias = function(e) { .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) } else { if (colsub %iscall% 'patterns') { - # each pattern gives a new filter condition, intersect the end result - .SDcols = Reduce(intersect, 
eval_with_cols(colsub, names_x)) + patterns_list_or_vector = eval_with_cols(colsub, names_x) + .SDcols = if (is.list(patterns_list_or_vector)) { + # each pattern gives a new filter condition, intersect the end result + Reduce(intersect, patterns_list_or_vector) + } else { + patterns_list_or_vector + } } else { .SDcols = eval(colsub, parent.frame(), parent.frame()) # allow filtering via function in .SDcols, #3950 diff --git a/R/fmelt.R b/R/fmelt.R index 83963bebc..02011ee05 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -30,6 +30,7 @@ patterns = function(..., cols=character(0L)) { # replace with lengths when R 3.2.0 dependency arrives if (length(idx <- which(sapply(matched, length) == 0L))) stopf('Pattern(s) not found: [%s]', brackify(p[idx])) + if(length(matched)==1)return(matched[[1]]) matched } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a0f5ebfe4..b0b16217e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17734,9 +17734,11 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) -test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2))) # measure.vars named list length=1, #5065 -test(2182.71, melt(DT.wide, measure.vars = list("a2")), data.table(b1=1, b2=2, variable=factor(1), value=2)) -test(2182.72, melt(DT.wide, measure.vars = c("a2")), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.72, 
melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) +test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) +test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues diff --git a/src/fmelt.c b/src/fmelt.c index 9990da2fc..bca7904ca 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -293,7 +293,8 @@ struct processData { totlen, // of output/long DT result of melt operation. nrow; // of input/wide DT to be melted. SEXPTYPE *maxtype; - Rboolean narm; // remove missing values? + Rboolean measure_is_list, + narm; // remove missing values? }; static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) { @@ -302,6 +303,11 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXPTYPE type; data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); + if(!isNull(measure) && isNewList(measure)){ + data->measure_is_list = TRUE; + }else{ + data->measure_is_list = FALSE; + } data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); data->lids = length(data->idcols); @@ -594,7 +600,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (isNull(data->variable_table)) { if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (data->lvalues == 1) {//one value column to output. + if (!data->measure_is_list) {//one value column to output. const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; j<data->lmax; ++j) { const int thislen = data->narm ? 
length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -613,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (data->lvalues == 1) {//one value column to output. + if (!data->measure_is_list) {//one value column to output. SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; From 9659f7af025b123f14c071d58c2c10bdea17e01c Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 1 Nov 2021 23:21:12 -0700 Subject: [PATCH 003/106] inconsistent variable between measure.vars with list of length=1 and length>1 --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5faf40723..95319616c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -233,7 +233,7 @@ 12. `as.data.table(table(NULL))` now returns `data.table(NULL)` rather than error `attempt to set an attribute on NULL`, [#4179](https://github.com/Rdatatable/data.table/issues/4179). The result differs slightly to `as.data.frame(table(NULL))` (0-row, 1-column) because 0-column works better with other `data.table` functions like `rbindlist()`. Thanks to Michael Chirico for the report and fix. -13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. +13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Another inconsistency was between length=1 list and length>1 list, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. 14. 
`by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. From eff26227a7b86bc1d5dc870b40d19ad29a82c035 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 1 Nov 2021 23:34:04 -0700 Subject: [PATCH 004/106] link related issue --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0b16217e..88b8f390d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17735,6 +17735,7 @@ test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +# consistency between measure.vars=list with length=1 and length>1, #5209 test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) From afdcab77df9dd4ae6fa318d5e86333d305a0f00f Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 15 Feb 2024 15:20:30 -0700 Subject: [PATCH 005/106] move news item up --- NEWS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7d06a925c..609a0af00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ # data.table 
[v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) +## BUG FIXES + +X. `melt` with a list for `measure.vars` was inconsistent between length=1 list and length>1 list, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE @@ -321,7 +325,7 @@ 12. `as.data.table(table(NULL))` now returns `data.table(NULL)` rather than error `attempt to set an attribute on NULL`, [#4179](https://github.com/Rdatatable/data.table/issues/4179). The result differs slightly to `as.data.frame(table(NULL))` (0-row, 1-column) because 0-column works better with other `data.table` functions like `rbindlist()`. Thanks to Michael Chirico for the report and fix. -13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Another inconsistency was between length=1 list and length>1 list, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. +13. `melt` with a list for `measure.vars` would output `variable` inconsistently between `na.rm=TRUE` and `FALSE`, [#4455](https://github.com/Rdatatable/data.table/issues/4455). Thanks to @tdhock for reporting and fixing. 14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. 
From 33d7b2ad6f09d4d900e16139077716353671e7af Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 15 Feb 2024 15:33:47 -0700 Subject: [PATCH 006/106] add test suggested by @mnazarov --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6b15f52a4..87c6395f8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17240,6 +17240,7 @@ test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) +test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="1", n=10))#thanks @mnazarov ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues From 40afa84c37063a17ec8a36e9eefa3507745d9923 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 13 Mar 2024 05:59:43 +0100 Subject: [PATCH 007/106] Fix `onLoad` for `devtools::load_all()` (#5828) * check if builtPath exists * fix builtPath * update dev README * make file.path platform independent Co-authored-by: Michael Chirico * soften language around the use of cc() * cache r versino * clarify dev README * update comment * rename variable for consistency --------- Co-authored-by: Michael Chirico Co-authored-by: Tyson Barrett Co-authored-by: Michael Chirico --- .dev/README.md | 10 ++++++++++ R/onLoad.R | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.dev/README.md b/.dev/README.md index c0eb9dfa3..9184793b8 100644 --- a/.dev/README.md +++ b/.dev/README.md 
@@ -1,5 +1,15 @@ # data.table developer +## Setup + +To use the optional helper function `cc()`, one needs to set up the project path and source `.dev/cc.R` to use `cc()` conveniently. This works through creating an additional `.Rprofile` in the `data.table` directory. + +```r +# content of .Rprofile in the package directory +Sys.setenv(PROJ_PATH="~/git/data.table") +source(".dev/cc.R") +``` + ## Utilities ### [`cc.R`](./cc.R) diff --git a/R/onLoad.R b/R/onLoad.R index 08f074c67..f8766e624 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -15,6 +15,7 @@ .Last.updated = vector("integer", 1L) # exported variable; number of rows updated by the last := or set(), #1885 .onLoad = function(libname, pkgname) { + session_r_version = base::getRversion() # Runs when loaded but not attached to search() path; e.g., when a package just Imports (not Depends on) data.table if (!exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # check when installed package is loaded but skip when developing the package with cc() @@ -25,9 +26,9 @@ # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478 stopf("The data_table.%s version (%s) does not match the package (%s). Please close all R sessions to release the old %s and reinstall data.table in a fresh R session. The root cause is that R's package installer can in some unconfirmed circumstances leave a package in a state that is apparently functional but where new R code is calling old C code silently: https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478. Once a package is in this mismatch state it may produce wrong results silently until you next upgrade the package. Please help by adding precise circumstances to 17478 to move the status to confirmed. This mismatch between R and C code can happen with any package not just data.table. 
It is just that data.table has added this check.", dll, dllV, RV, toupper(dll)) } - builtUsing = readRDS(system.file("Meta/package.rds",package="data.table"))$Built$R - if (!identical(base::getRversion()>="4.0.0", builtUsing>="4.0.0")) { - stopf("This is R %s but data.table has been installed using R %s. The major version must match. Please reinstall data.table.", base::getRversion(), builtUsing) + builtPath = system.file("Meta", "package.rds", package="data.table") + if (builtPath != "" && !identical(session_r_version>="4.0.0", (build_r_version <- readRDS(builtPath)$Built$R)>="4.0.0")) { + stopf("This is R %s but data.table has been installed using R %s. The major version must match. Please reinstall data.table.", session_r_version, build_r_version) # the if(R>=4.0.0) in NAMESPACE when registering S3 methods rbind.data.table and cbind.data.table happens on install; #3968 } } @@ -37,7 +38,7 @@ # be conditional too: registering the S3 methods in R before 4.0.0 causes this workaround to no longer work. However, the R # syntax available to use in NAMESPACE is very limited (can't call data.table() in it in a capability test, for example). # This version number ("4.0.0") must be precisely the same as used in NAMESPACE; see PR for #3948. - if (base::getRversion() < "4.0.0") { + if (session_r_version < "4.0.0") { # continue to support R<4.0.0 # If R 3.6.2 (not yet released) includes the c|rbind S3 dispatch fix, then this workaround still works. 
tt = base::cbind.data.frame From dbcb6564926395a682b9cb06da0c044fbd257bdd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 14 Mar 2024 07:24:52 -0700 Subject: [PATCH 008/106] Add options= to test() (#5996) * Add options= to test() document in Rd * Add options= to test() document in Rd * missed staged chunk --- R/test.data.table.R | 8 ++++++-- man/test.Rd | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 4d2ab9410..748b61505 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -249,7 +249,11 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) { + if (!is.null(options)) { + old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience + on.exit(options(old_options), add=TRUE) + } # Usage: # i) tests that x equals y when both x and y are supplied, the most common usage # ii) tests that x is TRUE when y isn't supplied @@ -280,7 +284,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no foreign = get("foreign", parent.frame()) showProgress = get("showProgress", parent.frame()) time = nTest = RSS = NULL # to avoid 'no visible binding' note - if (num>0) on.exit( { + if (num>0) on.exit( add=TRUE, { took = proc.time()[3L]-lasttime # so that prep time between tests is attributed to the following test timings[as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE] if (memtest) { diff --git a/man/test.Rd b/man/test.Rd index c9763b247..ddf1198bf 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -7,7 +7,8 @@ \usage{ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, - output = NULL, notOutput = NULL, ignore.warning = NULL) + output = NULL, notOutput = 
NULL, ignore.warning = NULL, + options = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -19,6 +20,7 @@ test(num, x, y = TRUE, \item{output}{ If you are testing the printing/console output behaviour; e.g. with \code{verbose=TRUE} or \code{options(datatable.verbose=TRUE)}. Again, regex-compatible and case sensitive. } \item{notOutput}{ Or if you are testing that a feature does \emph{not} print particular console output. Case insensitive (unlike output) so that the test does not incorrectly pass just because the string is not found due to case. } \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } +\item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. 
} } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From f92aee69e6535b77b9f98b0ccc02c0ebdfe84911 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 14 Mar 2024 09:11:11 -0700 Subject: [PATCH 009/106] dcast only computes default fill if necessary (#5549) * delete old commented code * new test for no warning fails * only compute default fill if missing cells present * any_NA_int helper * bugfix #5512 * Update src/fcast.c Co-authored-by: Xianying Tan * Update src/fcast.c Co-authored-by: Xianying Tan * mention warning text * const int args * add back ithiscol * get pointer before for loop * add test case from Michael * test min(dbl) and no warning when fill specified * Revert "delete old commented code" This reverts commit 2886c4f41e204020df6c8848e4fba47bc805e73e. * use suggestions from Michael * rm inline any_NA_int since that causes install to fail * clarify comment * link 5390 * mymin test fails * compute some_fill using anyNA in R then pass to C * Update R/fcast.R Co-authored-by: Michael Chirico * Update R/fcast.R Co-authored-by: Michael Chirico * dat_for_default_fill is zero-row dt * !length instead of length==0 * new dcast tests with fill=character * dat_for_default_fill is dat again, not 0-row, because that causes some test failure --------- Co-authored-by: Xianying Tan Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fcast.R | 22 +++++++++++++--------- inst/tests/tests.Rraw | 15 +++++++++++++++ man/dcast.data.table.Rd | 2 +- src/data.table.h | 2 +- src/fcast.c | 28 +++++++++++++++++----------- 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index acefbbf4e..dff623278 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ 3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). 
Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix. +4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/fcast.R b/R/fcast.R index bb59d8409..7c0766cfe 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -152,7 +152,6 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., dat = .Call(CsubsetDT, dat, idx, seq_along(dat)) } fun.call = m[["fun.aggregate"]] - fill.default = NULL if (is.null(fun.call)) { oo = forderv(dat, by=varnames, retGrp=TRUE) if (attr(oo, 'maxgrpn', exact=TRUE) > 1L) { @@ -160,15 +159,15 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., fun.call = quote(length) } } - if (!is.null(fun.call)) { + dat_for_default_fill = dat + run_agg_funs = !is.null(fun.call) + if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) - errmsg = gettext("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. 
Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") - if (is.null(fill)) { - fill.default = suppressWarnings(dat[0L][, eval(fun.call)]) - # tryCatch(fill.default <- dat[0L][, eval(fun.call)], error = function(x) stopf(errmsg)) - if (nrow(fill.default) != 1L) stopf(errmsg) + maybe_err = function(list.of.columns) { + if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + list.of.columns } - dat = dat[, eval(fun.call), by=c(varnames)] + dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] } order_ = function(x) { o = forderv(x, retGrp=TRUE, sort=TRUE) @@ -211,7 +210,12 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., } maplen = vapply_1i(mapunique, length) idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join. 
- ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call)) + some_fill = anyNA(idx) + fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))] + if (run_agg_funs && is.null(fill) && some_fill) { + fill.default = dat_for_default_fill[0L][, maybe_err(eval(fun.call))] + } + ans = .Call(Cfcast, lhs, val, maplen[[1L]], maplen[[2L]], idx, fill, fill.default, is.null(fun.call), some_fill) allcols = do.call("paste", c(rhs, sep=sep)) if (length(valnames) > 1L) allcols = do.call("paste", if (identical(".", allcols)) list(valnames, sep=sep) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1b507112c..4c06fac21 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3729,6 +3729,21 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2],20,TRUE), d1=runif(20), d2=1L) test(1102.38, names(dcast(DT, x ~ y + z, fun.aggregate=length, value.var = "d2", sep=".")), c("x", "1.a", "1.b", "2.a", "2.b")) + + # test for #5512, only compute default fill if needed. + DT = data.table(chr=c("a","b","b"), int=1:3, dbl=as.double(4:6)) + mymin <- function(x){ + if (!length(x)) stop("calling mymin on vector of length 0") + min(x) + } + test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed. + ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. 
+ test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion. + test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", "NAs introduced by coercion")) + test(1102.44, dcast(DT, int ~ ., value.var="dbl", fill="ignored"), data.table(int=1:3, .=c(4,5,6), key="int")) + } # test for freading commands diff --git a/man/dcast.data.table.Rd b/man/dcast.data.table.Rd index 2aa265a96..8d35c199d 100644 --- a/man/dcast.data.table.Rd +++ b/man/dcast.data.table.Rd @@ -22,7 +22,7 @@ \item{\dots}{Any other arguments that may be passed to the aggregating function.} \item{margins}{Not implemented yet. Should take variable names to compute margins on. A value of \code{TRUE} would compute all margins.} \item{subset}{Specified if casting should be done on a subset of the data. Ex: \code{subset = .(col1 <= 5)} or \code{subset = .(variable != "January")}.} - \item{fill}{Value with which to fill missing cells. If \code{fun.aggregate} is present, takes the value by applying the function on a 0-length vector.} + \item{fill}{Value with which to fill missing cells. If \code{fill=NULL} and missing cells are present, then \code{fun.aggregate} is used on a 0-length vector to obtain a fill value.} \item{drop}{\code{FALSE} will cast by including all missing combinations. \code{c(FALSE, TRUE)} will only include all missing combinations of formula \code{LHS}; \code{c(TRUE, FALSE)} will only include all missing combinations of formula RHS. 
See Examples.} diff --git a/src/data.table.h b/src/data.table.h index 0a6eb207a..da82af7be 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -289,7 +289,7 @@ SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); SEXP fmelt(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP fcast(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP issorted(SEXP, SEXP); SEXP gforce(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP gsum(SEXP, SEXP); diff --git a/src/fcast.c b/src/fcast.c index 8c49c6fe2..d049711bf 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -4,7 +4,7 @@ // raise(SIGINT); // TO DO: margins -SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg) { +SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fill, SEXP fill_d, SEXP is_agg, SEXP some_fillArg) { int nrows=INTEGER(nrowArg)[0], ncols=INTEGER(ncolArg)[0]; int nlhs=length(lhs), nval=length(val), *idx = INTEGER(idxArg); SEXP target; @@ -15,24 +15,28 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SET_VECTOR_ELT(ans, i, VECTOR_ELT(lhs, i)); } // get val cols + bool some_fill = LOGICAL(some_fillArg)[0]; for (int i=0; i Date: Thu, 14 Mar 2024 11:35:06 -0700 Subject: [PATCH 010/106] Need importClassesFrom(methods, "[") after removing blanket import(methods) (#6001) * need importClassesFrom * revert methods:: change, not necessary --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index b9872ee7e..20601c9cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ useDynLib("data_table", .registration=TRUE) ## For S4-ization importFrom(methods, "S3Part<-", slotNames) +importMethodsFrom(methods, "[") exportClasses(data.table, IDate, ITime) ## From b8919d05aa2191815623c0fa8231bf6b36228732 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger 
<52290390+ben-schwen@users.noreply.github.com> Date: Sat, 16 Mar 2024 01:42:26 +0100 Subject: [PATCH 011/106] `fwrite(..., row.names=TRUE)` print row.names instead of row numbers (#5364) * fwrite(..., row.names=TRUE) print row.names instead of row numbers * add issue comment to tests + code * restore NEWS notes * spare [ * Update NEWS.md Co-authored-by: Michael Chirico * Update R/fwrite.R Co-authored-by: Michael Chirico * improve test readability\nalso switched test order to make it easier * ... not needed * data.table style * One more local var to focus on what changes between tests --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fwrite.R | 8 +++++++- inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index dff623278..6dfaa1814 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ 4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. +5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 
diff --git a/R/fwrite.R b/R/fwrite.R index 20f1c70f5..b13b0afb7 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -38,7 +38,13 @@ fwrite = function(x, file="", append=FALSE, quote="auto", # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix messagef("x being coerced from class: matrix to data.table") - x = as.data.table(x) + # keep row.names for matrix input #5315 + if (row.names && !is.null(rownames(x))) { + row.names = FALSE + x = as.data.table(x, keep.rownames="") + } else { + x = as.data.table(x) + } } stopifnot(is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4c06fac21..169a715a8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9912,6 +9912,14 @@ test(1658.37, fwrite(matrix("foo"), quote=TRUE), output='"V1"\n.*"foo"', message test(1658.38, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), output = '"V1","V2"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") test(1658.39, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table") test(1658.40, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") +# keep row.names for matrix input #5315 +M = matrix(1:4, nrow=2) +coercion_msg = "x being coerced from class: matrix to data.table" +test(1658.401, fwrite(M, row.names=TRUE), output='"",V1,V2\n1,1,3\n2,2,4', message=coercion_msg) +test(1658.402, fwrite(M, row.names=FALSE), output='V1,V2\n1,3\n2,4', message=coercion_msg) +rownames(M) = c("a","b") +test(1658.403, fwrite(M, row.names=TRUE), output='"",V1,V2\na,1,3\nb,2,4', message=coercion_msg) +test(1658.404, fwrite(M, row.names=FALSE), output='V1,V2\n1,3\n2,4', message=coercion_msg) # fwrite compress if (!haszlib()) { From 
8de09b27c37bd3513c84a6294116f6dccef795ba Mon Sep 17 00:00:00 2001 From: Florian Kohrt Date: Sat, 16 Mar 2024 05:49:49 +0100 Subject: [PATCH 012/106] Document data.tables with no columns (#5615) * Document data.tables with no columns * Mention that rows are children of columns --------- Co-authored-by: Michael Chirico --- vignettes/datatable-faq.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 16436446f..97c11aeba 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -400,6 +400,7 @@ A key advantage of column vectors in R is that they are _ordered_, unlike SQL[^2 - `check.names` is by default `TRUE` in `data.frame` but `FALSE` in data.table, for convenience. - `data.table` has always set `stringsAsFactors=FALSE` by default. In R 4.0.0 (Apr 2020), `data.frame`'s default was changed from `TRUE` to `FALSE` and there is no longer a difference in this regard; see [stringsAsFactors, Kurt Hornik, Feb 2020](https://developer.r-project.org/Blog/public/2020/02/16/stringsasfactors/). - Atomic vectors in `list` columns are collapsed when printed using `", "` in `data.frame`, but `","` in data.table with a trailing comma after the 6th item to avoid accidental printing of large embedded objects. + - Unlike data.frames a data.table cannot store rows with no columns, as rows are considered to be the children of columns: `nrow(DF[, 0])` returns the number of rows, while `nrow(DT[, 0])` always returns 0; but see issue [#2422](https://github.com/Rdatatable/data.table/issues/2422). In `[.data.frame` we very often set `drop = FALSE`. When we forget, bugs can arise in edge cases where single columns are selected and all of a sudden a vector is returned rather than a single column `data.frame`. In `[.data.table` we took the opportunity to make it consistent and dropped `drop`. 
From 821c8f98ea25cc6bbdf4b8e7d75f9ddcedb81a28 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sat, 16 Mar 2024 16:43:33 +0100 Subject: [PATCH 013/106] Transpose(dt) allows to return list without promoting elements to maxtype (#5805) * add feature * change fill * undup code * update arguments * add man * add tests * update usage docs * add coverage * add factors test * update tests for factors * add NEWS * update news * add example to docs * update docs * Update NEWS.md Co-authored-by: Michael Chirico * remove extra blank line * ease t/f error * rm blank line * restore logical case * reordering test case numbers * fix LGL case * use unlist as proper action * move NEWS * fix doc * rm blank line in tests --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/transpose.R | 4 ++-- inst/tests/tests.Rraw | 14 +++++++++++++- man/transpose.Rd | 11 ++++++++++- src/data.table.h | 2 +- src/transpose.c | 19 ++++++++++++++----- 6 files changed, 42 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6dfaa1814..cb020d1e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,8 @@ 4. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. +5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. 
Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/transpose.R b/R/transpose.R index 115752c04..684b135d4 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -1,4 +1,4 @@ -transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL) { +transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL, list.cols=FALSE) { if (!is.null(make.names)) { stopifnot(length(make.names)==1L) if (is.character(make.names)) { @@ -14,7 +14,7 @@ transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names colnames = as.character(l[[make.names]]) l = if (is.data.table(l)) l[,-make.names,with=FALSE] else l[-make.names] } - ans = .Call(Ctranspose, l, fill, ignore.empty, keep.names) + ans = .Call(Ctranspose, l, fill, ignore.empty, keep.names, list.cols) if (!is.null(make.names)) setattr(ans, "names", c(keep.names, colnames)) else if (is.data.frame(l)) # including data.table but not plain list setattr(ans, "names", c(keep.names, paste0("V", seq_len(length(ans)-length(keep.names))))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 169a715a8..bca2c13ab 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6921,10 +6921,22 @@ ll = sapply(ll, paste, collapse=",") test(1477.07, transpose(strsplit(ll, ",", fixed=TRUE)), tstrsplit(ll, ",", fixed=TRUE)) test(1477.08, transpose(1:5), error="l must be a list") test(1477.09, transpose(list(as.complex(c(1, 1+5i)))), error="Unsupported column type") -test(1477.10, transpose(list(list(1:5))), error="Item 1 of list input is") +test(1477.10, transpose(list(x~y)), error="Item 1 of list 
input is") test(1477.11, transpose(as.list(1:5), fill=1:2), error="fill must be a length 1 vector") test(1477.12, transpose(as.list(1:5), ignore.empty=NA), error="ignore.empty should be logical TRUE/FALSE") test(1477.13, transpose(list()), list()) +# return list columns #5639 +la = list(as.list(1:3), list("a","b","c")) +lb = list(list(1L,"a"), list(2L,"b"), list(3L,"c")) +test(1477.14, transpose(list(1:3, c("a","b","c")), list.cols=TRUE), lb) +test(1477.15, transpose(list(1:3, c("a","b","c")), list.cols=FALSE), lapply(lb, unlist)) +test(1477.16, transpose(la, list.cols=TRUE), lb) +test(1477.17, transpose(lb, list.cols=TRUE), la) +test(1477.18, transpose(list(list(1L,"a"), list(2L), list(3L,"c")), list.cols=TRUE, fill="b"), la) +test(1477.19, transpose(list(1:2, c("a","b","c")), list.cols=TRUE, fill=3L), lb) +test(1477.20, transpose(list(factor(letters[1:3])), list.cols=TRUE), list(list("a"), list("b"), list("c"))) +test(1477.21, transpose(list(factor(letters[1:3])), list.cols=FALSE), list("a", "b", "c")) +test(1477.22, transpose(la, list.cols=NA), error="list.cols should be logical TRUE/FALSE.") # #480 `setDT` and 'lapply' ll = list(data.frame(a=1), data.frame(x=1, y=2), NULL, list()) diff --git a/man/transpose.Rd b/man/transpose.Rd index 61a2d1dd1..1d54ddbd0 100644 --- a/man/transpose.Rd +++ b/man/transpose.Rd @@ -6,7 +6,7 @@ } \usage{ -transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL) +transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL, list.cols=FALSE) } \arguments{ \item{l}{ A list, data.frame or data.table. } @@ -14,6 +14,7 @@ transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL) \item{ignore.empty}{Default is \code{FALSE}. \code{TRUE} will ignore length-0 list elements.} \item{keep.names}{The name of the first column in the result containing the names of the input; e.g. \code{keep.names="rn"}. 
By default \code{NULL} and the names of the input are discarded.} \item{make.names}{The name or number of a column in the input to use as names of the output; e.g. \code{make.names="rn"}. By default \code{NULL} and default names are given to the output columns.} + \item{list.cols}{Default is \code{FALSE}. \code{TRUE} will avoid promoting types and return columns of type \code{list} instead. \code{factor} will always be cast to \code{character}.} } \details{ The list elements (or columns of \code{data.frame}/\code{data.table}) should be all \code{atomic}. If list elements are of unequal lengths, the value provided in \code{fill} will be used so that the resulting list always has all elements of identical lengths. The class of input object is also preserved in the transposed result. @@ -38,6 +39,14 @@ setDT(transpose(ll, fill=0))[] DT = data.table(x=1:5, y=6:10) transpose(DT) +DT = data.table(x=1:3, y=c("a","b","c")) +transpose(DT, list.cols=TRUE) + +# base R equivalent of transpose +l = list(1:3, c("a", "b", "c")) +lapply(seq(length(l[[1]])), function(x) lapply(l, `[[`, x)) +transpose(l, list.cols=TRUE) + ll = list(nm=c('x', 'y'), 1:2, 3:4) transpose(ll, make.names="nm") } diff --git a/src/data.table.h b/src/data.table.h index da82af7be..21b7e30e0 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -306,7 +306,7 @@ SEXP lookup(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP overlaps(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP whichwrapper(SEXP, SEXP); SEXP shift(SEXP, SEXP, SEXP, SEXP); -SEXP transpose(SEXP, SEXP, SEXP, SEXP); +SEXP transpose(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP anyNA(SEXP, SEXP); SEXP setlevels(SEXP, SEXP, SEXP); SEXP rleid(SEXP, SEXP); diff --git a/src/transpose.c b/src/transpose.c index 6bc399bf3..f291cf0e7 100644 --- a/src/transpose.c +++ b/src/transpose.c @@ -2,7 +2,7 @@ #include #include -SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) { +SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg, SEXP 
listColsArg) { int nprotect=0; if (!isNewList(l)) @@ -18,14 +18,17 @@ SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) { if (length(fill) != 1) error(_("fill must be a length 1 vector, such as the default NA")); R_len_t ln = LENGTH(l); + if (!IS_TRUE_OR_FALSE(listColsArg)) + error(_("list.cols should be logical TRUE/FALSE.")); + bool listCol = LOGICAL(listColsArg)[0]; // preprocessing int maxlen=0, zerolen=0; SEXPTYPE maxtype=0; for (int i=0; imaxlen) maxlen=len; zerolen += (len==0); @@ -33,8 +36,8 @@ SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) { if (isFactor(li)) type=STRSXP; if (type>maxtype) maxtype=type; } + if (listCol) maxtype=VECSXP; // need to keep preprocessing for zerolen fill = PROTECT(coerceVector(fill, maxtype)); nprotect++; - SEXP ans = PROTECT(allocVector(VECSXP, maxlen+rn)); nprotect++; int anslen = (ignore) ? (ln - zerolen) : ln; if (rn) { @@ -54,7 +57,7 @@ SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) { const int len = length(li); if (ignore && len==0) continue; if (TYPEOF(li) != maxtype) { - li = PROTECT(isFactor(li) ? asCharacterFactor(li) : coerceVector(li, maxtype)); + li = PROTECT(isFactor(li) ? (listCol ? 
coerceVector(asCharacterFactor(li), VECSXP) : asCharacterFactor(li)) : coerceVector(li, maxtype)); } else PROTECT(li); // extra PROTECT just to help rchk by avoiding two counter variables switch (maxtype) { case LGLSXP : { @@ -84,6 +87,12 @@ SEXP transpose(SEXP l, SEXP fill, SEXP ignoreArg, SEXP keepNamesArg) { SET_STRING_ELT(ansp[j+rn], k, j Date: Sun, 17 Mar 2024 00:38:39 +0530 Subject: [PATCH 014/106] Documentation for char.trunc('datatable.prettyprint.char') (#6005) * Documentation for char.trunc('datatable.prettyprint.char') * small fix * Mentioned trunc.char in 'print.data.table' function * Improved explanation * Update man/print.data.table.Rd Co-authored-by: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> * Update man/print.data.table.Rd Co-authored-by: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> * Update man/print.data.table.Rd Co-authored-by: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> * added description, deleted from alias, usage, args * final touches * removed name of private function * removing space * Final tweaks --------- Co-authored-by: nitish jha Co-authored-by: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> Co-authored-by: Michael Chirico --- man/print.data.table.Rd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index b4929e789..304bbc999 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -13,6 +13,8 @@ Key enhancements include automatic output compression of many observations and concise column-wise \code{class} summary. \code{format_col} and \code{format_list_item} generics provide flexibility for end-users to define custom printing methods for generic classes. + + Note also the option \code{datatable.prettyprint.char}; character columns entries exceeding this limit will be truncated, with \code{...} indicating the truncation. 
} \usage{ \method{print}{data.table}(x, @@ -98,6 +100,14 @@ print(DT, trunc.cols=TRUE) options(old_width) + # `char.trunc` will truncate the strings if their lengths exceed the given limit: `datatable.prettyprint.char` + # For example: + + old = options(datatable.prettyprint.char=5L) + DT = data.table(x=1:2, y=c("abcdefghij", "klmnopqrstuv")) + DT + options(old) + # Formatting customization format_col.complex = function(x, ...) sprintf('(\%.1f, \%.1fi)', Re(x), Im(x)) x = data.table(z = c(1 + 3i, 2 - 1i, pi + 2.718i)) From 3a4ec47e0f77e73a26d5ceb8fade0293d369f0d4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 18 Mar 2024 07:17:38 -0700 Subject: [PATCH 015/106] Better test failure with non-ASCII characters (#6007) * better test failure with non-ASCII characters * intentional fail to check output on Windows * revert to merge * Update test.data.table.R --- R/test.data.table.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index 748b61505..7ed8992d3 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -395,6 +395,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no catf("Test %s did not produce correct output:\n", numStr) catf("Expected: <<%s>>\n", encodeString(output)) # \n printed as '\\n' so the two lines of output can be compared vertically catf("Observed: <<%s>>\n", encodeString(out)) + if (anyNonAscii(output) || anyNonAscii((out))) { + catf("Expected (raw): <<%s>>\n", paste(charToRaw(output), collapse = " ")) + catf("Observed (raw): <<%s>>\n", paste(charToRaw(out), collapse = " ")) + } fail = TRUE # nocov end } @@ -403,6 +407,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no catf("Test %s produced output but should not have:\n", numStr) catf("Expected absent (case insensitive): <<%s>>\n", encodeString(notOutput)) catf("Observed: <<%s>>\n", encodeString(out)) + if (anyNonAscii(notOutput) || anyNonAscii((out))) { + 
catf("Expected absent (raw): <<%s>>\n", paste(charToRaw(notOutput), collapse = " ")) + catf("Observed (raw): <<%s>>\n", paste(charToRaw(out), collapse = " ")) + } fail = TRUE # nocov end } @@ -448,6 +456,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # head.matrix doesn't restrict columns if (length(d <- dim(x))) do.call(`[`, c(list(x, drop = FALSE), lapply(pmin(d, 6L), seq_len))) else print(head(x)) + if (typeof(x) == 'character' && anyNonAscii(x)) { + cat("Non-ASCII string detected, raw representation:\n") + print(lapply(head(x), charToRaw)) + } } } failPrint(x, deparse(xsub)) @@ -466,3 +478,4 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no invisible(!fail) } +anyNonAscii = function(x) anyNA(iconv(x, to="ASCII")) # nocov From 54f904831cafc37ba841958fbdbc614b8baac4c8 Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:46:54 +0530 Subject: [PATCH 016/106] reformatting text (#6011) Co-authored-by: nitish jha --- man/print.data.table.Rd | 3 ++- man/transpose.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index 304bbc999..bda7a9b78 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -100,7 +100,8 @@ print(DT, trunc.cols=TRUE) options(old_width) - # `char.trunc` will truncate the strings if their lengths exceed the given limit: `datatable.prettyprint.char` + # `char.trunc` will truncate the strings, + # if their lengths exceed the given limit: `datatable.prettyprint.char` # For example: old = options(datatable.prettyprint.char=5L) diff --git a/man/transpose.Rd b/man/transpose.Rd index 1d54ddbd0..a8d8ca44e 100644 --- a/man/transpose.Rd +++ b/man/transpose.Rd @@ -6,7 +6,8 @@ } \usage{ -transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names=NULL, list.cols=FALSE) +transpose(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, + 
make.names=NULL, list.cols=FALSE) } \arguments{ \item{l}{ A list, data.frame or data.table. } From 958e3dd3cba7c259220aa653bef4beb8ad74b239 Mon Sep 17 00:00:00 2001 From: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> Date: Tue, 19 Mar 2024 08:55:43 -0700 Subject: [PATCH 017/106] Added tests for DT[, .SD] retaining secondary indices, #1709 (#6012) * added tests for DT[, .SD] retaining secondary indices, #1709 * updated news.md * NEWS not needed * terminal newline --------- Co-authored-by: Michael Chirico --- inst/tests/tests.Rraw | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bca2c13ab..4370fb888 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18350,3 +18350,12 @@ if (test_bit64) { apple = data.table(id = c("a", "b", "b"), time = c(1L, 1L, 2L), y = i64v[1:3]) test(2248, dcast(apple, id ~ time, value.var = "y"), data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) } + +# Unit tests for DT[, .SD] retaining secondary indices, #1709 +DT = data.table(x=1:5, y=6:10) +setindex(DT, x) +test(2249.1, indices(DT), 'x') +test(2249.2, indices(DT[, .SD]), 'x') +setindex(DT, y) +test(2249.3, indices(DT), c('x', 'y')) +test(2249.4, indices(DT[, .SD]), c('x', 'y')) From 3eefbcaa47a1c2fc28037574aee27992d083750e Mon Sep 17 00:00:00 2001 From: Cole Miller <57992489+ColeMiller1@users.noreply.github.com> Date: Wed, 20 Mar 2024 01:33:06 -0400 Subject: [PATCH 018/106] names(.SD) should work (#4163) * Update data.table.R * Update tests.Rraw * Update data.table.R * Update tests.Rraw * Update datatable-reference-semantics.Rmd * Update assign.Rd * Update NEWS.md * Update NEWS.md * Update data.table.R * Update tests.Rraw * Update tests.Rraw * Update data.table.R * Update tests.Rraw * replace iris with raw dataset * Update tests.Rraw * update replace_names_sd and made .SD := not work * change .SD to names(.SD) * update typo; change .SD to names(.SD) * update to names(.SD) * include 
names(.SD) and fx to .SD usage I may have went too far. There's no use of ```(cols) := ...``` now but there is at least a reference to the other vignette. * Updates news to names(.SD) * Update typo. * tweak NEWS * minor grammar * jans comment * jan's comment (ii) * added "footnote" * Add is.name(e[[2L]]) * Put tests above Add new tests here * added test to test names(.SD(2)) * include .SDcols in example for assign * included .SDcols = function example * test 2138 is greater than 2137 * bad merge * Make updates per Michael's comments. * Added test where .SD is used as well as some columns not in .SD. * Mention count of reactions in issue * small copy-edit * more specific * specify LHS/RHS * Simplify implementation to probe for names(.SD) and new test * fine-tune comment --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 + R/data.table.R | 4 +- inst/tests/tests.Rraw | 30 ++++++++++++ man/assign.Rd | 3 ++ vignettes/datatable-reference-semantics.Rmd | 17 +++++++ vignettes/datatable-sd-usage.Rmd | 52 ++++++++++----------- 6 files changed, 78 insertions(+), 30 deletions(-) diff --git a/NEWS.md b/NEWS.md index cb020d1e1..d751b1652 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,8 @@ 5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. +4. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. + ## BUG FIXES 1. 
`unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index c80e89f88..f7b9b4192 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1122,8 +1122,8 @@ replace_dot_alias = function(e) { if (is.name(lhs)) { lhs = as.character(lhs) } else { - # e.g. (MyVar):= or get("MyVar"):= - lhs = eval(lhs, parent.frame(), parent.frame()) + # lhs is e.g. (MyVar) or get("MyVar") or names(.SD) || setdiff(names(.SD), cols) + lhs = eval(lhs, list(.SD = setNames(logical(length(sdvars)), sdvars)), parent.frame()) } } else { # `:=`(c2=1L,c3=2L,...) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4370fb888..fe68cc5de 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18359,3 +18359,33 @@ test(2249.2, indices(DT[, .SD]), 'x') setindex(DT, y) test(2249.3, indices(DT), c('x', 'y')) test(2249.4, indices(DT[, .SD]), c('x', 'y')) + +# make names(.SD) work - issue #795 +dt = data.table(a = 1:4, b = 5:8) +test(2250.01, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 1L], data.table(a = 1:4 * 2, b = 5:8)) +test(2250.02, dt[, names(.SD) := lapply(.SD, '*', 2), .SDcols = 2L], data.table(a = 1:4 * 2, b = 5:8 * 2)) +test(2250.03, dt[, names(.SD) := lapply(.SD, as.integer)], data.table(a = as.integer(1:4 * 2), b = as.integer(5:8 * 2))) +test(2250.04, dt[1L, names(.SD) := lapply(.SD, '+', 2L)], data.table(a = as.integer(c(4, 2:4 * 2)), b = as.integer(c(12, 6:8 * 2)))) +test(2250.05, dt[, setdiff(names(.SD), 'a') := NULL], data.table(a = as.integer(c(4, 2:4 * 2)))) +test(2250.06, dt[, c(names(.SD)) := NULL], null.data.table()) + +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +test(2250.07, dt[, names(.SD) := lapply(.SD, max), by = grp], 
data.table(a = c(2L, 2L, 3L, 4L), b = c(6L, 6L, 7L, 8L), grp = c('a', 'a', 'b', 'c'))) + +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +keep = c('a', 'b') +test(2250.08, dt[, names(.SD) := NULL, .SDcols = !keep], data.table(a = 1:4, b = 5:8)) + +dt = data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c')) +test(2250.09, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp] , data.table(a = 1:4, b = 5:8, grp = c('a', 'a', 'b', 'c'), a_max = c(2L, 2L, 3L, 4L), b_max = c(6L, 6L, 7L, 8L))) + +dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) +test(2250.10, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) +test(2250.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find function ".SD"') + +dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) +test(2250.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.table(a = 1:3 + 5:7, b = 5:7, grp = c('a', 'a', 'b'))) + + +dt = data.table(a = 1L, b = 2L, c = 3L, d = 4L, e = 5L, f = 6L) +test(2250.13, dt[, names(.SD)[1:5] := sum(.SD)], data.table(a = 21L, b = 21L, c = 21L, d = 21L, e = 21L, f = 6L)) diff --git a/man/assign.Rd b/man/assign.Rd index df255d395..62c8d6142 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -26,6 +26,9 @@ # LHS2 = RHS2, # ...), by = ...] +# 3. Multiple columns in place +# DT[i, names(.SD) := lapply(.SD, fx), by = ..., .SDcols = ...] 
+ set(x, i = NULL, j, value) } \arguments{ diff --git a/vignettes/datatable-reference-semantics.Rmd b/vignettes/datatable-reference-semantics.Rmd index 7a9990ba4..b678c390e 100644 --- a/vignettes/datatable-reference-semantics.Rmd +++ b/vignettes/datatable-reference-semantics.Rmd @@ -258,6 +258,23 @@ flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL] head(flights) ``` +#### -- How can we update multiple existing columns in place using `.SD`? + +```{r} +flights[, names(.SD) := lapply(.SD, as.factor), .SDcols = is.character] +``` +Let's clean up again and convert our newly-made factor columns back into character columns. This time we will make use of `.SDcols` accepting a function to decide which columns to include. In this case, `is.factor()` will return the columns which are factors. For more on the **S**ubset of the **D**ata, there is also an [SD Usage vignette](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-sd-usage.html). + +Sometimes, it is also nice to keep track of columns that we transform. That way, even after we convert our columns we would be able to call the specific columns we were updating. +```{r} +factor_cols <- sapply(flights, is.factor) +flights[, names(.SD) := lapply(.SD, as.character), .SDcols = factor_cols] +str(flights[, ..factor_cols]) +``` +#### {.bs-callout .bs-callout-info} + +* We also could have used `(factor_cols)` on the `LHS` instead of `names(.SD)`. + ## 3. `:=` and `copy()` `:=` modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment. 
diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 5f0348e4f..09243c820 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -77,7 +77,15 @@ The first way to impact what `.SD` is is to limit the _columns_ contained in `.S Pitching[ , .SD, .SDcols = c('W', 'L', 'G')] ``` -This is just for illustration and was pretty boring. But even this simply usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: +This is just for illustration and was pretty boring. In addition to accepting a character vector, `.SDcols` also accepts: + +1. any function such as `is.character` to filter _columns_ +2. the function^{*} `patterns()` to filter _column names_ by regular expression +3. integer and logical vectors + +*see `?patterns` for more details + +This simple usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations: ## Column Type Conversion @@ -91,52 +99,40 @@ We notice that the following columns are stored as `character` in the `Teams` da # teamIDretro: Team ID used by Retrosheet fkt = c('teamIDBR', 'teamIDlahman45', 'teamIDretro') # confirm that they're stored as `character` -Teams[ , sapply(.SD, is.character), .SDcols = fkt] +str(Teams[ , ..fkt]) ``` -If you're confused by the use of `sapply` here, note that it's quite similar for base R `data.frames`: - -```{r identify_factors_as_df} -setDF(Teams) # convert to data.frame for illustration -sapply(Teams[ , fkt], is.character) -setDT(Teams) # convert back to data.table -``` - -The key to understanding this syntax is to recall that a `data.table` (as well as a `data.frame`) can be considered as a `list` where each element is a column -- thus, `sapply`/`lapply` applies the `FUN` argument (in this case, `is.character`) to each _column_ and returns the result as `sapply`/`lapply` usually would. 
- -The syntax to now convert these columns to `factor` is very similar -- simply add the `:=` assignment operator: +The syntax to now convert these columns to `factor` is simple: ```{r assign_factors} -Teams[ , (fkt) := lapply(.SD, factor), .SDcols = fkt] +Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('teamID')] # print out the first column to demonstrate success head(unique(Teams[[fkt[1L]]])) ``` -Note that we must wrap `fkt` in parentheses `()` to force `data.table` to interpret this as column names, instead of trying to assign a column named `'fkt'`. +Note: -Actually, the `.SDcols` argument is quite flexible; above, we supplied a `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. `.SDcols` even accepts regular expression-based pattern matching. +1. The `:=` is an assignment operator to update the `data.table` in place without making a copy. See [reference semantics](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reference-semantics.html) for more. +2. The LHS, `names(.SD)`, indicates which columns we are updating - in this case we update the entire `.SD`. +3. The RHS, `lapply()`, loops through each column of the `.SD` and converts the column to a factor. +4. We use the `.SDcols` to only select columns that have pattern of `teamID`. + +Again, the `.SDcols` argument is quite flexible; above, we supplied `patterns` but we could have also supplied `fkt` or any `character` vector of column names. In other situations, it is more convenient to supply an `integer` vector of column _positions_ or a `logical` vector dictating include/exclude for each column. Finally, the use of a function to filter columns is very helpful. 
For example, we could do the following to convert all `factor` columns to `character`: ```{r sd_as_logical} -# while .SDcols accepts a logical vector, -# := does not, so we need to convert to column -# positions with which() -fkt_idx = which(sapply(Teams, is.factor)) -Teams[ , (fkt_idx) := lapply(.SD, as.character), .SDcols = fkt_idx] -head(unique(Teams[[fkt_idx[1L]]])) +fct_idx = Teams[, which(sapply(.SD, is.factor))] # column numbers to show the class changing +str(Teams[[fct_idx[1L]]]) +Teams[ , names(.SD) := lapply(.SD, as.character), .SDcols = is.factor] +str(Teams[[fct_idx[1L]]]) ``` Lastly, we can do pattern-based matching of columns in `.SDcols` to select all columns which contain `team` back to `factor`: ```{r sd_patterns} Teams[ , .SD, .SDcols = patterns('team')] - -# now convert these columns to factor; -# value = TRUE in grep() is for the LHS of := to -# get column names instead of positions -team_idx = grep('team', names(Teams), value = TRUE) -Teams[ , (team_idx) := lapply(.SD, factor), .SDcols = team_idx] +Teams[ , names(.SD) := lapply(.SD, factor), .SDcols = patterns('team')] ``` ** A proviso to the above: _explicitly_ using column numbers (like `DT[ , (1) := rnorm(.N)]`) is bad practice and can lead to silently corrupted code over time if column positions change. Even implicitly using numbers can be dangerous if we don't keep smart/strict control over the ordering of when we create the numbered index and when we use it. 
From 7cab6f1fc1720589c25243563e9c3e4ae725e187 Mon Sep 17 00:00:00 2001 From: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> Date: Wed, 20 Mar 2024 12:23:03 -0700 Subject: [PATCH 019/106] Updated warning for referencing a non-existent value during creation of new column (#6016) * Updated warning message in assign.c, as well as updated tests 316, 944.1 and 944.3 * added spaces for consistency * Update src/assign.c warning Co-authored-by: Michael Chirico * updated tests --------- Co-authored-by: Michael Chirico --- inst/tests/tests.Rraw | 6 +++--- src/assign.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fe68cc5de..10f365332 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1012,7 +1012,7 @@ test(313, DT[,a:=1:3], data.table(a=1:3)) # test changed in 1.12.2; can now a DT = data.table(a=20:22) test(314, {DT[,b:=23:25];DT[,c:=26:28]}, data.table(a=20:22,b=23:25,c=26:28)) # add in series test(315, DT[,c:=NULL], data.table(a=20:22,b=23:25)) # delete last -test(316, DT[,c:=NULL], data.table(a=20:22,b=23:25), warning="Column 'c' does not exist to remove") +test(316, DT[,c:=NULL], data.table(a=20:22,b=23:25), warning="Tried to assign NULL to column 'c', but this column does not exist to remove") # Test adding, removing and updating columns via [<- in one step DT = data.table(a=1:6,b=1:6,c=1:6) @@ -2809,9 +2809,9 @@ test(943, merge(X,Y,all.y=TRUE,by="a"), data.table(a=2:4,b=INT(5:6,NA),"d 1"=5:7 # Test error message about NULL type DT = data.table(NULL) -test(944.1, DT[, foo:=NULL], DT, warning="Column 'foo' does not exist to remove") +test(944.1, DT[, foo:=NULL], DT, warning="Tried to assign NULL to column 'foo', but this column does not exist to remove") test(944.2, DT[,a:=1L], data.table(a=1L)) # can now add columns to an empty data.table from v1.12.2 -test(944.3, DT[,aa:=NULL], data.table(a=1L), warning="Column 'aa' does not exist to remove") +test(944.3, DT[,aa:=NULL], 
data.table(a=1L), warning="Tried to assign NULL to column 'aa', but this column does not exist to remove") test(944.4, DT[,a:=NULL], data.table(NULL)) if (base::getRversion() >= "3.4.0") { test(944.5, typeof(structure(NULL, class=c("data.table","data.frame"))), 'list', warning="deprecated, as NULL cannot have attributes") # R warns which is good and we like diff --git a/src/assign.c b/src/assign.c index ef49fd230..2285bfba5 100644 --- a/src/assign.c +++ b/src/assign.c @@ -430,7 +430,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (newcolnum<0 || newcolnum>=length(newcolnames)) error(_("Internal error in assign.c: length(newcolnames)=%d, length(names)=%d, coln=%d"), length(newcolnames), length(names), coln); // # nocov if (isNull(thisvalue)) { - warning(_("Column '%s' does not exist to remove"),CHAR(STRING_ELT(newcolnames,newcolnum))); + warning(_("Tried to assign NULL to column '%s', but this column does not exist to remove"), CHAR(STRING_ELT(newcolnames,newcolnum))); continue; } // RHS of assignment to new column is zero length but we'll use its type to create all-NA column of that type From e6937f1d38be65135aecc4921a9b19813e8ebd00 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Mar 2024 07:38:36 +0100 Subject: [PATCH 020/106] fread: use fill with integer as ncol guess (#5119) * fread: turn off sampling for fill * fixed stop * add stopf * fread: turn off sampling for fill * added coverage * coverage * revert additional argument * fill upperbound * integer as fill argument * fix typo * fix L * add NEWS * update verbose * undo verbose * init cleanup * fix typo news * renum NEWS * add proper cleanup of overallocated columns * add tests and coverage * fix tests * add tests * cleanup * update NEWS * update tests * Refine NEWS * use integer for fill Co-authored-by: Michael Chirico * refine warning Co-authored-by: Michael Chirico * wording Co-authored-by: Michael Chirico * 
test readability * small tweak to NEWS --------- Co-authored-by: Michael Chirico --- NEWS.md | 4 +++- R/fread.R | 3 ++- inst/tests/tests.Rraw | 30 ++++++++++++++++++++++++++++++ man/fread.Rd | 2 +- src/fread.c | 42 ++++++++++++++++++++++++++++++++++++------ src/fread.h | 11 +++++++++-- src/freadR.c | 16 +++++++++++++--- 7 files changed, 94 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index d751b1652..7110f10e0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,9 @@ 5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. -4. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. +6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. + +7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. 
when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES diff --git a/R/fread.R b/R/fread.R index 8e9a11b12..b4086d155 100644 --- a/R/fread.R +++ b/R/fread.R @@ -22,11 +22,12 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) + fill=as.integer(fill) nrows=as.double(nrows) #4686 if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10f365332..2e75c8c96 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18389,3 +18389,33 @@ test(2250.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), 
.SDcols = "a"], data.t dt = data.table(a = 1L, b = 2L, c = 3L, d = 4L, e = 5L, f = 6L) test(2250.13, dt[, names(.SD)[1:5] := sum(.SD)], data.table(a = 21L, b = 21L, c = 21L, d = 21L, e = 21L, f = 6L)) + +# fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 +dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") +ans = data.table(1L, 2L, rep(c(NA, 3L), each=100L)) +test(2251.01, fread(text = dt_str, fill=FALSE), ans[1:100, -3L], warning=".*Consider fill=TRUE.*") +test(2251.02, fread(text = dt_str, fill=TRUE), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2251.03, fread(text = dt_str, fill=2L), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2251.04, fread(text = dt_str, fill=3L), ans) +test(2251.05, fread(text = dt_str, fill=5L, verbose=TRUE), ans, output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big +test(2251.06, fread(text = dt_str, fill=1000L), ans) # user guess much too big +lines = c( + "12223, University", + "12227, bridge, Sky", + "12828, Sunset", + "13801, Ground", + "14853, Tranceamerica", + "14854, San Francisco", + "15595, shibuya, Shrine", + "16126, fog, San Francisco", + "16520, California, ocean, summer, golden gate, beach, San Francisco", + "") +text = paste(lines, collapse="\n") +test(2251.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) +test(2251.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) +text = paste(lines[c(1:5, 9L, 6:8, 10L)], collapse="\n") +test(2251.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) +test(2251.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) +test(2251.11, dim(fread(text, fill=7)), c(9L, 9L)) +test(2251.12, dim(fread(text, fill=9)), c(9L, 9L)) +test(2251.13, dim(fread(text, fill=20)), c(9L, 20L)) # clean up currently only kicks in if sep!=' ' diff --git a/man/fread.Rd b/man/fread.Rd index 09ed80bd5..b431969dc 100644 --- 
a/man/fread.Rd +++ b/man/fread.Rd @@ -53,7 +53,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } - \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} + \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. 
As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index f27c17ff1..a1521fb37 100644 --- a/src/fread.c +++ b/src/fread.c @@ -55,7 +55,9 @@ static const char* const* NAstrings; static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped -static bool skipEmptyLines=false, fill=false; +static bool skipEmptyLines=false; +static int fill=0; +static int *dropFill = NULL; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -141,6 +143,7 @@ bool freadCleanup(void) free(tmpType); tmpType = NULL; free(size); size = NULL; free(colNames); colNames = NULL; + free(dropFill); dropFill = NULL; if (mmp != NULL) { // Important to unmap as OS keeps internal reference open on file. Process is not exiting as // we're a .so/.dll here. If this was a process exiting we wouldn't need to unmap. @@ -171,7 +174,7 @@ bool freadCleanup(void) stripWhite = true; skipEmptyLines = false; eol_one_r = false; - fill = false; + fill = 0; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1618,7 +1621,7 @@ int freadMain(freadMainArgs _args) { if (eol(&ch)) ch++; } firstJumpEnd = ch; // size of first 100 lines in bytes is used later for nrow estimate - fill = true; // so that blank lines are read as empty + fill = 1; // so that blank lines are read as empty ch = pos; } else { int nseps; @@ -1750,7 +1753,7 @@ int freadMain(freadMainArgs _args) { } sep = topSep; whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0)); - ncol = topNumFields; + ncol = fill > topNumFields ? 
fill : topNumFields; // overwrite user guess if estimated number is higher if (fill || sep==127) { // leave pos on the first populated line; that is start of data ch = pos; @@ -2125,6 +2128,7 @@ int freadMain(freadMainArgs _args) { int nTypeBump=0, nTypeBumpCols=0; double tRead=0, tReread=0; double thRead=0, thPush=0; // reductions of timings within the parallel region + int max_col=0; char *typeBumpMsg=NULL; size_t typeBumpMsgSize=0; int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment #define internalErrSize 1000 @@ -2218,7 +2222,7 @@ int freadMain(freadMainArgs _args) { } prepareThreadContext(&ctx); - #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) + #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) reduction(max:max_col) for (int jump = jump0; jump < nJumps; jump++) { if (stopTeam) continue; // must continue and not break. We desire not to depend on (relatively new) omp cancel directive, yet double tLast = 0.0; // thread local wallclock time at last measuring point for verbose mode only. @@ -2299,6 +2303,7 @@ int freadMain(freadMainArgs _args) { tch++; j++; } + if (j > max_col) max_col = j; //*** END HOT. START TEPID ***// if (tch==tLineStart) { skip_white(&tch); // skips \0 before eof @@ -2310,6 +2315,7 @@ int freadMain(freadMainArgs _args) { int8_t thisSize = size[j]; if (thisSize) ((char **) targets)[thisSize] += thisSize; j++; + if (j > max_col) max_col = j; if (j==ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch1 && max_col0) { + int ndropFill = ncol - max_col; + if (verbose) { + DTPRINT(_(" Provided number of fill columns: %d but only found %d\n"), ncol, max_col); + DTPRINT(_(" Dropping %d overallocated columns\n"), ndropFill); + } + dropFill = (int *)malloc((size_t)ndropFill * sizeof(int)); + int i=0; + for (int j=max_col; j>"), + if (fill>0) { + DTWARN(_("Stopped early on line %"PRIu64". 
Expected %d fields but found %d. Consider fill=%d or even more based on your knowledge of the input file. First discarded non-empty line: <<%s>>"), + (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); + } else { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } } } } diff --git a/src/fread.h b/src/fread.h index 7035615a5..1e2783643 100644 --- a/src/fread.h +++ b/src/fread.h @@ -124,8 +124,10 @@ typedef struct freadMainArgs bool skipEmptyLines; // If True, then rows are allowed to have variable number of columns, and - // all ragged rows will be filled with NAs on the right. - bool fill; + // all ragged rows will be filled with NAs on the right. Supplying integer + // argument > 1 results in setting an upper bound estimate for the number + // of columns. + int fill; // If True, then emit progress messages during the parsing. bool showProgress; @@ -348,6 +350,11 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx); void setFinalNrow(size_t nrows); +/** + * Called at the end to delete columns added due to too high user guess for fill. + */ +void dropFilledCols(int* dropArg, int ndrop); + /** * Free any srtuctures associated with the thread-local parsing context. */ diff --git a/src/freadR.c b/src/freadR.c index 6b12210f5..97fbfadac 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -45,7 +45,7 @@ static int64_t dtnrows = 0; static bool verbose = false; static bool warningsAreErrors = false; static bool oldNoDateTime = false; - +static int *dropFill; SEXP freadR( // params passed to freadMain @@ -82,7 +82,7 @@ SEXP freadR( freadMainArgs args; ncol = 0; dtnrows = 0; - + if (!isString(inputArg) || LENGTH(inputArg)!=1) error(_("Internal error: freadR input not a single character string: a filename or the data itself. 
Should have been caught at R level.")); // # nocov const char *ch = (const char *)CHAR(STRING_ELT(inputArg,0)); @@ -152,7 +152,7 @@ SEXP freadR( // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; - args.fill = LOGICAL(fillArg)[0]; + args.fill = INTEGER(fillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]); args.nth = (uint32_t)INTEGER(nThreadArg)[0]; @@ -533,6 +533,16 @@ void setFinalNrow(size_t nrow) { R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow() } +void dropFilledCols(int* dropArg, int ndelete) { + dropFill = dropArg; + int ndt=length(DT); + for (int i=0; i Date: Thu, 21 Mar 2024 04:21:45 -0700 Subject: [PATCH 021/106] Use options= in test() for rarely-touched scripts (#6015) * Add options= to test(), convert most Rraw scripts data.table spacing style document in Rd Add options= to test() document in Rd missed staged chunk convert most Rraw scripts to use test(options=) Merge branch 'master' into test-options Merge remote-tracking branch 'origin/test-options' into test-options * trailing ws --- inst/tests/benchmark.Rraw | 20 +++++++++++--------- inst/tests/programming.Rraw | 15 ++++++++------- inst/tests/types.Rraw | 10 ++++------ 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 04c5c490b..6d9f604d6 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -81,15 +81,17 @@ test(655, abs(tt1["user.self"] - tt2["user.self"])<2.0) # unoptimized tt2 takes # Test for optimisation of 'order' to 'forder'. 
set.seed(45L) DT <- data.table(x=sample(1e2, 1e6,TRUE), y=sample(1e2, 1e6,TRUE)) -old = options(datatable.optimize=Inf) -t1 = system.time(ans1 <- DT[order(x,-y)])[['elapsed']] # optimized to forder() -t2 = system.time(ans2 <- DT[base_order(x,-y)])[['elapsed']] # not optimized -test(1241.1, ans1, ans2) -if (.devtesting) test(1241.2, t1 < t2+0.1) -# 0.2 < 3.8 on Matt's laptop seems safe enough to test. -# Even so, 1241.2 has been known to fail, perhaps if system swaps and this R sessions pauses or something? -# We shouldn't have timing tests here that run on CRAN for this reason. Hence wrapping with .devtesting -options(old) +local({ + old = options(datatable.optimize=Inf) + on.exit(options(old)) + t1 = system.time(ans1 <- DT[order(x,-y)])[['elapsed']] # optimized to forder() + t2 = system.time(ans2 <- DT[base_order(x,-y)])[['elapsed']] # not optimized + test(1241.1, ans1, ans2) + if (.devtesting) test(1241.2, t1 < t2+0.1) + # 0.2 < 3.8 on Matt's laptop seems safe enough to test. + # Even so, 1241.2 has been known to fail, perhaps if system swaps and this R sessions pauses or something? + # We shouldn't have timing tests here that run on CRAN for this reason. Hence wrapping with .devtesting +}) # fwrite showProgress test 1735. Turned off as too long/big for CRAN. 
diff --git a/inst/tests/programming.Rraw b/inst/tests/programming.Rraw index 429545dcb..56d0a1a9f 100644 --- a/inst/tests/programming.Rraw +++ b/inst/tests/programming.Rraw @@ -195,37 +195,38 @@ test(5.10, list2lang(list("a", 1L, c(1L, 2L))), list(as.name("a"), 1L, c(1L,2L)) test(5.11, list2lang(list("a", 1L, call("c", 1L, 2L))), list(as.name("a"), 1L, quote(c(1L, 2L)))) # datatable.enlist -op = options(datatable.enlist=NULL) test(6.01, + options = c(datatable.enlist=NULL), substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), quote(list(int = 1L, lst = list(a, b, list(c, d))))) -options(datatable.enlist=FALSE) test(6.02, + options = c(datatable.enlist=FALSE), substitute2(list(v1 = v2, v3 = v4), list(v1 = "int", v2 = 1L, v3 = "lst", v4 = list("a", "b", list("c", "d")))), substitute(list(int = 1L, lst = lst), list(lst = list("a", "b", list("c", "d"))))) -options(datatable.enlist=NULL) test(6.03, + options = c(datatable.enlist=NULL), enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), quote(list(v1 = 1L, v2 = list(v3 = b, v4 = list(v5 = c))))) -options(datatable.enlist=FALSE) test(6.04, + options = c(datatable.enlist=FALSE), enlist(list(v1 = 1L, v2 = list(v3 = "b", v4 = list(v5 = "c")))), substitute(list(v1 = 1L, v2 = lst), list(lst=list(v3 = "b", v4 = list(v5 = "c"))))) -options(datatable.enlist=NULL) test(6.05, + options = c(datatable.enlist=NULL), substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), quote(list(V1, V2, list(V4, V5)))) -options(datatable.enlist=FALSE) test(6.06, + options = c(datatable.enlist=FALSE), substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5")))), quote(list(V1, V2, list(V4, V5)))) test(6.07, + options = c(datatable.enlist=FALSE), substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", list("V6"))))), substitute(list(V1, V2, list(V4, V5, lst)), list(lst=list("V6")))) test(6.08, + options = 
c(datatable.enlist=FALSE), substitute2(list(v1, v2, v3), list(v1="V1", v2="V2", v3=enlist(list("V4","V5", enlist(list("V6")))))), quote(list(V1, V2, list(V4, V5, list(V6))))) -options(op) # documentation examples test(7.01, substitute2(list(var1 = var2), list(var1 = "c1", var2 = 5L)), quote(list(c1 = 5L))) ## works also on names diff --git a/inst/tests/types.Rraw b/inst/tests/types.Rraw index 6b9aca84f..ee301e71d 100644 --- a/inst/tests/types.Rraw +++ b/inst/tests/types.Rraw @@ -30,12 +30,10 @@ test(2.05, testMsg(3, 2, 1), error=err) test(2.06, testMsg(23, 2, 1), warning=wrn[1L], error=err) ##test(2.07, testMsg(123, 2, 1), message=msg[1L], warning=wrn[1L], error=err) # test all messages -op = options(datatable.verbose=TRUE) -test(3.01, testMsg(0, 2, 1), as.list(rep(0L, 2L)), output=out) +test(3.01, options = c(datatable.verbose=TRUE), testMsg(0, 2, 1), as.list(rep(0L, 2L)), output=out) ##test(3.02, testMsg(1, 2, 1), as.list(rep(1L, 2L)), output=out, message=msg) -test(3.03, testMsg(2, 2, 1), as.list(rep(2L, 2L)), output=out, warning=wrn) +test(3.03, options = c(datatable.verbose=TRUE), testMsg(2, 2, 1), as.list(rep(2L, 2L)), output=out, warning=wrn) ##test(3.04, testMsg(12, 2, 1), as.list(rep(2L, 2L)), output=out, message=msg, warning=wrn) -test(3.05, testMsg(3, 2, 1), output=out[1L], error=err) -test(3.06, testMsg(23, 2, 1), output=out[1L], warning=wrn[1L], error=err) +test(3.05, options = c(datatable.verbose=TRUE), testMsg(3, 2, 1), output=out[1L], error=err) +test(3.06, options = c(datatable.verbose=TRUE), testMsg(23, 2, 1), output=out[1L], warning=wrn[1L], error=err) ##test(3.07, testMsg(123, 2, 1), output=out[1L], message=msg[1L], warning=wrn[1L], error=err) -options(op) From 566bff0fe1a10d94a494026c59eb611b90b4dc04 Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Tue, 26 Mar 2024 05:26:39 +0530 Subject: [PATCH 022/106] Enhance Error Message for using `:=` or `let` in Non-data.table-aware Environment (#6019) * 
adding error for when := is called in not data.table aware enviroment * added tests * whitespace * Update R/data.table.R Co-authored-by: Michael Chirico * Update tests.Rraw * sprintf() won't coerce symbol->character, do it explicitly * cleanup of test, better error message test --------- Co-authored-by: nitish jha Co-authored-by: Michael Chirico --- R/data.table.R | 4 ++++ inst/tests/tests.Rraw | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index f7b9b4192..24eff62d5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -145,6 +145,10 @@ replace_dot_alias = function(e) { # the drop=NULL is to sink drop argument when dispatching to [.data.frame; using '...' stops test 147 if (!cedta()) { # Fix for #500 (to do) + if (substitute(j) %iscall% c(":=", "let")) { + # Throw a specific error message + stopf("[ was called on a data.table in an environment that is not data.table-aware (i.e. cedta()), but '%s' was used, implying the owner of this call really intended for data.table methods to be called. 
See vignette('datatable-importing') for details on properly importing data.table.", as.character(substitute(j)[[1L]])) + } Nargs = nargs() - (!missing(drop)) ans = if (Nargs<3L) { `[.data.frame`(x,i) } # drop ignored anyway by DF[i] else if (missing(drop)) `[.data.frame`(x,i,j) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2e75c8c96..24b4f4c60 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18419,3 +18419,9 @@ test(2251.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) test(2251.11, dim(fread(text, fill=7)), c(9L, 9L)) test(2251.12, dim(fread(text, fill=9)), c(9L, 9L)) test(2251.13, dim(fread(text, fill=20)), c(9L, 20L)) # clean up currently only kicks in if sep!=' ' + +.datatable.aware = FALSE +dt = data.table(a = 1L) +test(2252.1, dt[, b:=2L], error = "\\[ was called on a data.table.*not data.table-aware.*':='") +test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.table-aware.*'let'") +rm(.datatable.aware) From e24663a302e3421a74b782263d367c49a18b78a3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 26 Mar 2024 09:43:58 -0700 Subject: [PATCH 023/106] use Michael wording --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 586de06e8..771547961 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,7 +36,7 @@ 5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix. -6. `melt` with a list for `measure.vars` was inconsistent between length=1 list and length>1 list, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. +6. `melt` returns an integer column for `variable` whenever `measure.vars` is a list, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). 
Thanks to @tdhock for reporting and fixing. ## NOTES From dbfd853bff3802d9677c716d622a2514a7f6dfa8 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 26 Mar 2024 09:45:10 -0700 Subject: [PATCH 024/106] Update R/fmelt.R Co-authored-by: Michael Chirico --- R/fmelt.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fmelt.R b/R/fmelt.R index 02011ee05..020b38518 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -30,7 +30,7 @@ patterns = function(..., cols=character(0L)) { # replace with lengths when R 3.2.0 dependency arrives if (length(idx <- which(sapply(matched, length) == 0L))) stopf('Pattern(s) not found: [%s]', brackify(p[idx])) - if(length(matched)==1)return(matched[[1]]) + if (length(matched) == 1L) return(matched[[1L]]) matched } From d3d00a91a28530739afade09efd86969da81aa8b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 26 Mar 2024 09:46:32 -0700 Subject: [PATCH 025/106] Update src/fmelt.c Co-authored-by: Michael Chirico --- src/fmelt.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index bca7904ca..c36ddac98 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -303,11 +303,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXPTYPE type; data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); - if(!isNull(measure) && isNewList(measure)){ - data->measure_is_list = TRUE; - }else{ - data->measure_is_list = FALSE; - } + data->measure_is_list = !isNull(measure) && isNewList(measure) ? 
TRUE : FALSE; data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); data->lids = length(data->idcols); From 548d94cda24c3b54cfd8dc4ab17d43f48496d56a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 26 Mar 2024 09:47:46 -0700 Subject: [PATCH 026/106] Rboolean->bool --- src/fmelt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fmelt.c b/src/fmelt.c index c36ddac98..2b9707bb2 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -293,7 +293,7 @@ struct processData { totlen, // of output/long DT result of melt operation. nrow; // of input/wide DT to be melted. SEXPTYPE *maxtype; - Rboolean measure_is_list, + bool measure_is_list, narm; // remove missing values? }; @@ -303,7 +303,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXPTYPE type; data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); - data->measure_is_list = !isNull(measure) && isNewList(measure) ? TRUE : FALSE; + data->measure_is_list = !isNull(measure) && isNewList(measure); data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); data->lids = length(data->idcols); From 898dce33c4f115c10c02c6a2e56855bf74dfd441 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 29 Mar 2024 17:53:19 -0700 Subject: [PATCH 027/106] Mention the forder-applicable case for a descending sort operation (#6038) --- man/setorder.Rd | 1 + 1 file changed, 1 insertion(+) diff --git a/man/setorder.Rd b/man/setorder.Rd index 6e7b59842..e1cdc40bb 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -31,6 +31,7 @@ setorder(x, \dots, na.last=FALSE) setorderv(x, cols = colnames(x), order=1L, na.last=FALSE) # optimised to use data.table's internal fast order # x[order(., na.last=TRUE)] +# x[order(., decreasing=TRUE)] } \arguments{ \item{x}{ A \code{data.table}. 
} From 096b20f1f1b9788787a826ba3d32177ed9836352 Mon Sep 17 00:00:00 2001 From: Kevin Ushey Date: Tue, 2 Apr 2024 17:42:43 -0700 Subject: [PATCH 028/106] improve OpenMP detection on macOS (#6034) * improve OpenMP detection on macOS * detect LLVM OpenMP builds * remove some comments * define _OPENMP when using LLVM libomp * fix nesting for 'echo no' * let openmp warning stand out a bit more * Always redirect to config.log, not /dev/null * NEWS * Restore deletion of all test-omp* artefacts * try and reduce diff by restoring comment * reduce diff: delete in same order * annotate 'fi' for readability --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ configure | 89 +++++++++++++++++++++++++++++++++++++++++++---------- src/init.c | 19 +++++++----- src/myomp.h | 8 +++++ 4 files changed, 94 insertions(+), 24 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7110f10e0..902f2fecc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -54,6 +54,8 @@ 7. Updated a test relying on `>` working for comparing language objects to a string, which will be deprecated by R, [#5977](https://github.com/Rdatatable/data.table/issues/5977); no user-facing effect. Thanks to R-core for continuously improving the language. +8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the expected setup. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/configure b/configure index 9cf4baf98..101607173 100755 --- a/configure +++ b/configure @@ -21,7 +21,7 @@ esac msg=0 NOZLIB=1 # if pkg-config is not available then zlib will be disabled for higher chance of compilation success -pkg-config --version >/dev/null 2>&1 +pkg-config --version >> config.log 2>&1 if [ $? 
-ne 0 ]; then echo "*** pkg-config is not installed." msg=1 @@ -33,9 +33,9 @@ else else NOZLIB=0 lib=`pkg-config --libs zlib` - expr -- "$lib" : ".*-lz$" >/dev/null # -- for FreeBSD, #4652 + expr -- "$lib" : ".*-lz$" >> config.log # -- for FreeBSD, #4652 if [ $? -ne 0 ]; then - expr -- "$lib" : ".*-lz " >/dev/null + expr -- "$lib" : ".*-lz " >> config.log # would use \b in one expr but MacOS does not support \b if [ $? -ne 0 ]; then echo "*** pkg-config is installed and 'pkg-config --exists zlib' succeeds but" @@ -81,36 +81,93 @@ int main() { } EOF -# First, try R CMD SHLIB to see if R can already compile -# things using OpenMP without any extra help from data.table -"${R_HOME}/bin/R" CMD SHLIB test-omp.c >/dev/null 2>&1 || R_NO_OPENMP=1 +detect_openmp () { -if [ "$R_NO_OPENMP" = "1" ]; then - # Compilation failed -- try forcing -fopenmp instead. - R_NO_OPENMP=0 - "${CC}" "${CFLAGS}" -fopenmp test-omp.c || R_NO_OPENMP=1 - # TODO: and then nothing seems to be done with this outcome -else - echo "R CMD SHLIB supports OpenMP without any extra hint" -fi + if [ "$(uname)" = "Linux" ]; then + + printf "%s" "* checking if R installation supports OpenMP without any extra hints... " + if "${R_HOME}/bin/R" CMD SHLIB test-omp.c >> config.log 2>&1; then + echo "yes" + export R_OPENMP_ENABLED=1 + return + else + echo "no" + fi + + + printf "%s" "* checking if R installation supports openmp with \"-fopenmp\" flag... " + if ${CC} ${CFLAGS} -fopenmp test-omp.c >> config.log 2>&1; then + echo "yes" + export PKG_CFLAGS="${PKG_CFLAGS} -fopenmp" + export R_OPENMP_ENABLED=1 + return + else + echo "no" + fi + fi # uname=Linux + + if [ "$(uname)" = "Darwin" ]; then + + # https://mac.r-project.org/openmp + printf "%s" "* checking if R installation supports OpenMP with \"-Xclang -fopenmp\" ... 
" + if CPPFLAGS="${CPPFLAGS} -Xclang -fopenmp" LDFLAGS="${LDFLAGS} -lomp" "${R_HOME}/bin/R" CMD SHLIB test-omp.c >> config.log 2>&1; then + echo "yes" + export PKG_CFLAGS="${PKG_CFLAGS} -Xclang -fopenmp" + export PKG_LIBS="${PKG_LIBS} -lomp" + export R_OPENMP_ENABLED=1 + return + else + echo "no" + fi + if [ "$(uname -m)" = "arm64" ]; then + HOMEBREW_PREFIX=/opt/homebrew + else + HOMEBREW_PREFIX=/usr/local + fi + + if [ -e "${HOMEBREW_PREFIX}/opt/libomp" ]; then + printf "%s" "* checking if libomp installation at ${HOMEBREW_PREFIX}/opt/libomp can be used... " + LIBOMP_INCLUDE="-I${HOMEBREW_PREFIX}/opt/libomp/include -Xclang -fopenmp" + LIBOMP_LINK="-L${HOMEBREW_PREFIX}/opt/libomp/lib -lomp" + if ${CC} ${CFLAGS} ${LIBOMP_INCLUDE} ${LIBOMP_LINK} test-omp.c >> config.log 2>&1; then + echo "yes" + export PKG_CFLAGS="${PKG_CFLAGS} ${LIBOMP_INCLUDE}" + export PKG_LIBS="${PKG_LIBS} ${LIBOMP_LINK}" + export R_OPENMP_ENABLED=1 + return + else + echo "no" + fi + fi + + fi # uname=Darwin + + # No support for OpenMP available + export R_OPENMP_ENABLED=0 +} + +detect_openmp # Clean up. rm -f test-omp.* a.out -# Write to Makevars -if [ "$R_NO_OPENMP" = "1" ]; then +if [ "${R_OPENMP_ENABLED}" = "0" ]; then + echo "***" echo "*** OpenMP not supported! data.table uses OpenMP to automatically" echo "*** parallelize operations like sorting, grouping, file reading, etc." echo "*** For details on how to install the necessary toolchains on your OS see:" echo "*** https://github.com/Rdatatable/data.table/wiki/Installation" echo "*** Continuing installation without OpenMP support..." + echo "***" sed -e "s|@openmp_cflags@||" src/Makevars.in > src/Makevars else sed -e "s|@openmp_cflags@|\$(SHLIB_OPENMP_CFLAGS)|" src/Makevars.in > src/Makevars fi + # retain user supplied PKG_ env variables, #4664. See comments in Makevars.in too. 
sed -e "s|@PKG_CFLAGS@|$PKG_CFLAGS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars sed -e "s|@PKG_LIBS@|$PKG_LIBS|" src/Makevars > src/Makevars.tmp && mv src/Makevars.tmp src/Makevars + # optional dependency on zlib if [ "$NOZLIB" = "1" ]; then echo "*** Compilation without compression support in fwrite" diff --git a/src/init.c b/src/init.c index 1dcc67dc6..a974a2d95 100644 --- a/src/init.c +++ b/src/init.c @@ -321,15 +321,18 @@ int GetVerbose(void) { // # nocov start SEXP hasOpenMP(void) { - // Just for use by onAttach (hence nocov) to avoid an RPRINTF from C level which isn't suppressable by CRAN - // There is now a 'grep' in CRAN_Release.cmd to detect any use of RPRINTF in init.c, which is - // why RPRINTF is capitalized in this comment to avoid that grep. - // .Platform or .Machine in R itself does not contain whether OpenMP is available because compiler and flags are per-package. - #ifdef _OPENMP + +#if defined(_OPENMP) + // gcc build of libomp return ScalarInteger(_OPENMP); // return the version; e.g. 201511 (i.e. 4.5) - #else - return ScalarInteger(0); // 0 rather than NA so that if() can be used on the result - #endif +#elif defined(KMP_VERSION_BUILD) + // LLVM builds of libomp + return ScalarInteger(KMP_VERSION_BUILD); +#else + // no OpenMP support detected + return ScalarInteger(0); +#endif + } // # nocov end diff --git a/src/myomp.h b/src/myomp.h index 57d8b5873..efb6a5454 100644 --- a/src/myomp.h +++ b/src/myomp.h @@ -1,3 +1,11 @@ + +// Compatibility define for LLVM builds of libomp. 
+#ifdef KMP_VERSION_BUILD +# ifndef _OPENMP +# define _OPENMP KMP_VERSION_BUILD +# endif +#endif + #ifdef _OPENMP #include #if _OPENMP >= 201511 From 9d73cf27e9d3e53b383b5e21db384c03d9d96528 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Apr 2024 07:42:52 -0700 Subject: [PATCH 029/106] Use early exit to get auto-print output for 'main' branch (#6021) * Use early exit to get auto-print output for 'main' branch * Update test goldens * update goldens to work with/without knitr * remove vestigial --- tests/autoprint.R | 1 - tests/autoprint.Rout.save | 7 +++---- tests/knitr.R | 14 +++++++------- tests/knitr.Rout.mock | 8 ++++++-- tests/knitr.Rout.save | 24 +++++++++++------------- tests/other.R | 24 ++++++++++++------------ 6 files changed, 39 insertions(+), 39 deletions(-) diff --git a/tests/autoprint.R b/tests/autoprint.R index 4709cd13b..1e4694668 100644 --- a/tests/autoprint.R +++ b/tests/autoprint.R @@ -43,4 +43,3 @@ DT[1,a:=10L][] # yes. ...[] == oops, forgot print(...) tryCatch(DT[,foo:=ColumnNameTypo], error=function(e) e$message) # error: not found. DT # yes DT # yes - diff --git a/tests/autoprint.Rout.save b/tests/autoprint.Rout.save index a2879ff15..41aaa8965 100644 --- a/tests/autoprint.Rout.save +++ b/tests/autoprint.Rout.save @@ -1,6 +1,6 @@ -R version 4.1.1 (2021-08-10) -- "Kick Things" -Copyright (C) 2021 The R Foundation for Statistical Computing +R version 4.3.2 (2023-10-31) -- "Eye Holes" +Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. 
@@ -136,7 +136,6 @@ NULL 1: 10 2: 10 > -> > proc.time() user system elapsed - 0.723 0.637 0.217 + 0.223 0.016 0.231 diff --git a/tests/knitr.R b/tests/knitr.R index eb9bfe1ae..678510e11 100644 --- a/tests/knitr.R +++ b/tests/knitr.R @@ -1,9 +1,9 @@ -if (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { - require(knitr) - knit("knitr.Rmd", quiet=TRUE) - cat(readLines("knitr.md"), sep="\n") - invisible(file.remove("knitr.md")) -} else { - cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") +if (!suppressPackageStartupMessages(requireNamespace("knitr", quietly=TRUE))) { + cat(readLines("knitr.Rout.mock", warn=FALSE), sep="\n") + q('no') } +library(knitr) +invisible(knit("knitr.Rmd", quiet=TRUE)) +cat(readLines("knitr.md"), sep="\n") +invisible(file.remove("knitr.md")) diff --git a/tests/knitr.Rout.mock b/tests/knitr.Rout.mock index ea37b2c46..dcea841a4 100644 --- a/tests/knitr.Rout.mock +++ b/tests/knitr.Rout.mock @@ -1,5 +1,7 @@ -Loading required package: knitr -Loading required package: data.table +> +> library(knitr) +> invisible(knit("knitr.Rmd", quiet=TRUE)) +> cat(readLines("knitr.md"), sep="\n") ```r require(data.table) # print? @@ -42,3 +44,5 @@ DT # yes ``` Some text. +> invisible(file.remove("knitr.md")) +> diff --git a/tests/knitr.Rout.save b/tests/knitr.Rout.save index 3d4b0cf72..790006d3a 100644 --- a/tests/knitr.Rout.save +++ b/tests/knitr.Rout.save @@ -1,6 +1,6 @@ -R version 4.1.1 (2021-08-10) -- "Kick Things" -Copyright (C) 2021 The R Foundation for Statistical Computing +R version 4.3.2 (2023-10-31) -- "Eye Holes" +Copyright (C) 2023 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. @@ -15,16 +15,14 @@ Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. 
-> if (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { -+ require(knitr) -+ knit("knitr.Rmd", quiet=TRUE) -+ cat(readLines("knitr.md"), sep="\n") -+ invisible(file.remove("knitr.md")) -+ } else { -+ cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") +> if (!suppressPackageStartupMessages(requireNamespace("knitr", quietly=TRUE))) { ++ cat(readLines("knitr.Rout.mock", warn=FALSE), sep="\n") ++ q('no') + } -Loading required package: knitr -Loading required package: data.table +> +> library(knitr) +> invisible(knit("knitr.Rmd", quiet=TRUE)) +> cat(readLines("knitr.md"), sep="\n") ```r require(data.table) # print? @@ -67,8 +65,8 @@ DT # yes ``` Some text. -> +> invisible(file.remove("knitr.md")) > > proc.time() user system elapsed - 0.742 0.666 0.261 + 0.247 0.044 0.283 diff --git a/tests/other.R b/tests/other.R index 46a0bf776..5b2969bbf 100644 --- a/tests/other.R +++ b/tests/other.R @@ -1,15 +1,15 @@ require(data.table) -if (as.logical(Sys.getenv("TEST_DATA_TABLE_WITH_OTHER_PACKAGES","FALSE"))) { - - options(warn=1) - # test.data.table() turns on R's warnPartial* options and currently there - # are partial argument names used in base and other packages. Without the - # options(warn=1), other.Rout just contains "There were 16 warnings (use - # warnings() to see them)". However, a print(warnings()) after test.data.table() - # just results in NULL in other.Rout. Hence options(warn=1) because that - # worked to display the warnings, not because we want them displayed at the - # time per se. - - test.data.table(script="other.Rraw") +if (!as.logical(Sys.getenv("TEST_DATA_TABLE_WITH_OTHER_PACKAGES", "FALSE"))) { + q('no') } +options(warn=1) +# test.data.table() turns on R's warnPartial* options and currently there +# are partial argument names used in base and other packages. Without the +# options(warn=1), other.Rout just contains "There were 16 warnings (use +# warnings() to see them)". 
However, a print(warnings()) after test.data.table() +# just results in NULL in other.Rout. Hence options(warn=1) because that +# worked to display the warnings, not because we want them displayed at the +# time per se. + +test.data.table(script="other.Rraw") From d35dcebc9ea022a0a80f55e0abbcc84f4adbcffc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Apr 2024 08:56:52 -0700 Subject: [PATCH 030/106] Restore GHA on Windows (#6022) * Use early exit to get auto-print output for 'main' branch * Restore GHA on Windows * Remove Appveyor config * Amend other references to Appveyor * Try setup-pandoc * restore devel on windows * setup-pandoc not needed --- .Rbuildignore | 1 - .appveyor.yml | 71 ------------------------------ .ci/README.md | 4 -- .github/workflows/R-CMD-check.yaml | 2 +- README.md | 1 - inst/tests/benchmark.Rraw | 2 +- inst/tests/tests.Rraw | 5 +-- src/snprintf.c | 2 +- 8 files changed, 5 insertions(+), 83 deletions(-) delete mode 100644 .appveyor.yml diff --git a/.Rbuildignore b/.Rbuildignore index 25e5424de..01019070c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,7 +16,6 @@ ^\.graphics$ ^\.github$ -^\.appveyor\.yml$ ^\.gitlab-ci\.yml$ ^Makefile$ diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 0f9cdf9e6..000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,71 +0,0 @@ -# DO NOT CHANGE the "init" and "install" sections below - -# Download script file from GitHub -init: - ps: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" - Import-Module '..\appveyor-tool.ps1' - -install: # this is needed. Don't remove it. Don't know what it does, though. It is not Bootstrap CSS it seems. 
- ps: Bootstrap - -skip_branch_with_pr: true - -environment: - global: - CRAN: http://cloud.r-project.org - WARNINGS_ARE_ERRORS: 1 - R_CHECK_ARGS: --as-cran --no-manual -# --no-manual to avoid error 'pdflatex is not available' -# --as-cran no longer a lot slower (now takes under 6 mins with and without); logs show _R_CHECK_CRAN_INCOMING_=FALSE which could take 5+ mins - _R_CHECK_NO_STOP_ON_TEST_ERROR_: true -# continue tests even if some script failed - _R_CHECK_TESTS_NLINES_: 0 -# Block truncation of any error messages in R CMD check -# R is 64-bit only on Windows from 4.2.0 (prior default was build and test both 32bit and 64bit) so we no longer use R_ARCH to pick one to reduce CI time in PRs - - matrix: - - - R_VERSION: release # the single Windows.zip binary (both 32bit/64bit) that users following dev version of installation instructions should click - -# - R_VERSION: devel # Never turn back on. GLCI after merge covers latest daily R-devel very well, so we shouldn't confuse and slow down PR dev cycle by measuring PRs against daily R-devel too. If a change in R-devel yesterday breaks the PR, it's very unlikely to be due to something in the PR. So we should accept the PR if it passes R-release and fix separately anything related to R-devel which we'll see from GLCI. 
- -before_build: - - cmd: ECHO no Revision metadata added to DESCRIPTION - #translate from unix: - cmd: ECHO "Revision:" $CI_BUILD_REF >> ./DESCRIPTION - -build_script: - - set _R_CHECK_FORCE_SUGGESTS_=false -# Include the subset of Suggests that test.data.table() has tests for : - - travis-tool.sh r_install bit64 - - travis-tool.sh r_install xts - - travis-tool.sh r_install nanotime - - travis-tool.sh r_install R.utils - - travis-tool.sh r_install yaml - -test_script: - - travis-tool.sh run_tests - -on_failure: - - 7z a failure.zip *.Rcheck\* - - appveyor PushArtifact failure.zip - -artifacts: - - path: '*.Rcheck\**\*.log' - name: Logs - - - path: '*.Rcheck\**\*.out' - name: Logs - - - path: '*.Rcheck\**\*.fail' - name: Logs - - - path: '*.Rcheck\**\*.Rout' - name: Logs - - - path: '\*_*.tar.gz' - name: Bits - - - path: '\*_*.zip' - name: Bits diff --git a/.ci/README.md b/.ci/README.md index d684a598e..a03c39252 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -35,10 +35,6 @@ Artifacts: TODO document -### [Appveyor](./../.appveyor.yml) - -TODO document - ## CI tools ### [`ci.R`](./ci.R) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 42bec7456..75e1231af 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -25,7 +25,7 @@ jobs: # Rdatatable has full-strength GLCI which runs after merge. So we just need a few # jobs (mainly test-coverage) to run on every commit in PRs so as to not slow down dev. # GHA does run these jobs concurrently but even so reducing the load seems like a good idea. 
- # - {os: windows-latest, r: 'release'} # currently using AppVeyor which runs 32bit in 5 min and works + - {os: windows-latest, r: 'devel'} # - {os: macOS-latest, r: 'release'} # test-coverage.yaml uses macOS - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} # - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest", http-user-agent: "R/4.1.0 (ubuntu-20.04) R (4.1.0 x86_64-pc-linux-gnu x86_64 linux-gnu) on GitHub Actions" } diff --git a/README.md b/README.md index c3465896b..2825f7c35 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![CRAN status](https://badges.cranchecks.info/flavor/release/data.table.svg)](https://cran.r-project.org/web/checks/check_results_data.table.html) [![R-CMD-check](https://github.com/Rdatatable/data.table/workflows/R-CMD-check/badge.svg)](https://github.com/Rdatatable/data.table/actions) -[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/kayjdh5qtgymhoxr/branch/master?svg=true)](https://ci.appveyor.com/project/Rdatatable/data-table) [![Codecov test coverage](https://codecov.io/github/Rdatatable/data.table/coverage.svg?branch=master)](https://app.codecov.io/github/Rdatatable/data.table?branch=master) [![GitLab CI build status](https://gitlab.com/Rdatatable/data.table/badges/master/pipeline.svg)](https://gitlab.com/Rdatatable/data.table/-/pipelines) [![downloads](https://cranlogs.r-pkg.org/badges/data.table)](https://www.rdocumentation.org/trends) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 6d9f604d6..0ba34c53b 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -348,7 +348,7 @@ ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) test(1035.21, ans, ans) # gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882 -# This runs with 2 threads in the test suite on CRAN and AppVeyor etc. +# This runs with 2 threads in the test suite on CRAN and GHA etc. 
# 2 threads are sufficient to fail before the fix. N = 20 DF = data.frame(a=rnorm(N), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 24b4f4c60..88c29c8e3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3051,9 +3051,8 @@ x = sample(1:1000,2100,replace=TRUE) # 2100 > 100 JUMPLINES * 10 NJUMP * 2 spac DT = data.table( A=as.character(x), B=1:100) DT[115, A:="123456789123456"] # row 115 is outside the 100 rows at 10 points. fwrite(DT,f<-tempfile()) -test(1016.1, sapply(suppressWarnings(fread(f,verbose=TRUE)),"class"), c(A="integer64", B="integer"), +test(1016.1, sapply(fread(f,verbose=TRUE),"class"), c(A="integer64", B="integer"), output="Rereading 1 columns.*Column 1.*A.*bumped.*int32.*int64.*<<123456789123456>>") -# suppressWarnings for 'bit64 is not installed' warning on AppVeyor where we (correctly) don't install Suggests test(1016.2, fread(f, colClasses = c(A="numeric"), verbose=TRUE), copy(DT)[,A:=as.numeric(A)], output="Rereading 0 columns") DT[90, A:="321456789123456"] # inside the sample write.table(DT,f,sep=",",row.names=FALSE,quote=FALSE) @@ -8354,7 +8353,7 @@ test(1590.07, identical(baseR, INT(1,4,2,3)) || identical(baseR, INT(2,3,1,4)) | Sys.setlocale("LC_CTYPE", ctype) Sys.setlocale("LC_COLLATE", collate) test(1590.08, Sys.getlocale(), oldlocale) # checked restored locale fully back to how it was before this test -# Now test default locale on all platforms: Windows-1252 on AppVeyor and win-builder, UTF-8 on Linux, and users running test.data.table() in their locale +# Now test default locale on all platforms: Windows-1252 on GHA and win-builder, UTF-8 on Linux, and users running test.data.table() in their locale x1 = "fa\xE7ile" Encoding(x1) = "latin1" x2 = iconv(x1, "latin1", "UTF-8") diff --git a/src/snprintf.c b/src/snprintf.c index f322931fc..407145ba4 100644 --- a/src/snprintf.c +++ b/src/snprintf.c @@ -1,7 +1,7 @@ // For translations (#4402) we need positional specifiers (%n$), a non-C99 POSIX extension. 
// On Linux and Mac, standard snprintf supports positional specifiers. // On Windows, we tried many things but just couldn't achieve linking to _sprintf_p. Even -// if we managed that on AppVeyor we may have fragility in the future on Windows given +// if we managed that on AppVeyor (now GHA) we may have fragility in the future on Windows given // varying Windows versions, compile environments/flags, and dll libraries. This may be // why R uses a third party library, trio, on Windows. But R does not expose trio for use // by packages. From 7fbc314e4dccf7cb271e84ca2d18d9dd207d0c5c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Apr 2024 09:18:14 -0700 Subject: [PATCH 031/106] ignore config.log (#6045) * ignore config.log * overwrite existing log initially instead of always appending --- .Rbuildignore | 1 + .gitignore | 1 + configure | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.Rbuildignore b/.Rbuildignore index 01019070c..5cfaa1ecb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +1,7 @@ .dir-locals.el ^\.Rprofile$ ^data\.table_.*\.tar\.gz$ +^config\.log$ ^vignettes/plots/figures$ ^\.Renviron$ ^[^/]+\.R$ diff --git a/.gitignore b/.gitignore index 7197bcbe5..9dd72b5c0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ src/Makevars # Package install inst/cc +config.log # Emacs IDE files .emacs.desktop diff --git a/configure b/configure index 101607173..c1b1f52d7 100755 --- a/configure +++ b/configure @@ -21,7 +21,7 @@ esac msg=0 NOZLIB=1 # if pkg-config is not available then zlib will be disabled for higher chance of compilation success -pkg-config --version >> config.log 2>&1 +pkg-config --version > config.log 2>&1 if [ $? -ne 0 ]; then echo "*** pkg-config is not installed." 
msg=1 From fad5b171b54d978bc1c7dae40dccb51ad5e04ab0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Apr 2024 15:09:28 -0700 Subject: [PATCH 032/106] Make tests robust to exact phrasing & language of base condition messages (#6029) * Initial work on shielding tests from dependence on base messages in English * suppress base warning * progress * finish fixing tests * restore temp changes for smoking out issues * Fix errors in English now --- R/test.data.table.R | 4 +- inst/tests/tests.Rraw | 87 +++++++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 7ed8992d3..4972ad8d3 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -360,8 +360,8 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no if (type=="warning" && length(observed) && !is.null(ignore.warning)) { # if a warning containing this string occurs, ignore it. First need for #4182 where warning about 'timedatectl' only # occurs in R 3.4, and maybe only on docker too not for users running test.data.table(). - stopifnot(length(ignore.warning)==1L, is.character(ignore.warning), !is.na(ignore.warning), nchar(ignore.warning)>=1L) - observed = grep(ignore.warning, observed, value=TRUE, invert=TRUE) + stopifnot(is.character(ignore.warning), !anyNA(ignore.warning), nchar(ignore.warning)>=1L) + for (msg in ignore.warning) observed = grep(msg, observed, value=TRUE, invert=TRUE) # allow multiple for translated messages rather than relying on '|' to always work } if (length(expected) != length(observed)) { # nocov start diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 88c29c8e3..287d36713 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -138,18 +138,20 @@ if (!test_longdouble) { # If the target condition only appears on certain platforms/R versions, this will return NULL # whenever the code succeeds, or the message matcher wherever it fails. 
Thus we can flexibly # pass e.g. 'warning = NULL | warning = ' concisely as 'test(., warning = get_msg())'. -# Three use cases: +# Use cases: # (1) match message exactly [missing delim] # (2) match message pattern after dropping anything between delimeters [delim, fmt=FALSE] # (3) function factory for matching messages exactly by substituting anything between delimeters [delim, fmt=TRUE] +# (4) function factory for matching messages exactly by substituting a generic string [fmt=string] get_msg = function(e, delim, fmt=FALSE) { condition = tryCatch({e; NULL}, error=identity, warning=identity) if (is.null(condition)) return(condition) msg = condition$message + if (is.character(fmt)) return(function(str) gsub(fmt, str, msg)) if (missing(delim)) return(msg) if (length(delim) == 1L) delim[2L] = delim[1L] msg = gsub( - sprintf("%1$s[^%2$s]+%2$s", delim[1L], delim[2L]), + sprintf("%s[^%s]+%s", delim[1L], gsub("^\\[|\\]$", "", delim[2L]), delim[2L]), # NB; also allow delim to be a char class like ['"] sprintf("%s%s%s", delim[1L], if (fmt) "%s" else ".+", delim[2L]), msg ) @@ -159,9 +161,16 @@ get_msg = function(e, delim, fmt=FALSE) { base_messages = list( missing_object = get_msg(`__dt_test_missing_` + 1, "'", fmt=TRUE), missing_function = get_msg(`__dt_test_missing_`(), '"', fmt=TRUE), + missing_coerce_method = get_msg(delim = '"', { + old = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly + on.exit(options(old)) + as(TRUE, 'foo') + }), + missing_dispatch_method = get_msg(conditionMessage(structure(1, class="foo")), '[\'"]'), invalid_arg_unary_operator = get_msg(-'a'), invalid_arg_binary_operator = get_msg(1 + 'a'), invalid_arg_sum = get_msg(sum('a'), c("\\(", "\\)"), fmt=TRUE), + unused_arg = get_msg(data.frame()[y = 2], c("\\(", "\\)"), fmt=TRUE), arg_length_mismatch = get_msg(base::order(1, 1:2)), empty_max = get_msg(max(numeric())), empty_min = get_msg(min(numeric())), @@ -171,7 +180,11 @@ base_messages = list( # gives both error 
& warning but tryCatch returns the warning first, so suppress cant_open_file = get_msg(suppressWarnings({con<-file(tempfile()); open(con, 'r')})), mixed_subscripts = get_msg(letters[-1:1]), - maybe_invalid_old_posixct = get_msg(as.POSIXct("1893-12-28 05:15:36", tz = "")) + maybe_invalid_old_posixct = get_msg(as.POSIXct("1893-12-28 05:15:36", tz = "")), + stopifnot = get_msg(stopifnot(FALSE), fmt="FALSE"), + not_yet_used = get_msg(.NotYetUsed("abc"), "'", fmt=TRUE), # NB: need fmt= because the English message has '(yet)' --> parens in regex + ambiguous_date_fmt = get_msg(as.Date('xxx')), + NULL ) ########################## @@ -495,7 +508,7 @@ xx = capture.output(ans <- DT[,{print(x);sum(y)},by=x,verbose=FALSE]) test(145, xx, c("[1] \"a\"","[1] \"b\"")) test(146, ans, data.table(x=c("a","b"),V1=c(3L,12L))) -test(147, DT[,MySum=sum(v)], error="unused argument") # user meant DT[,list(MySum=sum(v))]. FR#204 done. +test(147, DT[,MySum=sum(v)], error=base_messages$unused_arg("MySum = sum(v)")) # user meant DT[,list(MySum=sum(v))]. FR#204 done. dt = data.table(a=c(1L,4L,5L), b=1:3, key="a") test(148, dt[CJ(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L),key="a")) @@ -3440,10 +3453,11 @@ test(1078.3, a$x, rep(1:5,each=10)) # a$x would segfault before the fix to rbi # data.table() shouldn't retain column names, root cause of #103 x = 1:5 names(x) = letters[1:5] -test(1079.1, DF<-data.frame(x=x, y=1:10), data.frame(x=rep(1:5,2),y=1:10), warning="row names.*discarded") -test(1079.2, lapply(DF, names), list(x=NULL, y=NULL)) -test(1079.3, DT<-data.table(x=x, y=1:10), data.table(x=rep(1:5,2),y=1:10)) -test(1079.4, lapply(DT, names), list(x=NULL, y=NULL)) +DF = suppressWarnings(data.frame(x=x, y=1:10)) +DT = data.table(x=x, y=1:10) +# NB simplified from earlier test -- we just want to be sure the contents of DF & DT are the same, esp. w.r.t. named vectors as columns. 
+test(1079, list(DT$x, DT$y), list(DF$x, DF$y)) + # test from similar #102 for completeness z = c(a=1,b=2,c=3) a = data.table(z,x=1:3) @@ -3737,10 +3751,10 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, } test(1102.39, dcast(DT, . ~ chr, mymin, value.var="int"), data.table(.=".",a=1L,b=2L,key=".")) # fill not used in output, so default fill not computed. ans <- data.table(int=1:3, a=c(1L,NA,NA), b=c(NA,2L,3L), key="int") - test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c("no non-missing arguments to min; returning Inf", "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. + test(1102.40, dcast(DT, int ~ chr, min, value.var="int"), ans, warning=c(base_messages$empty_min, "inf (type 'double') at RHS position 1 out-of-range(NA) or truncated (precision lost) when assigning to type 'integer' (target vector)")) # warning emitted when coercing default fill since as.integer(min(integer()) is Inf) is NA. test(1102.41, dcast(DT, int ~ chr, mymin, value.var="int", fill=NA), ans) # because fill=NA is provided by user, no need to call mymin(integer()). - test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning="no non-missing arguments to min; returning Inf") # only one warning, because no coercion. - test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", "NAs introduced by coercion")) + test(1102.42, dcast(DT, int ~ chr, min, value.var="dbl"), data.table(int=1:3, a=c(4,Inf,Inf), b=c(Inf,5,6), key="int"), warning=base_messages$empty_min) # only one warning, because no coercion. 
+ test(1102.43, dcast(DT, int ~ chr, min, value.var="dbl", fill="coerced to NA"), data.table(int=1:3, a=c(4,NA,NA), b=c(NA,5,6), key="int"), warning=c("Coercing 'character' RHS to 'double' to match the type of target vector.", base_messages$coerce_na)) test(1102.44, dcast(DT, int ~ ., value.var="dbl", fill="ignored"), data.table(int=1:3, .=c(4,5,6), key="int")) } @@ -6513,7 +6527,7 @@ for(t in seq_len(nrow(all))){ options(datatable.optimize = Inf) # fread dec=',' e.g. France -test(1439, fread("A;B\n1;2,34\n", dec="12"), error="nchar(dec) == 1L is not TRUE") +test(1439, fread("A;B\n1;2,34\n", dec="12"), error=base_messages$stopifnot("nchar(dec) == 1L")) test(1440, fread("A;B\n8;2,34\n", dec="1"), data.table(A=8L, B="2,34")) test(1441, fread("A;B\n8;2,34\n", dec=","), data.table(A=8L, B=2.34)) test(1442, fread("A;B\n1;2,34\n", sep=".", dec="."), error="sep == dec ('.') is not allowed") @@ -7450,7 +7464,7 @@ test(1533.2, setkeyv(dt1, "x", verbose=TRUE), setkey(dt2, x), output = "forder t # remaining test for covering duplicated.data.table dt = data.table(x=1:5, y=6:10) -test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables != FALSE'") +test(1536, duplicated(dt, incomparables=TRUE), error = base_messages$not_yet_used('incomparables != FALSE')) # test for covering melt 100% test(1537 , names(melt(dt, id.vars=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names") @@ -8727,7 +8741,7 @@ test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced") # fix for #4042 -test(1613.59, all.equal.data.table(1L, 2L), error = "is.data.table(target) is not TRUE") +test(1613.59, all.equal.data.table(1L, 2L), error = base_messages$stopifnot("is.data.table(target)")) 
test(1613.601, all.equal(data.table(a=1), data.frame(a=1)), "target is data.table, current is data.frame") test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FALSE)) test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE)) @@ -9899,7 +9913,7 @@ test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5') test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5') # wrong argument types -test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE") +test(1658.28, fwrite(ok_dt, 1), error=base_messages$stopifnot("is.character(file) && length(file) == 1L && !is.na(file)")) test(1658.29, fwrite(ok_dt, quote=123), error="identical\\(quote.*auto.*FALSE.*TRUE") test(1658.30, fwrite(ok_dt, sep="..."), error="nchar(sep)") test(1658.31, fwrite(ok_dt, qmethod=c("double", "double")), error="length(qmethod)") @@ -10178,7 +10192,7 @@ test(1675.2, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:2,NA,1:2,NA)) # fwrite na arg segfault fix, #1725 dt = data.table(x=1:2, y=c(NA,"a")) f = tempfile() -test(1676.1, fwrite(dt, f, na=NULL), error="is not TRUE") +test(1676.1, fwrite(dt, f, na=NULL), error=base_messages$stopifnot("length(na) == 1L")) fwrite(dt, f, na=NA) test(1676.2, fread(f), data.table(x=1:2, y=c(NA, "a"))) unlink(f) @@ -10784,7 +10798,7 @@ test(1732.7, fwrite(DT, quote='auto'), output='A,B\n,5\nNA,7\n"",0\nmonty,') test(1732.8, fwrite(DT, quote='auto', na="NA"), output='"A","B"\nNA,5\n"NA",7\n"",0\n"monty",NA') # dec="," -test(1733.1, fwrite(data.table(pi),dec=","), error="dec != sep is not TRUE") +test(1733.1, fwrite(data.table(pi),dec=","), error=base_messages$stopifnot("dec != sep")) test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"), output="V1;V2\n1,2;1\n-8;2\n3,14159265358979;3\n67,99;4") @@ -10976,10 +10990,12 @@ test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.tab ## 
Attempts to impose incompatible colClasses is a warning (not an error) ## and does not change the value of the columns -test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning="standard unambiguous format") +test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning=base_messages$ambiguous_date_fmt) ## Just invalid -test(1743.14, sapply(fread("a,b\n2017-01-01,1", colClasses=c("foo", "integer")), class), c(a="character", b="integer"), warning="[nN]o method .*for .*foo") +test(1743.14, options = c(useFancyQuotes = FALSE), + sapply(fread("a,b\n2017-01-01,1", colClasses=c("foo", "integer")), class), c(a="character", b="integer"), + warning=base_messages$missing_coerce_method) test(1743.15, sapply(fread("a,b\n2017-01-01,1", colClasses=c("foo", "integer")), class), c(a="character", b="integer"), warning="the column has been left as type .*character") test(1743.16, sapply(fread("a,b\n2017-01-01,2", colClasses=list(foo=1)), class), c(a="character", b="integer"), warning="the column has been left as type .*character") @@ -12071,8 +12087,8 @@ txt = 'a,b\n ab,cd,ce\n abcdef\n hjkli \n' # now auto detected as ncol 1 anyway test(1840.1, fread(txt), data.table("a,b" = c("ab,cd,ce","abcdef","hjkli"))) write('a,b\n ab,cd,ce\nabc,def \n hj,kli ', f<-tempfile()) # write to file to generate \r\n line ending on Windows, test 1840.6 below test(1840.2, fread(f), data.table("ab"=c("abc","hj"), "cd"=c("def","kli"), "ce"=NA), warning="Detected 3 column names but the data has 2.*Filling.*automatically") -test(1840.3, fread(f, sep=NA), error="!is.na(sep) is not TRUE") -test(1840.4, fread(f, sep=NA_character_), error="!is.na(sep) is not TRUE") +test(1840.3, fread(f, sep=NA), error=base_messages$stopifnot("!is.na(sep)")) +test(1840.4, fread(f, sep=NA_character_), error=base_messages$stopifnot("!is.na(sep)")) test(1840.5, fread(f, sep=""), 
ans<-data.table("a,b"=c("ab,cd,ce","abc,def","hj,kli"))) test(1840.6, fread(f, sep="\n"), ans) test(1840.7, fread(f, sep=NULL), ans) @@ -13287,7 +13303,7 @@ DT <- data.table(a=1, b=2, d=3) old <- c("a", "b", "c", "d") new <- c("A", "B", "C", "D") test(1955.1, setnames(DT, old, new, skip_absent=TRUE), data.table(A=1, B=2, D=3)) -test(1955.2, setnames(DT, old, new, skip_absent=0), error="is not") # must be TRUE or FALSE +test(1955.2, setnames(DT, old, new, skip_absent=0), error=base_messages$stopifnot("isTRUEorFALSE(skip_absent)")) test(1955.3, setnames(DT, "missing", "dummy", skip_absent=TRUE), DT) # all missing test(1955.4, setnames(DT, c("D","missing","A"), c("dd","ignored","aa"), skip_absent=TRUE), data.table(aa=1, B=2, dd=3)) # different order with a missing test(1955.5, setnames(DT, "B", "bb", skip_absent=TRUE), data.table(aa=1, bb=2, dd=3)) # none missing so skip_absent not needed @@ -13416,8 +13432,7 @@ test(1962.005, duplicated(DT, by = 'y'), test(1962.0061, duplicated(data.table(NULL)), logical(0L)) test(1962.0062, duplicated(data.table(a = 1L), by = character()), FALSE) -test(1962.007, unique(DT, incomparables = TRUE), - error = 'not used (yet)') +test(1962.007, unique(DT, incomparables=TRUE), error=base_messages$not_yet_used('incomparables != FALSE')) test(1962.008, unique(DT, fromLast = TRUE), data.table(x = c(1, 2, 3), key = 'x')) @@ -13532,8 +13547,8 @@ test(1962.0482, forder(L), 3:1) test(1962.0483, forder(), NULL) setDT(DT) test(1962.049, forder(DT[ , 0L]), error = 'Attempting to order a 0-column') -test(1962.050, forder(DT, decreasing = NA), error = 'isTRUEorFALSE(decreasing) is not TRUE') -test(1962.051, forder(DT, decreasing = 1.4), error = 'isTRUEorFALSE(decreasing) is not TRUE') +test(1962.050, forder(DT, decreasing = NA), error = base_messages$stopifnot('isTRUEorFALSE(decreasing)')) +test(1962.051, forder(DT, decreasing = 1.4), error = base_messages$stopifnot('isTRUEorFALSE(decreasing)')) test(1962.052, forder(DT, NULL), 3:1) test(1962.053, 
forder(DT), 3:1) test(1962.054, forder(DT, ), 3:1) @@ -13583,11 +13598,11 @@ DT = data.table( 2.5, 3, 3, 3, 3, 3, 3, 2, 2.5, 3, 3) ) setDF(DT) -test(1962.068, rollup(DT), error = 'no applicable method') +test(1962.068, rollup(DT), error=base_messages$missing_dispatch_method) test(1962.069, rollup.data.table(DT), error = 'must be a data.table object') -test(1962.070, cube(DT), error = 'no applicable method') +test(1962.070, cube(DT), error=base_messages$missing_dispatch_method) test(1962.071, cube.data.table(DT), error = 'must be a data.table object') -test(1962.072, groupingsets(DT), error = 'no applicable method') +test(1962.072, groupingsets(DT), error=base_messages$missing_dispatch_method) test(1962.073, groupingsets.data.table(DT), error = 'must be a data.table object') setDT(DT) test(1962.074, rollup(DT, by = 3L), error = "'by' must be a character vector") @@ -14056,11 +14071,11 @@ test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 options(datatable.optimize=0L) DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") +test(1981.1, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") options(datatable.optimize=Inf) DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.3, DT[, new_col := shift(val, "lead")], error="is.numeric(n) is not TRUE") +test(1981.3, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -17056,7 +17071,7 @@ if (inherits(x,"try-error")) { # this version of R doesn't have the fix linked to from #4762. 
That fix was made to R-devel in Oct 2020 when R-release was 4.0.3 test(2159.09, min(DT[0L]), error="only.*numeric") } else { - test(2159.10, min(DT[0L]), Inf, warning="missing") + test(2159.10, min(DT[0L]), Inf, warning=base_messages$empty_min) } DT = data.table(x = c("a","b")) test(2159.11, typeof(as.matrix(DT)), "character") @@ -17521,7 +17536,7 @@ test(2198.2, d1[d2, paste0("z", 1:2) := Y, on = "id", env = list(Y = as.list(pas test(2199.1, as.data.table(as.list(1:2))[, .SD,.SDcols=(-1L)], data.table(V2=2L)) test(2199.2, as.data.table(as.list(1:2))[, .SD,.SDcols=(-(1L))], data.table(V2=2L)) test(2199.3, as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)], data.table(V2=2L, V3=3L)) -test(2199.4, data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1], error="not found") +test(2199.4, data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1], error=base_messages$missing_object('V2')) # setDF now drops index attributes, #4889 d = data.table(a=1:100, b=1:100) @@ -17564,7 +17579,7 @@ test(2203.15, tstrsplit(z, "/", type.convert=list(as.factor, as.numeric)), error test(2203.16, tstrsplit(z, "/", type.convert=list(as.integer=2L), keep=5L), error="keep.+contain integer.+between") test(2203.17, tstrsplit(w, "/", type.convert="4"), error="TRUE/FALSE.+function.+named list") test(2203.18, tstrsplit(w, "/", type.convert=c(TRUE, FALSE)), error="TRUE/FALSE.+function.+named list") -test(2203.19, tstrsplit(w, "/", keep=integer()), error="keep.+contain integer.+between", ignore.warning="no non-missing") +test(2203.19, tstrsplit(w, "/", keep=integer()), error="keep.+contain integer.+between", ignore.warning=c(base_messages$empty_min, base_messages$empty_max)) test(2203.20, tstrsplit(w, "/", type.convert=list()), error="not support empty list") # set rownames as key directly in as.data.table, #4468 @@ -17720,7 +17735,7 @@ if (base::getRversion() >= "4.1.0") { test(2212.50, EVAL("df |> DT(df[, .I[which.max(mpg)], by=cyl]$V1)"), ans<-dt[c(4,20,25)]) test(2212.51, EVAL("dt |> DT(dt[, 
.I[which.max(mpg)], by=cyl]$V1)"), ans) test(2212.52, EVAL("D |> DT(D[, .I[which.max(mpg)], by=cyl]$V1)"), ans) - test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error="unused.*argument.*by.*cyl") # R's [.data.frame error on filter[...] + test(2212.53, EVAL("filter |> DT(filter[, .I[which.max(mpg)], by=cyl]$V1)"), error=base_messages$unused_arg("by = cyl")) # R's [.data.frame error on filter[...] test(2212.54, EVAL("filter |> DT((filter |> DT(, .I[which.max(mpg)], by=cyl))$V1)"), as.data.frame(ans)) rm(DT) } @@ -18380,7 +18395,7 @@ test(2250.09, dt[, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2250.10, dt[1:2, paste(names(.SD), 'max', sep = '_') := lapply(.SD, max), by = grp], data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b'), a_max = c(2L, 2L, NA_integer_), b_max = c(6L, 6L, NA_integer_))) -test(2250.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error = 'could not find function ".SD"') +test(2250.11, dt[, names(.SD(2)) := lapply(.SD, .I)], error=base_messages$missing_function('.SD')) dt = data.table(a = 1:3, b = 5:7, grp = c('a', 'a', 'b')) test(2250.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.table(a = 1:3 + 5:7, b = 5:7, grp = c('a', 'a', 'b'))) From 502c59e2af0628feb83ad573a2d04c99f18d735d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 3 Apr 2024 17:53:44 -0700 Subject: [PATCH 033/106] Remove importClassesFrom entry (#6004) --- NAMESPACE | 1 - 1 file changed, 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 20601c9cf..b9872ee7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ useDynLib("data_table", .registration=TRUE) ## For S4-ization importFrom(methods, "S3Part<-", slotNames) -importMethodsFrom(methods, "[") exportClasses(data.table, IDate, ITime) ## From b6d61007d1da8abea3a1555aeaa56b60e9a09aef Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 4 Apr 2024 11:21:11 -0700 Subject: [PATCH 034/106] 
Deprecate key="a,b" and by="a,b" (#6047) * Deprecate key="a,b" and by="a,b" * Fixes in examples+documentation * typo s/string/comma/ --- NEWS.md | 4 ++ R/data.table.R | 6 +-- R/fread.R | 2 +- inst/tests/tests.Rraw | 105 +++++++++++++++++++--------------------- man/IDateTime.Rd | 2 +- man/data.table.Rd | 2 +- man/duplicated.Rd | 4 +- man/fread.Rd | 2 +- man/merge.Rd | 8 +-- man/print.data.table.Rd | 2 +- man/setDT.Rd | 2 +- 11 files changed, 68 insertions(+), 71 deletions(-) diff --git a/NEWS.md b/NEWS.md index 902f2fecc..e62a60104 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) +## BREAKING CHANGES + +1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error. + ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. 
diff --git a/R/data.table.R b/R/data.table.R index 24eff62d5..d55132071 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -62,8 +62,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str if (!is.null(key)) { if (!is.character(key)) stopf("key argument of data.table() must be character") if (length(key)==1L) { - key = strsplit(key,split=",")[[1L]] - # eg key="A,B"; a syntax only useful in key argument to data.table(), really. + if (grepl(",", key, fixed=TRUE)) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") } setkeyv(ans,key) } else { @@ -806,8 +805,7 @@ replace_dot_alias = function(e) { if (mode(bysub) == "character") { if (any(grepl(",", bysub, fixed = TRUE))) { - if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. 
See ?data.table for other possibilities.", length(bysub)) - bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]] + stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "by=") } bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138 nzidx = nzchar(bysub) diff --git a/R/fread.R b/R/fread.R index b4086d155..b2e55403d 100644 --- a/R/fread.R +++ b/R/fread.R @@ -340,7 +340,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!is.character(key)) stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (length(key) == 1L) { - key = strsplit(key, split = ",", fixed = TRUE)[[1L]] + if (grepl(",", key, fixed = TRUE)) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") } setkeyv(ans, key) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 287d36713..0b740f605 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -334,7 +334,7 @@ test(69.4, names(tables(silent=TRUE, mb=FALSE, index=TRUE)), xenv = new.env() # to control testing tables() xenv$DT = data.table(a = 1) test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a [NULL]\nTotal: 0MB") -xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key="A,D,F,G") +xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key=c("A", "D", "F", "G")) test(69.6, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 2 8 0 A,B,C,D,E,F,... 
A,D,F,G.*Total: 0MB") rm(xenv) test(69.7, tables(order.col='asdf'), error="not a column name of info") @@ -369,7 +369,7 @@ test(82, TESTDT[,c("a","b")], data.table(a=TESTDT[[1]], b=TESTDT[[2]], key=c("a" test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) -test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) +test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = c('a', 'b'))) # test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated ## this is very old DT() functionality, completely different than DT() discussed in 2023 test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default @@ -396,8 +396,8 @@ test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c( test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b")) # line above is the way to group, sort by group and setkey on the result by group. 
-dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") -test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B")) +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c("A", "B")) +test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key=c("A", "B"))) # test [<- for column assignment dt1 <- dt2 <- dt @@ -419,7 +419,7 @@ test(106, all(dt + dt > 1)) test(107, dt + dt, dt * 2L) # test a few other generics: -test(108, dt, data.table(t(t(dt)),key="A,B")) +test(108, dt, data.table(t(t(dt)),key=c('A', 'B'))) test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key @@ -649,7 +649,7 @@ test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer())) # Tests of bug 1015 highlight by Harish # See thread "'by without by' now heeds nomatch=NA" # Tests 185-201 were added in above next to originals -x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b") +x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key=c('a', 'b')) y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A")) test(202, x[y], x[y,mult="all"]) test(203, x[y,d], c(1,2,NA,NA)) @@ -775,7 +775,7 @@ test(243, X[Y][,sum(foo*bar)], 195L) # test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L))) # joining to less than all X's key colums (in examples but can't see formal test) -X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b") +X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key=c('a', 'b')) test(246.1, X["A"], X[1:2]) # checks that X[1:2] retains key, too test(246.2, key(X["A"]), c("a","b")) test(247, X["C"]$v, NA_integer_) @@ -959,7 +959,7 @@ test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy # new feature added 1.6.3, that key can be vector. 
-test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b"))) +test(296,data.table(a=1:3,b=4:6,key=c('a', 'b')),data.table(a=1:3,b=4:6,key=c("a","b"))) # test .SDcols (not speed, just operation) DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp") @@ -986,7 +986,7 @@ test(299.11, DT[1,c:=42L], data.table(a=1:3, c=TRUE), warning="42.*integer.*at R test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=c(TRUE,FALSE,FALSE))) # Test bug fix #1468, combining i and by. -DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") +DT = data.table(a=1:3,b=1:9,v=1:9,key=c('a', 'b')) test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) @@ -1455,7 +1455,7 @@ unlink(f) # Test CJ problems with v1.7.4, #1689 test(463, all(sapply(CJ(1:2,1:3),length)==6L)) -DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y") +DT = data.table(x=1:4,y=1:2,cnt=1L,key=c('x', 'y')) test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA)) test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4)) f1 = factor(c("READING","MATHEMATICS")) @@ -1534,7 +1534,7 @@ test(483.2, DT, data.table(x=1:4)) # i.e. 
DT as it was before, without foo bein test(484, DT[,c("foo","bar"):=list(20L,numeric())], data.table(x=1:4, foo=20L, bar=NA_real_)) # Test i's key longer than x's -d1 <- data.table(a=1:2, b=11:14, key="a,b") +d1 <- data.table(a=1:2, b=11:14, key=c('a', 'b')) d2 <- data.table(A=0:1, B=1:4, key="A") test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A")) test(486, d2[d1,sum(B),by=.EACHI], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A")) # no allow.cartesian needed due to by-without-by @@ -1601,7 +1601,7 @@ dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17) dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21) test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE), data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8), - A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k")) + A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key=c('j', 'k'))) test(503, dtA$i, 1:8) # check that merge didn't change the order of dtA by reference test(504, dtB$k, 1:4) # or dtB @@ -1686,8 +1686,7 @@ test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) -test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) -test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") +# tests 543,544 were of deprecated behavior to allow comma-separated entries to keyby # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) @@ -1754,20 +1753,16 @@ test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be gro test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't 
be grouped because") -# Test spaces in by="..." format, datatable-help on 31 March -DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) -test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) -test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) -test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) +# tests 571-573 were of deprecated behavior to allow comma-separated entries in by= # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) test(574, dim(unname(DT)), 3:2) # Test that CJ retains explicit names (useful if used independently) -test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y")) -test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y")) -test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2")) +test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('x', 'y'))) +test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('V1', 'y'))) +test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key=c('x', 'V2'))) # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") @@ -2294,7 +2289,7 @@ RHS = as.integer(DT$a) test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded -DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") +DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key=c('a', 'b')) test(755, DT[c("b","c"),sum(x),by=.EACHI], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) 
test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # 'by' less than number of join columns @@ -2555,15 +2550,15 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) options(datatable.optimize = 0L) -test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") +test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 1L) -test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") +test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 2L) -test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], +test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized options(datatable.optimize = Inf) test(866, names(ans1), c("a","b","name1","name2")) -test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here +test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") @@ -2755,9 +2750,9 @@ DT = data.table(a=1:3,b=1:6) test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 -X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') -Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), 
state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts') -test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key='state,uid,ts')) +X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key=c('state', 'uid', 'ts')) +Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key=c('state', 'uid', 'ts')) +test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key=c('state', 'uid', 'ts'))) test(917.2, X[Y, on=c("id","state"), roll=TRUE], error="Attempting roll join on factor column when joining x.state to i.state") # NA in join column of type double, #2453. @@ -2802,7 +2797,7 @@ DT[,num:=1:.N] # to group each row by itself test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3))) # New roll=+/- and rollends -DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b") +DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key=c('a', 'b')) test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v], INT(1,1,6,7)) test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v], INT(1,2,7,7)) test(934, DT[J(c(0,2,6,8)), roll=+Inf, v], INT(NA,1,6,7)) @@ -2933,7 +2928,7 @@ test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))) ## Allow unique/duplicated to accept custom colum combination to query for ## uniqueness -dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c('A', 'B')) df <- as.data.frame(dt) test(986, 
unique(dt, by=key(dt)), dt[!duplicated(df[, key(dt)]),]) test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])]) @@ -3577,11 +3572,11 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) test(1102.02, dcast(DT, diet ~ variable, fun.aggregate=sum), data.table(diet=factor(1:4), weight=c(22582, 14714, 17154, 15961), key="diet")) test(1102.03, dcast(DT, diet+chick ~ time, drop=FALSE)[c(1,.N),c(1:4,13:14)], - ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key="diet,chick")) + ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key=c('diet', 'chick'))) test(1102.04, dcast(DT, diet+chick ~ time, drop=FALSE, fill=0)[c(1,.N),c(1:4,13:14)], ans[1, c("20","21"):=0]) # add test for 'subset=' in dcast test(1102.05, dcast(DT, time + chick ~ variable+diet, fun.aggregate=sum, subset=.(time> 20))[c(1,2,44,.N)], - data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key="time,chick")) + data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key=c('time', 'chick'))) # testing without aggregation set.seed(3) @@ -3633,7 +3628,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, v=factor(NA, levels=tail(letters,5)), x=factor(NA, levels=tail(letters,5)), y=factor(c(NA,"y",NA), levels=tail(letters,5)), - z=factor(NA, levels=tail(letters,5)), key="a1,a2,a3")) + z=factor(NA, levels=tail(letters,5)), key=c("a1", "a2", "a3"))) # dcast bug fix for 'subset' argument (it doesn't get key set before to run C-fcast): DT = 
data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA)) @@ -3704,17 +3699,17 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L) test(1102.31, dcast(DT, x + y ~ z, fun.aggregate=sum, value.var=c("d1","d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,4), .SDcols=c("d1_a","d1_b")][], - data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key=c('x', 'y'))) # multiple fun.agg test(1102.32, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var="d1")[c(1,.N)][, 3:6:=lapply(.SD,round,3), .SDcols=3:6][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key=c('x', 'y'))) # multiple fun.agg and value.var (all combinations) test(1102.33, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=c("d1", "d2"))[c(1,.N)][, c(3,4,7:10):=lapply(.SD,round,3), .SDcols=c(3,4,7:10)][], data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_sum_a=INT(0,1),d2_sum_b=INT(1,3), - d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y")) + d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) # multiple fun.agg and value.var (one-to-one) test(1102.34, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,3), .SDcols=3:4][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), 
key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) # Additional test after fixing fun.agg creation - using the example here: https://github.com/Rdatatable/data.table/issues/716 DT = data.table(x=1:5, y=paste("v", 1:5, sep=""), v1=6:10, v2=11:15, k1=letters[1:5], k2=letters[6:10]) @@ -4427,7 +4422,7 @@ test(1220, set(DT,j=2:3,value=newVals), data.table(a=1:3,b=16:18,c=19:21)) # Test non-join key columns used in j work again (spotted straight away by Michele on datatable-help when v1.9.2 was released). # Introduced at commit 1030. Very extensive new tests 1136* still all pass (great stuff Arun). -DT = data.table(a=1:2,b=letters[1:6],key="a,b") +DT = data.table(a=1:2,b=letters[1:6],key=c('a', 'b')) test(1221, DT[.(1),b], c("a","c","e")) ########################################################################################### @@ -5235,7 +5230,7 @@ test(1305.13, setDF(dt, rownames=rep("a",5)), error='rownames contains duplicate # .SD retains as much of head(key) as appropriate. 
# by= always keeps data appearance order, so it's which columns are grouped and selected that drive how much of key is retained -DT = data.table(a=1:3,b=1:6,c=1:6,key="a,b") +DT = data.table(a=1:3,b=1:6,c=1:6,key=c('a', 'b')) test(1306, DT[1:2,key(.SD)], c("a","b")) test(1307, DT[2:1,key(.SD)], NULL) test(1308, DT[,key(.SD),by=a], data.table(a=integer())) @@ -5304,9 +5299,9 @@ test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c" # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = -c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y") -dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y") -test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key="x,y")) +c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = c('x', 'y')) +dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = c('x', 'y')) +test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key=c('x', 'y'))) # also test where 'i' is not sorted. 
set.seed(1L) @@ -7364,10 +7359,10 @@ x = c(1, 2, 1) y = c(5, 8, 8, 4) w = c(10, 12, 12, 13) # already sorted but has dups; more efficient case to cover # tests 1525.1, 1525.2 tested the now-ineffectual datatable.CJ.names option. -ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z") +ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key=c('V1', 'z')) test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) test(1525.4, CJ(x, z=y, unique=TRUE), setnames(copy(ans),c("x","z"))) -test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key="x,w")) +test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key=c('x', 'w'))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) @@ -7687,7 +7682,7 @@ setkey(x1, a1, a2) test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y))) test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2")) # also test shallow here so as to catch future regressions -x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2") +x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key=c('a1', 'a2')) test(1545.01, key(.shallow(x1, cols="a2")), NULL) test(1545.02, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.03, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) @@ -9096,7 +9091,7 @@ test(1630.09, copy(dt1)[id>5, z:=2L, nomatch=0L], copy(dt1)[ test(1630.10, copy(dt1)[id>5, z:=2L, nomatch=NA], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch") # fix for #1268, on= retains keys correctly. 
-A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key="site,date") +A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key=c('site', 'date')) B = data.table(x=c(10,20), y=c(100,200), key="x") test(1631, key(A[B, on="x"]), NULL) @@ -13158,8 +13153,8 @@ setindex(DT, NULL) test(1942.06, indices(DT), NULL) setindex(DT,id1,id2) test(1942.07, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key="id1,id2"), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key="id2,id1"), output="Finding groups using forderv") +test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key=c('id1', 'id2')), output="Finding groups using uniqlist on index 'id1__id2'") +test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key=c('id2', 'id1')), output="Finding groups using forderv") options(datatable.use.index=FALSE) test(1942.10, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using forderv") options(datatable.use.index=TRUE) @@ -13169,7 +13164,7 @@ set.seed(2) DT = data.table(real=sample((1:1500)/1000, 10000, replace=TRUE), id=sample(letters, 1000, replace=TRUE), value=1:10000) setkey(DT,id,real) test(1942.11, DT[, .(list(value)), keyby=.(id,real), verbose=TRUE][c(1,6,8744,.N)], - data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key="id,real"), + 
data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key=c('id', 'real')), output="Finding groups using uniqlist on key") setindex(DT,real) test(1942.12, DT[, sum(value), keyby=real, verbose=TRUE][c(1,500,1498,.N)], data.table(real=c(0.001, 0.501, 1.499, 1.5), V1=INT(31036,37564,14792,38606), key="real"), @@ -13191,8 +13186,8 @@ DT2 <- data.table( test(1943.1, (ans<-DT1[DT2])[,1:4], DT1) # ok before test(1943.2, DT1[DT2, on=c("id","date","period")], ans) # ok before test(1943.3, DT1[DT2, on=c("id","date","period","year")], ans[,1:4]) # no warning (longer object length is not a multiple) -DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key="id,date") -DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key="id,date") +DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key=c('id', 'date')) +DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key=c('id', 'date')) test(1943.4, DT1[DT2, on=c("id",date="date2")], data.table(id="A", date=3:1, val=9:7, i.date=1:3)) # was invalidly keyed by id,date in 1.11.6 @@ -15679,7 +15674,7 @@ test(2069.28, data.table(c='1', d=2)[ , c(a='b'), by=c, verbose=TRUE], output='j test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector') DT <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], z = 0:3 + (4:1)*1i) test(2069.30, DT[, .SD[3,], by=b], DT[9:12, .(b, a, z)]) -DT = data.table(x=1:4,y=1:2,lgl=TRUE,key="x,y") +DT = data.table(x=1:4,y=1:2,lgl=TRUE,key=c('x', 'y')) test(2069.31, DT[CJ(1:4,1:4), any(lgl), by=.EACHI]$V1, c(TRUE, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA)) set.seed(45L) @@ -15863,7 +15858,7 @@ test(2074.23, capture.output(print(DT2, topn=1L, col.names='none')), c(" 1: 1", " --- ", "101: 101")) # foverlaps -x = data.table(start=NA_integer_, end=1L, key='start,end') +x = data.table(start=NA_integer_, end=1L, key=c('start', 'end')) y = 
copy(x) test(2074.24, foverlaps(x, y), error="NA values in data.table x 'start' column") x[ , start := 0L] diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 928e732bc..be7620890 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -241,7 +241,7 @@ identical(as.ITime("10:45"), methods::as("10:45", "ITime")) as.POSIXct("2001-01-01") + as.ITime("10:45") datetime <- seq(as.POSIXct("2001-01-01"), as.POSIXct("2001-01-03"), by = "5 hour") -(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = "a,idate,itime")) +(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = c("a", "idate", "itime"))) af[, mean(a), by = "itime"] af[, mean(a), by = list(hour = hour(itime))] diff --git a/man/data.table.Rd b/man/data.table.Rd index 2e326fed0..557139e2f 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -44,7 +44,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}.} - \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}.} + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}.} \item{stringsAsFactors}{Logical (default is \code{FALSE}). Convert all \code{character} columns to \code{factor}s?} diff --git a/man/duplicated.Rd b/man/duplicated.Rd index daf7c39d5..e17d8df0c 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -88,7 +88,7 @@ If none exists, 0L is returned. 
} \examples{ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = "A,B") + C = rep(1:2, 6), key = c("A", "B")) duplicated(DT) unique(DT) @@ -113,7 +113,7 @@ identical(unique(DT),DT[10]) # FALSE # fromLast=TRUE DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = "A,B") + C = rep(1:2, 6), key = c("A", "B")) duplicated(DT, by="B", fromLast=TRUE) unique(DT, by="B", fromLast=TRUE) diff --git a/man/fread.Rd b/man/fread.Rd index b431969dc..49b187364 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -55,7 +55,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} - \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } + \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. 
Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. The default for this argument can be changed with \code{options(datatable.fread.datatable=FALSE)}.} diff --git a/man/merge.Rd b/man/merge.Rd index d8246668c..d374da076 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -87,16 +87,16 @@ merge(dt1, dt2, all = TRUE) (dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A")) merge(dt1, dt2, allow.cartesian=TRUE) -(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = "A,B")) -(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = "A,B")) +(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = c("A", "B"))) +(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = c("A", "B"))) merge(dt1, dt2) merge(dt1, dt2, by="B", allow.cartesian=TRUE) # test it more: -d1 <- data.table(a=rep(1:2,each=3), b=1:6, key="a,b") +d1 <- data.table(a=rep(1:2,each=3), b=1:6, key=c("a", "b")) d2 <- data.table(a=0:1, bb=10:11, key="a") d3 <- data.table(a=0:1, key="a") -d4 <- data.table(a=0:1, b=0:1, key="a,b") +d4 <- data.table(a=0:1, b=0:1, key=c("a", "b")) merge(d1, d2) merge(d2, d1) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index bda7a9b78..a39c8c446 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -83,7 +83,7 @@ print(DT, row.names = FALSE) #`print.keys` can alert which columns are currently keys - DT <- data.table(a=1:3, b=4:6, c=7:9, key="b,a") + DT <- data.table(a=1:3, b=4:6, c=7:9, key=c("b", "a")) setindexv(DT, c("a", "b")) setindexv(DT, "a") 
print(DT, print.keys=TRUE) diff --git a/man/setDT.Rd b/man/setDT.Rd index c00ba0f46..9311d0e3b 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -13,7 +13,7 @@ setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) \arguments{ \item{x}{ A named or unnamed \code{list}, \code{data.frame} or \code{data.table}. } \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } - \item{key}{Character vector of one or more column names which is passed to \code{\link{setkeyv}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. } + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}. } } From 420e60ba1f34ddde4f1b99208bef2585ba29c599 Mon Sep 17 00:00:00 2001 From: Joshua Wu <124658199+joshhwuu@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:17:11 -0700 Subject: [PATCH 035/106] Changed char.trunc to better handle combining and full-width multibyte characters (#6048) * changed char.trunc to better handle full-width and combining characters * Tests added, make pretty later * Added comment to char.trunc for future issues/suggestions * Refactored tests for readability, added multiple rows/columns tests * Updated NEWS.md * Added myself as contributor in DESCRIPTION * changed test style to have options in front --------- Co-authored-by: Michael Chirico --- DESCRIPTION | 3 ++- NEWS.md | 2 ++ R/print.data.table.R | 9 +++++++-- inst/tests/tests.Rraw | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6b29cb848..9e00eb8f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,5 +83,6 @@ Authors@R: c( person("Dereck","de Mezquita", role="ctb"), 
person("Michael","Czekanski", role="ctb"), person("Dmitry", "Shemetov", role="ctb"), - person("Nitish", "Jha", role="ctb") + person("Nitish", "Jha", role="ctb"), + person("Joshua", "Wu", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index e62a60104..b15ebd696 100644 --- a/NEWS.md +++ b/NEWS.md @@ -60,6 +60,8 @@ 8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup. +9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/print.data.table.R b/R/print.data.table.R index 6588ca458..919c8aaed 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -229,11 +229,16 @@ format_list_item.default = function(x, ...) { # FR #1091 for pretty printing of character # TODO: maybe instead of doing "this is...", we could do "this ... test"? +# Current implementation may have issues when dealing with strings that have combinations of full-width and half-width characters, +# if this becomes a problem in the future, we could consider string traversal instead. 
char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) - idx = which(nchar(x) > trunc.char) - x[idx] = paste0(substr(x[idx], 1L, as.integer(trunc.char)), "...") + nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 + nchar_chars = nchar(x, 'char') + is_full_width = nchar_width > nchar_chars + idx = pmin(nchar_width, nchar_chars) > trunc.char + x[idx] = paste0(strtrim(x[idx], trunc.char * fifelse(is_full_width[idx], 2L, 1L)), "...") x } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0b740f605..62a33db50 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18434,3 +18434,43 @@ dt = data.table(a = 1L) test(2252.1, dt[, b:=2L], error = "\\[ was called on a data.table.*not data.table-aware.*':='") test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data.table-aware.*'let'") rm(.datatable.aware) + +# tests for trunc.char handling wide characters # 5096 +accented_a = "\u0061\u0301" +ja_ichi = "\u4E00" +ja_ni = "\u4E8C" +ja_ko = "\u3053" +ja_n = "\u3093" +dots = "..." 
+clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output +# Tests for combining character latin a and acute accent, single row +DT = data.table(strrep(accented_a, 4L)) +test(2253.01, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(accented_a, 4L)) +test(2253.02, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(accented_a, 3L), dots)) +test(2253.03, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(accented_a, 1L), dots)) +# Tests for full-width japanese character ichi, single row +DT = data.table(strrep(ja_ichi, 4L)) +test(2253.04, options=list(datatable.prettyprint.char = 4L), DT, output=strrep(ja_ichi, 4L)) +test(2253.05, options=list(datatable.prettyprint.char = 3L), DT, output=paste0(strrep(ja_ichi, 3L), dots)) +test(2253.06, options=list(datatable.prettyprint.char = 1L), DT, output=paste0(strrep(ja_ichi, 1L), dots)) +# Tests for multiple, different length combining character rows +DT = data.table(strrep(accented_a, 1L:4L)) +test(2253.07, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "áááá")) +test(2253.08, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "áá", "ááá", "ááá...")) +test(2253.09, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("á", "á...", "á...", "á...")) +# Tests for multiple, different length full-width characters +DT = data.table(strrep(ja_ichi, 1L:4L)) +test(2253.10, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一一")) +test(2253.11, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一一", "一一一", "一一一...")) +test(2253.12, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("一", "一...", 
"一...", "一...")) +# Tests for combined characters, multiple columns +DT = data.table(paste0(ja_ichi), strrep(ja_ni, 2L), strrep(ja_ko, 3L), strrep(accented_a, 2L), "aaa") +test(2253.13, options=list(datatable.prettyprint.char = 4L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.14, options=list(datatable.prettyprint.char = 3L), capture.output(print(DT))[-1L], "1: 一 二二 こここ áá aaa") +test(2253.15, options=list(datatable.prettyprint.char = 2L), capture.output(print(DT))[-1L], "1: 一 二二 ここ... áá aa...") +test(2253.16, options=list(datatable.prettyprint.char = 1L), capture.output(print(DT))[-1L], "1: 一 二... こ... á... a...") +# Tests for multiple columns, multiple rows +DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) +test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá")) +test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá")) +test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... 
á...")) From 9e93067c45575820ad4495b03cbce52b350b6051 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 4 Apr 2024 23:03:28 -0700 Subject: [PATCH 036/106] Convenience features for .SDcols patterns perl=TRUE (#5663) * anonymous .SDcols function test fails * user-defined .SDcols patterns funs * patterns(perl=TRUE) test fails * perl arg passed from patterns to grep * fix test * document perl arg * `foo` -> \code{foo} * move news item up * Update man/patterns.Rd Co-authored-by: Michael Chirico * add more grep args * remove user-defined patterns docs * do not mention user-defined patterns * rm user-defined patterns code and tests * rm user-defined patterns * } -> , * collapse to a single item with shared description * avoid ::: * style --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fmelt.R | 4 ++-- inst/tests/tests.Rraw | 2 ++ man/data.table.Rd | 14 +++++++------- man/patterns.Rd | 6 +++++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/NEWS.md b/NEWS.md index b15ebd696..f04988fb7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -42,6 +42,8 @@ 5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix. +3. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 
diff --git a/R/fmelt.R b/R/fmelt.R index 83963bebc..092da48b9 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -19,14 +19,14 @@ melt.default = function(data, ..., na.rm = FALSE, value.name = "value") { # nocov end } -patterns = function(..., cols=character(0L)) { +patterns = function(..., cols=character(0L), ignore.case=FALSE, perl=FALSE, fixed=FALSE, useBytes=FALSE) { # if ... has no names, names(list(...)) will be ""; # this assures they'll be NULL instead L = list(...) p = unlist(L, use.names = any(nzchar(names(L)))) if (!is.character(p)) stopf("Input patterns must be of type character.") - matched = lapply(p, grep, cols) + matched = lapply(p, grep, cols, ignore.case=ignore.case, perl=perl, fixed=fixed, useBytes=useBytes) # replace with lengths when R 3.2.0 dependency arrives if (length(idx <- which(sapply(matched, length) == 0L))) stopf('Pattern(s) not found: [%s]', brackify(p[idx])) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 62a33db50..b0aab587d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16661,6 +16661,8 @@ test(2128.2, names(DT[, .SD, .SDcols=!is.numeric]), 'c') test(2128.3, DT[, .SD, .SDcols=function(x) x==1], error='conditions were not met for: [a, b, c]') test(2128.4, DT[, .SD, .SDcols=function(x) 2L], error='conditions were not met for: [a, b, c]') test(2128.5, DT[, .SD, .SDcols=function(x) NA], error='conditions were not met for: [a, b, c]') +# patterns with PCRE, #5387 +test(2128.6, names(DT[, .SD, .SDcols=patterns('^(?![bc])', perl=TRUE)]), 'a') # lookahead is only supported with perl=TRUE. 
# expression columns in rbindlist, #546 A = data.table(c1 = 1, c2 = 'asd', c3 = expression(as.character(Sys.time()))) diff --git a/man/data.table.Rd b/man/data.table.Rd index 557139e2f..680e25574 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -84,7 +84,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac As long as \code{j} returns a \code{list}, each element of the list becomes a column in the resulting \code{data.table}. When the output of \code{j} is not a \code{list}, the output is returned as-is (e.g. \code{x[ , a]} returns the column vector \code{a}), unless \code{by} is used, in which case it is implicitly wrapped in \code{list} for convenience (e.g. \code{x[ , sum(a), by=b]} will create a column named \code{V1} with value \code{sum(a)} for each group). - The expression `.()` is a \emph{shorthand} alias to \code{list()}; they both mean the same. (An exception is made for the use of \code{.()} within a call to \code{\link{bquote}}, where \code{.()} is left unchanged.) + The expression \code{.()} is a \emph{shorthand} alias to \code{list()}; they both mean the same. (An exception is made for the use of \code{.()} within a call to \code{\link{bquote}}, where \code{.()} is left unchanged.) When \code{j} is a vector of column names or positions to select (as in \code{data.frame}). There is no need to use \code{with=FALSE} anymore. Note that \code{with=FALSE} is still necessary when using a logical vector with length \code{ncol(x)} to include/exclude columns. Note: if a logical vector with length \code{k < ncol(x)} is passed, it will be filled to length \code{ncol(x)} with \code{FALSE}, which is different from \code{data.frame}, where the vector is recycled. 
@@ -110,13 +110,13 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item or of the form \code{startcol:endcol}: e.g., \code{DT[, sum(a), by=x:z]} } - \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in `DT` that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. See \href{https://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. + \emph{Advanced:} When \code{i} is a \code{list} (or \code{data.frame} or \code{data.table}), \code{DT[i, j, by=.EACHI]} evaluates \code{j} for the groups in \code{DT} that each row in \code{i} joins to. That is, you can join (in \code{i}) and aggregate (in \code{j}) simultaneously. We call this \emph{grouping by each i}. See \href{https://stackoverflow.com/a/27004566/559784}{this StackOverflow answer} for a more detailed explanation until we \href{https://github.com/Rdatatable/data.table/issues/944}{roll out vignettes}. \emph{Advanced:} In the \code{X[Y, j]} form of grouping, the \code{j} expression sees variables in \code{X} first, then \code{Y}. We call this \emph{join inherited scope}. If the variable is not in \code{X} or \code{Y} then the calling frame is searched, its calling frame, and so on in the usual way up to and including the global environment.} - \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use `keyby=` routinely when you wish the result to be sorted. 
May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.} + \item{keyby}{ Same as \code{by}, but with an additional \code{setkey()} run on the \code{by} columns of the result, for convenience. It is common practice to use \code{keyby=} routinely when you wish the result to be sorted. May also be \code{TRUE} or \code{FALSE} when \code{by} is provided as an alternative way to accomplish the same operation.} - \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variables names inside dataset and in parent scope you can use double dot prefix \code{..cols} to explicitly refer to `\code{cols} variable parent scope and not from your dataset. + \item{with}{ By default \code{with=TRUE} and \code{j} is evaluated within the frame of \code{x}; column names can be used as variables. In case of overlapping variable names inside the dataset and in the parent scope, you can use the double dot prefix \code{..cols} to explicitly refer to the \code{cols} variable in the parent scope and not the one from your dataset. When \code{j} is a character vector of column names, a numeric vector of column positions to select or of the form \code{startcol:endcol}, and the value returned is always a \code{data.table}. \code{with=FALSE} is not necessary anymore to select columns dynamically. Note that \code{x[, cols]} is equivalent to \code{x[, ..cols]} and to \code{x[, cols, with=FALSE]} and to \code{x[, .SD, .SDcols=cols]}.} @@ -145,18 +145,18 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{which}{\code{TRUE} returns the row numbers of \code{x} that \code{i} matches to. If \code{NA}, returns the row numbers of \code{i} that have no match in \code{x}. 
By default \code{FALSE} and the rows in \code{x} that match are returned.} - \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names, numeric positions, logical, a function name such as `is.numeric`, or a function call such as `patterns()`. `.SDcols` is particularly useful for speed when applying a function through a subset of (possible very many) columns by group; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}. + \item{.SDcols}{ Specifies the columns of \code{x} to be included in the special symbol \code{\link{.SD}} which stands for \code{Subset of data.table}. May be character column names, numeric positions, logical, a function name such as \code{is.numeric}, or a function call such as \code{patterns()}. \code{.SDcols} is particularly useful for speed when applying a function through a subset of (possibly very many) columns by group; e.g., \code{DT[, lapply(.SD, sum), by="x,y", .SDcols=301:350]}. For convenient interactive use, the form \code{startcol:endcol} is also allowed (as in \code{by}), e.g., \code{DT[, lapply(.SD, sum), by=x:y, .SDcols=a:f]}. Inversion (column dropping instead of keeping) can be accomplished be prepending the argument with \code{!} or \code{-} (there's no difference between these), e.g. \code{.SDcols = !c('x', 'y')}. - Finally, you can filter columns to include in \code{.SD} based on their \emph{names} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can filter columns on \code{values} by passing a function, e.g. \code{.SDcols=\link{is.numeric}}. You can also invert a pattern as usual with \code{.SDcols=!patterns(...)} or \code{.SDcols=!is.numeric}. 
+ Finally, you can filter columns to include in \code{.SD} based on their \emph{names} according to regular expressions via \code{.SDcols=patterns(regex1, regex2, ...)}. The included columns will be the \emph{intersection} of the columns identified by each pattern; pattern unions can easily be specified with \code{|} in a regex. You can filter columns on \code{values} by passing a function, e.g. \code{.SDcols=\link{is.numeric}}. You can also invert a pattern as usual with \code{.SDcols=!patterns(...)} or \code{.SDcols=!is.numeric}. } \item{verbose}{ \code{TRUE} turns on status and information messages to the console. Turn this on by default using \code{options(datatable.verbose=TRUE)}. The quantity and types of verbosity may be expanded in future. } - \item{allow.cartesian}{ \code{FALSE} prevents joins that would result in more than \code{nrow(x)+nrow(i)} rows. This is usually caused by duplicate values in \code{i}'s join columns, each of which join to the same group in `x` over and over again: a \emph{misspecified} join. Usually this was not intended and the join needs to be changed. The word 'cartesian' is used loosely in this context. The traditional cartesian join is (deliberately) difficult to achieve in \code{data.table}: where every row in \code{i} joins to every row in \code{x} (a \code{nrow(x)*nrow(i)} row result). 'cartesian' is just meant in a 'large multiplicative' sense, so FALSE does not always prevent a traditional cartesian join. } + \item{allow.cartesian}{ \code{FALSE} prevents joins that would result in more than \code{nrow(x)+nrow(i)} rows. This is usually caused by duplicate values in \code{i}'s join columns, each of which join to the same group in \code{x} over and over again: a \emph{misspecified} join. Usually this was not intended and the join needs to be changed. The word 'cartesian' is used loosely in this context. 
The traditional cartesian join is (deliberately) difficult to achieve in \code{data.table}: where every row in \code{i} joins to every row in \code{x} (a \code{nrow(x)*nrow(i)} row result). 'cartesian' is just meant in a 'large multiplicative' sense, so FALSE does not always prevent a traditional cartesian join. } \item{drop}{ Never used by \code{data.table}. Do not use. It needs to be here because \code{data.table} inherits from \code{data.frame}. See \href{../doc/datatable-faq.html}{\code{vignette("datatable-faq")}}.} diff --git a/man/patterns.Rd b/man/patterns.Rd index 5041975dc..cd3d3fd8b 100644 --- a/man/patterns.Rd +++ b/man/patterns.Rd @@ -12,11 +12,15 @@ and melt them into separate columns. See the \code{Efficient reshaping using data.tables} vignette linked below to learn more. } \usage{ -patterns(\dots, cols=character(0)) +patterns( + \dots, cols=character(0), + ignore.case=FALSE, perl=FALSE, + fixed=FALSE, useBytes=FALSE) } \arguments{ \item{\dots}{A set of regular expression patterns.} \item{cols}{A character vector of names to which each pattern is matched.} + \item{ignore.case, perl, fixed, useBytes}{Passed to \code{\link{grep}}.} } \seealso{ \code{\link{melt}}, From b058f62a634fb6279dcc27161b881bc7d975dbf5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 5 Apr 2024 17:45:12 -0700 Subject: [PATCH 037/106] Set digits= and warn= to their normal defaults before testing (#6052) --- NEWS.md | 2 ++ R/test.data.table.R | 2 ++ 2 files changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index f04988fb7..d67cc3bc2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -64,6 +64,8 @@ 9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. +10. 
`test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/test.data.table.R b/R/test.data.table.R index 4972ad8d3..e2efe29d9 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -98,6 +98,8 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F datatable.print.trunc.cols = FALSE, #4552 datatable.rbindlist.check = NULL, datatable.integer64 = "integer64", + digits = 7L, # ensure printing rounds to the expected number of digits in all sessions, #5285 + warn = 0L, # ensure signals are emitted as they are in the code, #5285 warnPartialMatchArgs = base::getRversion()>="3.6.0", # ensure we don't rely on partial argument matching in internal code, #3664; >=3.6.0 for #3865 warnPartialMatchAttr = TRUE, warnPartialMatchDollar = TRUE, From 7b749b1e1649f94736e7ac672bde6bc2b2ed3bf3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 6 Apr 2024 18:38:14 -0700 Subject: [PATCH 038/106] Move S4 tests to separate script --- inst/tests/S4.Rraw | 79 +++++++++++++++ inst/tests/tests.Rraw | 224 ++++++++++++++++++------------------------ tests/S4.R | 2 + 3 files changed, 176 insertions(+), 129 deletions(-) create mode 100644 inst/tests/S4.Rraw create mode 100644 tests/S4.R diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw new file mode 100644 index 000000000..743875391 --- /dev/null +++ b/inst/tests/S4.Rraw @@ -0,0 +1,79 @@ +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if (length(find.package("data.table", quiet=TRUE))) { + remove.packages("data.table") + stop("This is dev mode but data.table was installed. Uninstalled it. 
Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") + } + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") + rm_all = function() {} + DTfun = DT ## otherwise DT would be re-defined by many tests +} else { + library(data.table) + + shallow = data.table:::shallow + test = data.table:::test +} + +library(methods) + +suppressWarnings({ + setClass("Data.Table", contains="data.table") # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" + setClass("S4Composition", representation(data="data.table")) +}) +# data.table can be a parent class +ids <- sample(letters[1:3], 10, replace=TRUE) +scores <- stats::rnorm(10) +dt <- data.table(id=ids, score=scores) +dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) +test(1.01, isS4(dt.s4)) +test(1.02, inherits(dt.s4, 'data.table')) +# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1.03, isS4(shallow(dt.s4))) +## pull out data from S4 as.list, and compare to list from dt +dt.s4.list <- dt.s4@.Data +names(dt.s4.list) <- names(dt.s4) +test(1.04, dt.s4.list, as.list(dt)) # Underlying data not identical +# simple S4 conversion-isms work +df = data.frame(a=sample(letters, 10), b=1:10) +dt = as.data.table(df) +test(1.05, identical(methods::as(df, 'data.table'), dt)) +test(1.06, identical(methods::as(dt, 'data.frame'), df)) +# data.table can be used in an S4 slot +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +test(1.07, dt.comp@data, dt) +# S4 methods dispatch properly on data.table slots" +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) +setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) +setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) +test(1.08, dtGet(dt.comp), dt) # actually +test(1.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1.10, identical(dtGet(dt.comp, 'b'), dt$b)) +removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev +removeClass("S4Composition") +# END port of old testthat tests + +# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 [S4 portion, c.f. 1872.* in tests.Rraw] +## data.table cannot recycle complicated types +short_s4_col = getClass("MethodDefinition") +test(2, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") + +# print dims in list-columns, #3671, c.f. 
2130.* in tests.Rraw +s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) +DT = data.table( + x = 1:2, + y = list(s4class(x=1L, y=c("yes", "no"), z=2.5), + s4class(x=2L, y="yes", z=1))) +test(3, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) + +# S4 object not suported in fifelse and fcase, #4135 +class4 = setClass("class4", slots=list(x="numeric")) +s1 = class4(x=20191231) +s2 = class4(x=20191230) +test(4.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(4.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(4.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") +test(4.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") +rm(s1, s2, class4) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0aab587d..8f0fc886a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1,10 +1,33 @@ -require(methods) +library(datasets) # for airquality, BOD, cars, ChickWeight, CO2, iris, mtcars + +lm=stats::lm +median=stats::median +na.omit=stats::na.omit +rnorm=stats::rnorm +runif=stats::runif +sd=stats::sd +setNames=stats::setNames +var=stats::var +weighted.mean=stats::weighted.mean + +capture.output=utils::capture.output +combn=utils::combn +head=utils::head +read.csv=utils::read.csv +read.delim=utils::read.delim +read.table=utils::read.table +tail=utils::tail +type.convert=utils::type.convert +write.csv=utils::write.csv +write.table=utils::write.table + +as.integer64=bit64::as.integer64 +integer64=bit64::integer64 +is.na.integer64=bit64::is.na.integer64 +lim.integer64=bit64::lim.integer64 +NA_integer64_=bit64::NA_integer64_ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { - if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { - remove.packages("data.table") - stop("This is dev mode but 
data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") - } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") rm_all = function() {} @@ -164,7 +187,7 @@ base_messages = list( missing_coerce_method = get_msg(delim = '"', { old = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly on.exit(options(old)) - as(TRUE, 'foo') + methods::as(TRUE, 'foo') }), missing_dispatch_method = get_msg(conditionMessage(structure(1, class="foo")), '[\'"]'), invalid_arg_unary_operator = get_msg(-'a'), @@ -424,8 +447,8 @@ test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key test(110, sum(is.na(dt2)), 1L) -test(111, {setkey(dt,NULL);dt}, na.omit(dt)) -test(112, dt2[2:nrow(dt2),A], na.omit(dt2)$A) +test(111, {setkey(dt,NULL);dt}, stats::na.omit(dt)) +test(112, dt2[2:nrow(dt2),A], stats::na.omit(dt2)$A) # test [<- assignment: dt2[is.na(dt2)] <- 1L @@ -1679,8 +1702,8 @@ test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retain ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) test(538, DT[,sum(v),keyby="a"], ans) -var="a" -test(539, DT[,sum(v),keyby=eval(var)], ans) +byvar="a" +test(539, DT[,sum(v),keyby=eval(byvar)], ans) a=quote(a%%2L) test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) @@ -1783,7 +1806,7 @@ test(584, DT[a<1], output="Empty data.table (0 rows and 2 cols): a,v") test(585, DT[a<1,list(v)], output="Empty data.table (0 rows and 1 cols): v") test(586.1, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows and 2 cols): a,V1") env = environment() -data(iris, package='datasets', envir = env) # in case user has edited iris in their session 
+utils::data(iris, package='datasets', envir = env) # in case user has edited iris in their session test(586.2, print.data.table(iris[,FALSE]), output="Empty data.frame (150 rows and 0 cols)") #3363 # Test that .N is available in by on empty table, also in #1945 @@ -1893,7 +1916,7 @@ DT$time1 <- Sys.time() # recycle via *tmp* DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp* DT[,time3:=Sys.time()] # recycle DT[,time4:=rep(Sys.time(),5)] # plonk -test(625, all(sapply(DT,is,"POSIXct")[-1])) +test(625, all(sapply(DT, inherits, "POSIXct")[-1])) # unique on ITime doesn't lose attributes, #1719 t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")) @@ -2475,7 +2498,7 @@ test(834, comment(DT1[2:3]$A), "first comment") # Test that matrix RHS of := is caught, #2333 DT = data.table(a=1:3) DT[,a:=scale(a)] # 1 column matrix auto treated as vector -test(835, na.omit(DT), DT) +test(835, stats::na.omit(DT), DT) test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1))) test(837, DT[,a:=cbind(1,2)], warning = "2 column matrix RHS of := will be treated as one vector", @@ -3177,7 +3200,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.051, ans1, melt(DT, id.vars="id", measure.vars=list(c(5, 6), c(7, 8)))) test(1035.052, melt(DT, id.vars="id", measure.vars=list(as.raw(0))), error="Unknown 'measure.vars' type raw") - test(1035.06, na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) + test(1035.06, stats::na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) test(1035.07, ans1, melt(DT, id.vars="id", measure.vars=patterns("d_", "l_"))) # melt retains ordered factors! 
test(1035.08, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE)) @@ -3401,8 +3424,8 @@ test(1064, DT[integer(0), list(x2=x), by=x], output="Empty data.table (0 rows an # bug #2445 fix - := fails when subsetting yields NAs and with=FALSE X = data.table(A=1:3, B=1:6, key="A") -var <- "B" -test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) +col <- "B" +test(1065, X[J(2:5), (col):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) # fread single unnamed colClasses f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n" @@ -3472,7 +3495,7 @@ setkey(DT2, p,q) ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] # NB: DT2 contains duplicate key values so columns c ends up not being sorted test(1082.1, key(ans), c("a","b")) test(1082.2, setkeyv(ans, key(ans)), ans) # i.e. key is valid, otherwise re-built warning will be caught -check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b) +check <- setkey(as.data.table(stats::aggregate(r ~a+b+c, ans, length)), a, b) test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check" # Tests for #2531. `:=` loses POSIXct or ITime attribute: @@ -3565,8 +3588,9 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, "5" = as.Date(c(NA, "2014-06-15", "2014-05-18", NA)), "6" = as.Date(c(NA, NA, "2014-06-15", NA)), key="ID")) - names(ChickWeight) <- tolower(names(ChickWeight)) - DT = melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table + DT = ChickWeight + names(DT) <- tolower(names(DT)) + DT = melt(as.data.table(DT), id.vars=2:4) # calls melt.data.table # changed 'mean' to 'sum' to avoid valgrind floating point precision based error. 
test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) @@ -5524,12 +5548,8 @@ setkey(X, val1) test(1354, X[Y, val2 := i.val2, allow.cartesian=TRUE][, val1 := NULL][order(id)], data.table(id=1:10, val2=as.integer(c(8,7,7,6,8,6,6,7,7,8)))) # Fix for #475, setDT(CO2) should error, as it's trying to modify the object whose binding is locked. -# CO2 is not locked in R 2.14.1 but is in R >= 3.1.0. R NEWS isn't clear when that change happened, so just test there is an error when it is locked. -if (bindingIsLocked("CO2",as.environment("package:datasets"))) { - test(1355.1, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") -} else { - test(1355.2, setDT(CO2), CO2) -} +# NB: requires datasets be attached -- no error thrown on datasets::CO2 or CO2=datasets::CO2 or get("CO2", asNamespace("CO2")) +test(1355, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") # Fix for #698. not join doesn't need to check for allow.cartesian=TRUE. DT1 <- data.table(x=rep(1:3, each=3L), y=1:9, key="x") @@ -5754,7 +5774,7 @@ dt = list(AA=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE)) if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]]) test_no = 1370.0 -ans = as.list(na.omit(as.data.table(dt))) +ans = as.list(stats::na.omit(as.data.table(dt))) for (i in seq_along(dt)) { combn(names(dt), i, function(cols) { ans1 = is_na(dt[cols]) @@ -6091,9 +6111,9 @@ test(1391.2, subset(DT, select=c("V2", "V1")), DT[, c("V2", "V1"), with=FALSE]) # Test faster version of na.omit() using is_na. DT = data.table(x=sample(c(1:2, NA), 30, TRUE), y=sample(c(1:5, NA, NaN), 30, TRUE)) -test(1392.1, na.omit(DT), DT[!is.na(x) & !is.na(y)]) +test(1392.1, stats::na.omit(DT), DT[!is.na(x) & !is.na(y)]) # added 'invert = ', a logical argument which when TRUE returns rows that has any NAs instead. 
-test(1392.2, na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)]) +test(1392.2, stats::na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)]) # Fix for #899. Mix of ordered and normal factors where normal factors in more than 1 data.table has identical levels. DT1 = data.table(A = factor(INT(7,8,7,8,7)), B = factor(6:10), C = 0) @@ -6116,8 +6136,8 @@ DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as test_no = 1394 for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { - ans1 = na.omit(DT, cols=cols) - ans2 = DT[complete.cases(DT[, cols, with=FALSE])] + ans1 = stats::na.omit(DT, cols=cols) + ans2 = DT[stats::complete.cases(DT[, cols, with=FALSE])] test_no <<- test_no+.001 test(test_no, ans1, ans2) 0L @@ -6634,11 +6654,11 @@ test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT) # Test for na.omit with list, raw and complex types DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA)) -test(1460.1, na.omit(DT, cols="w"), DT[c(1,3)]) -test(1460.2, na.omit(DT, cols="v"), DT[1]) -test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0]) -test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1]) -test(1460.5, na.omit(DT, cols=c("w", "v")), DT[1]) +test(1460.1, stats::na.omit(DT, cols="w"), DT[c(1,3)]) +test(1460.2, stats::na.omit(DT, cols="v"), DT[1]) +test(1460.3, stats::na.omit(DT, cols=c("v", "y")), DT[0]) +test(1460.4, stats::na.omit(DT, cols=c("z", "v")), DT[1]) +test(1460.5, stats::na.omit(DT, cols=c("w", "v")), DT[1]) # Fix for #985 DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4)) @@ -6898,7 +6918,7 @@ test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L) # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) -test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean))) +test(1476.1, DT[, .(y=mean(y)), x], setDT(stats::aggregate(y ~ x, DT, mean))) # test 
for 'transpose' of a list ll = lapply(1:12, function(x) { @@ -9030,10 +9050,10 @@ test(1627.5, fread(testDir("utf16be.txt")), error="File is encoded in UTF-16") # uniqueN gains na.rm argument, #1455 set.seed(1L) dt = data.table(x=sample(c(1:3,NA),25,TRUE), y=sample(c(NA,"a", "b"), 25,TRUE), z=sample(2,25,TRUE)) -test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y)]))) -test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)]))) -test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z]) -test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z]) +test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(stats::na.omit(dt[, .N, by=.(x,y)]))) +test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(stats::na.omit(dt[, .N, by=.(x,y,z)]))) +test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(stats::na.omit(y))), by=z]) +test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(stats::na.omit(.SD[, .N, by=.(x,y)])), by=z]) # fix for long standing FR/bug, #495 # most likely I'm missing some tests, but we'll fix/add them as we go along. 
@@ -9612,7 +9632,7 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p test(1639.138, ans, split(as.data.table(df), by=c("product","year"))) test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE))) # test if split preallocate columns in results #1908 -dt = data.table(x=rexp(100),y=rep(LETTERS[1:10], 10)) +dt = data.table(x=stats::rexp(100),y=rep(LETTERS[1:10], 10)) dtL = split(dt, by = "y") test(1639.140, dim(dtL[[1]][, x2 := -x]), c(10L,3L)) test(1639.141, all(sapply(dtL, truelength) > 1000)) @@ -9721,8 +9741,8 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { dt1 = nq_fun(100L) # 400 reduced to 100, #5517 dt2 = nq_fun(50L) -x = na.omit(dt1) -y = na.omit(dt2) +x = stats::na.omit(dt1) +y = stats::na.omit(dt2) if (.Machine$sizeof.pointer>4) { @@ -10201,9 +10221,9 @@ test(1677.2, foverlaps(b, a), error="y has some duplicated column") # na.omit.data.table removes indices #1734 dt = data.table(a=4:1, b=c(letters[c(1L,NA,2:3)])) setindexv(dt, "a") -test(1678.1, indices(dt2 <- na.omit(dt, cols="b")), NULL) +test(1678.1, indices(dt2 <- stats::na.omit(dt, cols="b")), NULL) setindexv(dt2, "a") -test(1678.2, indices(na.omit(dt2, cols="b")), "a") +test(1678.2, indices(stats::na.omit(dt2, cols="b")), "a") # rleid gains `prefix` argument, similar to rowid x = sample(3,10,TRUE) @@ -12440,11 +12460,8 @@ for (i in 100:1) { test(1871.2 + (100-i)/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) } -# miscellaneous missing tests uncovered by CodeCov difference -# in the process of PR #2573 -## data.table cannot recycle complicated types -short_s4_col = getClass("MethodDefinition") -test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") +# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 +# 1872.01 moved to S4.Rraw since it uses S4 ## i must be a 
data.table when on is specified DT = data.table(a = 1:3) test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'") @@ -12593,11 +12610,11 @@ test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.tab # na.omit with invert & no NAs works, #2660 DT = data.table(a = 1:5) -test(1887.1, na.omit(DT), DT) -test(1887.2, na.omit(DT, invert=TRUE), DT[0L]) +test(1887.1, stats::na.omit(DT), DT) +test(1887.2, stats::na.omit(DT, invert=TRUE), DT[0L]) DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 -test(1887.3, na.omit(DT), DT[0L]) -test(1887.4, na.omit(DT, invert=TRUE), DT) +test(1887.3, stats::na.omit(DT), DT[0L]) +test(1887.4, stats::na.omit(DT, invert=TRUE), DT) x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too test(1888, fsort(x), base::sort(x)) @@ -12957,45 +12974,8 @@ for (col in c('b', 'c')) { test_no = test_no + 0.0001 test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } -# -# tests-S4.R (S4 Compatability) -# -suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" -suppressWarnings(setClass("S4Composition", representation(data="data.table"))) -# data.table can be a parent class -ids <- sample(letters[1:3], 10, replace=TRUE) -scores <- rnorm(10) -dt <- data.table(id=ids, score=scores) -dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1914.01, isS4(dt.s4)) -test(1914.02, inherits(dt.s4, 'data.table')) -# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table -test(1914.03, isS4(shallow(dt.s4))) -## pull out data from S4 as.list, and compare to list from dt -dt.s4.list <- dt.s4@.Data -names(dt.s4.list) <- names(dt.s4) -test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical -# simple S4 conversion-isms work -df = data.frame(a=sample(letters, 10), b=1:10) -dt = as.data.table(df) -test(1914.05, identical(methods::as(df, 'data.table'), dt)) -test(1914.06, identical(methods::as(dt, 'data.frame'), df)) -# data.table can be used in an S4 slot -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -test(1914.07, dt.comp@data, dt) -# S4 methods dispatch properly on data.table slots" -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) -setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) -setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1914.08, dtGet(dt.comp), dt) # actually -test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) -removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev -removeClass("S4Composition") -# END port of old testthat tests + +# Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 str = "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 5.1,3.5,1.4,0.2,setosa @@ -13039,13 +13019,13 @@ test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure( test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) # wrong bmerge result if character gets coerced to factor, i is keyed, and level order in i is different from x, #2881 -iris = data.table(iris) -iris$grp = rep(c('A','B'), 75L) -iris[, Species1 := 
factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] +iris.dt = data.table(iris) +iris.dt$grp = rep(c('A','B'), 75L) +iris.dt[, Species1 := factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] iSorted = data.table(Species1 = c('setosa','Virginica'), grp='B', key=c("grp","Species1")) i = setkey(copy(iSorted),NULL) -test(1920, iris[iSorted, on = c("grp==grp", 'Species1==Species1')], - iris[i, on = c("grp==grp", 'Species1==Species1')]) +test(1920, iris.dt[iSorted, on = c("grp==grp", 'Species1==Species1')], + iris.dt[i, on = c("grp==grp", 'Species1==Species1')]) # origin= ignored by as.IDate.numeric(), #2880 test(1921.1, as.IDate(1000, origin = "1930-01-01"), as.IDate("1932-09-27")) @@ -13199,7 +13179,7 @@ test(1944.3, DT[flag == 1, sum(x), keyby = group], # should not use index data.table(group=c("A","B"), V1=INT(1,8), key="group")) set.seed(123) N = 10 -DT = data.table(group = rbinom(N, 5, 0.5), x = 1:N, flag = rbinom(N, 1, 0.9)) +DT = data.table(group = stats::rbinom(N, 5, 0.5), x = 1:N, flag = stats::rbinom(N, 1, 0.9)) test(1944.4, DT[flag == 1 & group == 1, x], 6L) test(1944.5, indices(DT), "group__flag") test(1944.6, DT[flag == 1, sum(x), keyby = group], data.table(group=1:4, V1=INT(6,3,18,17), key="group")) @@ -14019,10 +13999,10 @@ DT = data.table(a=1:3, b=4:6, key="a") K = data.table(a=2:3, FOO=9L, BAR=12L) test(1973.1, DT[K, "FOO"], data.table(FOO=c(9L,9L))) test(1973.2, DT[K, "FOO", with=FALSE], data.table(FOO=c(9L,9L))) -var = "b" -test(1973.3, DT[K, c(var, "FOO")], c("b","FOO")) -test(1973.4, DT[K, c(..var, "FOO")], ans<-data.table(b=5:6, FOO=9L)) -test(1973.5, DT[K, c(var, "FOO"), with=FALSE], ans) +col = "b" +test(1973.3, DT[K, c(col, "FOO")], c("b","FOO")) +test(1973.4, DT[K, c(..col, "FOO")], ans<-data.table(b=5:6, FOO=9L)) +test(1973.5, DT[K, c(col, "FOO"), with=FALSE], ans) # no error when j is supplied but inherits missingness from caller DT = 
data.table(a=1:3, b=4:6) @@ -14121,10 +14101,10 @@ test(1984.20, dimnames(DT) <- list(NULL, 5), error = "Can't assign 1 names") dimnames(DT) <- list(NULL, 1:5) test(1984.21, names(DT), paste0(1:5)) DT = data.table(a = 1:10) -test(1984.22, na.omit(DT, invert = 'a'), error="'invert' must be logical") -test(1984.23, na.omit(DT, cols = 'b'), error="received non-existing column*.*b") -#test(1984.24, na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency -test(1984.242, na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 +test(1984.22, stats::na.omit(DT, invert = 'a'), error="'invert' must be logical") +test(1984.23, stats::na.omit(DT, cols = 'b'), error="received non-existing column*.*b") +#test(1984.24, stats::na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency +test(1984.242, stats::na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 ### idcol = TRUE behavior of rbindlist test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2, a=1:2)) test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`') @@ -16414,7 +16394,7 @@ test(2119.17, data.table(a=1:2)[, newcol := list(2L, 3L)], ans) # i symbol fetch from calling scope; #3669 iDT = data.table(key = "i_id", i_id = c("A", "B", "C", "D"), - g = state.name[c(1,1,2,3)], + g = c("Alabama", "Alabama", "Alaska", "Arizona"), e_date = as.IDate(c("2019-01-20", "2019-01-20", "2019-01-01", "2019-01-01")), e_time = as.ITime(c("14:00", "20:00", "20:00", "20:00")) ) @@ -16680,12 +16660,7 @@ DT = data.table( y = list(list(x=1, y=c("yes", "no")), list(x=2, y=2))) test(2130.02, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) -s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) -DT = data.table( - x = 1:2, - y = list(s4class(x=1L, y=c("yes", "no"), z=2.5), - 
s4class(x=2L, y="yes", z=1))) -test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) +# test 2130.03 moved to S4.Rraw # format_col and format_list_item printing helpers/generics ## Use case: solve #2842 by defining format_col.POSIXct to have usetz = TRUE @@ -16726,7 +16701,7 @@ registerS3method("format", "myclass2130", format.default) registerS3method("format", "foo2130", format.default) DT = data.table(num = 1:2, - formula = list(as.formula("mpg~cyl")), + formula = list(mpg~cyl), model = list(lm(mpg~cyl, mtcars)), shallow = list(1:3, 4:6), nested = list(list(1:3), list(4:6))) @@ -16751,16 +16726,7 @@ dt = data.table(x = rep(1:3, each = 3), y = runif(9)) out = dt[, list(evaluated = list(f(copy(.SD)))), by = x] test(2131.2, class(out$evaluated[[1L]]), 'environment') -# S4 object not suported in fifelse and fcase, #4135 -class2132 = setClass("class2132", slots=list(x="numeric")) -s1 = class2132(x=20191231) -s2 = class2132(x=20191230) -test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") -test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") -rm(s1, s2, class2132) - +# 2132 tested S4 in fcase()/fifelse() moved to S4.Rraw # 2133 tested xts moved to other.Rraw 20, #5516 # friendlier error for common mistake of using := in i instead of j, #4227 @@ -16995,7 +16961,7 @@ DT = data.table(a = c("s", "x"), survmean = 1:2) test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.')) # list object with [[ method that returns itself (e.g. 
person) lead to infinite loop in copy(), #4620 -y = person(given='Joel', family='Mossong') +y = utils::person(given='Joel', family='Mossong') test(2152, copy(y), y) # .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655 @@ -17296,7 +17262,7 @@ measurev = list("foo", "bar")#measurev below should not use this since it is not test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword -iris.dt = data.table(datasets::iris) +iris.dt = data.table(iris) test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") @@ -17327,7 +17293,7 @@ test(2183.09, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), 
error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. # general measure errors. -iris.dt = data.table(datasets::iris) +iris.dt = data.table(iris) test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") # school example. schools.wide <- data.table( @@ -17366,7 +17332,7 @@ myfac = function(x)factor(x)#user-defined conversion function. test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) # measure errors. iris.i <- 1 -iris.num <- datasets::iris[iris.i, 1:4] +iris.num <- iris[iris.i, 1:4] iris.days <- data.table( day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning=base_messages$coerce_na) diff --git a/tests/S4.R b/tests/S4.R new file mode 100644 index 000000000..90ed742cf --- /dev/null +++ b/tests/S4.R @@ -0,0 +1,2 @@ +library(data.table) +test.data.table(script="S4.Rraw") From 53df7e59d1f2f75e858868c4538700267b0978d8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 6 Apr 2024 21:38:56 -0700 Subject: [PATCH 039/106] Revert "Move S4 tests to separate script" This reverts commit 7b749b1e1649f94736e7ac672bde6bc2b2ed3bf3. 
--- inst/tests/S4.Rraw | 79 --------------- inst/tests/tests.Rraw | 224 ++++++++++++++++++++++++------------------ tests/S4.R | 2 - 3 files changed, 129 insertions(+), 176 deletions(-) delete mode 100644 inst/tests/S4.Rraw delete mode 100644 tests/S4.R diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw deleted file mode 100644 index 743875391..000000000 --- a/inst/tests/S4.Rraw +++ /dev/null @@ -1,79 +0,0 @@ -if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { - if (length(find.package("data.table", quiet=TRUE))) { - remove.packages("data.table") - stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") - } - if ((tt<-compiler::enableJIT(-1))>0) - cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") - rm_all = function() {} - DTfun = DT ## otherwise DT would be re-defined by many tests -} else { - library(data.table) - - shallow = data.table:::shallow - test = data.table:::test -} - -library(methods) - -suppressWarnings({ - setClass("Data.Table", contains="data.table") # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" - setClass("S4Composition", representation(data="data.table")) -}) -# data.table can be a parent class -ids <- sample(letters[1:3], 10, replace=TRUE) -scores <- stats::rnorm(10) -dt <- data.table(id=ids, score=scores) -dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1.01, isS4(dt.s4)) -test(1.02, inherits(dt.s4, 'data.table')) -# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table -test(1.03, isS4(shallow(dt.s4))) -## pull out data from S4 as.list, and compare to list from dt -dt.s4.list <- dt.s4@.Data -names(dt.s4.list) <- names(dt.s4) -test(1.04, dt.s4.list, as.list(dt)) # Underlying data not identical -# simple S4 conversion-isms work -df = data.frame(a=sample(letters, 10), b=1:10) -dt = as.data.table(df) -test(1.05, identical(methods::as(df, 'data.table'), dt)) -test(1.06, identical(methods::as(dt, 'data.frame'), df)) -# data.table can be used in an S4 slot -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -test(1.07, dt.comp@data, dt) -# S4 methods dispatch properly on data.table slots" -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) -setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) -setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1.08, dtGet(dt.comp), dt) # actually -test(1.09, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1.10, identical(dtGet(dt.comp, 'b'), dt$b)) -removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev -removeClass("S4Composition") -# END port of old testthat tests - -# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 [S4 portion, c.f. 1872.* in tests.Rraw] -## data.table cannot recycle complicated types -short_s4_col = getClass("MethodDefinition") -test(2, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") - -# print dims in list-columns, #3671, c.f. 
2130.* in tests.Rraw -s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) -DT = data.table( - x = 1:2, - y = list(s4class(x=1L, y=c("yes", "no"), z=2.5), - s4class(x=2L, y="yes", z=1))) -test(3, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) - -# S4 object not suported in fifelse and fcase, #4135 -class4 = setClass("class4", slots=list(x="numeric")) -s1 = class4(x=20191231) -s2 = class4(x=20191230) -test(4.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(4.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(4.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") -test(4.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") -rm(s1, s2, class4) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8f0fc886a..b0aab587d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1,33 +1,10 @@ -library(datasets) # for airquality, BOD, cars, ChickWeight, CO2, iris, mtcars - -lm=stats::lm -median=stats::median -na.omit=stats::na.omit -rnorm=stats::rnorm -runif=stats::runif -sd=stats::sd -setNames=stats::setNames -var=stats::var -weighted.mean=stats::weighted.mean - -capture.output=utils::capture.output -combn=utils::combn -head=utils::head -read.csv=utils::read.csv -read.delim=utils::read.delim -read.table=utils::read.table -tail=utils::tail -type.convert=utils::type.convert -write.csv=utils::write.csv -write.table=utils::write.table - -as.integer64=bit64::as.integer64 -integer64=bit64::integer64 -is.na.integer64=bit64::is.na.integer64 -lim.integer64=bit64::lim.integer64 -NA_integer64_=bit64::NA_integer64_ +require(methods) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { + remove.packages("data.table") + stop("This is dev mode but 
data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") + } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") rm_all = function() {} @@ -187,7 +164,7 @@ base_messages = list( missing_coerce_method = get_msg(delim = '"', { old = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly on.exit(options(old)) - methods::as(TRUE, 'foo') + as(TRUE, 'foo') }), missing_dispatch_method = get_msg(conditionMessage(structure(1, class="foo")), '[\'"]'), invalid_arg_unary_operator = get_msg(-'a'), @@ -447,8 +424,8 @@ test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key test(110, sum(is.na(dt2)), 1L) -test(111, {setkey(dt,NULL);dt}, stats::na.omit(dt)) -test(112, dt2[2:nrow(dt2),A], stats::na.omit(dt2)$A) +test(111, {setkey(dt,NULL);dt}, na.omit(dt)) +test(112, dt2[2:nrow(dt2),A], na.omit(dt2)$A) # test [<- assignment: dt2[is.na(dt2)] <- 1L @@ -1702,8 +1679,8 @@ test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retain ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) test(538, DT[,sum(v),keyby="a"], ans) -byvar="a" -test(539, DT[,sum(v),keyby=eval(byvar)], ans) +var="a" +test(539, DT[,sum(v),keyby=eval(var)], ans) a=quote(a%%2L) test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) @@ -1806,7 +1783,7 @@ test(584, DT[a<1], output="Empty data.table (0 rows and 2 cols): a,v") test(585, DT[a<1,list(v)], output="Empty data.table (0 rows and 1 cols): v") test(586.1, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows and 2 cols): a,V1") env = environment() -utils::data(iris, package='datasets', envir = env) # in case user has edited iris in their session 
+data(iris, package='datasets', envir = env) # in case user has edited iris in their session test(586.2, print.data.table(iris[,FALSE]), output="Empty data.frame (150 rows and 0 cols)") #3363 # Test that .N is available in by on empty table, also in #1945 @@ -1916,7 +1893,7 @@ DT$time1 <- Sys.time() # recycle via *tmp* DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp* DT[,time3:=Sys.time()] # recycle DT[,time4:=rep(Sys.time(),5)] # plonk -test(625, all(sapply(DT, inherits, "POSIXct")[-1])) +test(625, all(sapply(DT,is,"POSIXct")[-1])) # unique on ITime doesn't lose attributes, #1719 t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")) @@ -2498,7 +2475,7 @@ test(834, comment(DT1[2:3]$A), "first comment") # Test that matrix RHS of := is caught, #2333 DT = data.table(a=1:3) DT[,a:=scale(a)] # 1 column matrix auto treated as vector -test(835, stats::na.omit(DT), DT) +test(835, na.omit(DT), DT) test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1))) test(837, DT[,a:=cbind(1,2)], warning = "2 column matrix RHS of := will be treated as one vector", @@ -3200,7 +3177,7 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) test(1035.051, ans1, melt(DT, id.vars="id", measure.vars=list(c(5, 6), c(7, 8)))) test(1035.052, melt(DT, id.vars="id", measure.vars=list(as.raw(0))), error="Unknown 'measure.vars' type raw") - test(1035.06, stats::na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) + test(1035.06, na.omit(ans1), melt(DT, id.vars="id", measure.vars=list(5:6, 7:8), na.rm=TRUE)) test(1035.07, ans1, melt(DT, id.vars="id", measure.vars=patterns("d_", "l_"))) # melt retains ordered factors! 
test(1035.08, melt(DT, id.vars="id", measure.vars=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE)) @@ -3424,8 +3401,8 @@ test(1064, DT[integer(0), list(x2=x), by=x], output="Empty data.table (0 rows an # bug #2445 fix - := fails when subsetting yields NAs and with=FALSE X = data.table(A=1:3, B=1:6, key="A") -col <- "B" -test(1065, X[J(2:5), (col):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) +var <- "B" +test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) # fread single unnamed colClasses f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n" @@ -3495,7 +3472,7 @@ setkey(DT2, p,q) ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] # NB: DT2 contains duplicate key values so columns c ends up not being sorted test(1082.1, key(ans), c("a","b")) test(1082.2, setkeyv(ans, key(ans)), ans) # i.e. key is valid, otherwise re-built warning will be caught -check <- setkey(as.data.table(stats::aggregate(r ~a+b+c, ans, length)), a, b) +check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b) test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check" # Tests for #2531. `:=` loses POSIXct or ITime attribute: @@ -3588,9 +3565,8 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, "5" = as.Date(c(NA, "2014-06-15", "2014-05-18", NA)), "6" = as.Date(c(NA, NA, "2014-06-15", NA)), key="ID")) - DT = ChickWeight - names(DT) <- tolower(names(DT)) - DT = melt(as.data.table(DT), id.vars=2:4) # calls melt.data.table + names(ChickWeight) <- tolower(names(ChickWeight)) + DT = melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table # changed 'mean' to 'sum' to avoid valgrind floating point precision based error. 
test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) @@ -5548,8 +5524,12 @@ setkey(X, val1) test(1354, X[Y, val2 := i.val2, allow.cartesian=TRUE][, val1 := NULL][order(id)], data.table(id=1:10, val2=as.integer(c(8,7,7,6,8,6,6,7,7,8)))) # Fix for #475, setDT(CO2) should error, as it's trying to modify the object whose binding is locked. -# NB: requires datasets be attached -- no error thrown on datasets::CO2 or CO2=datasets::CO2 or get("CO2", asNamespace("CO2")) -test(1355, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") +# CO2 is not locked in R 2.14.1 but is in R >= 3.1.0. R NEWS isn't clear when that change happened, so just test there is an error when it is locked. +if (bindingIsLocked("CO2",as.environment("package:datasets"))) { + test(1355.1, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") +} else { + test(1355.2, setDT(CO2), CO2) +} # Fix for #698. not join doesn't need to check for allow.cartesian=TRUE. DT1 <- data.table(x=rep(1:3, each=3L), y=1:9, key="x") @@ -5774,7 +5754,7 @@ dt = list(AA=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE)) if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]]) test_no = 1370.0 -ans = as.list(stats::na.omit(as.data.table(dt))) +ans = as.list(na.omit(as.data.table(dt))) for (i in seq_along(dt)) { combn(names(dt), i, function(cols) { ans1 = is_na(dt[cols]) @@ -6111,9 +6091,9 @@ test(1391.2, subset(DT, select=c("V2", "V1")), DT[, c("V2", "V1"), with=FALSE]) # Test faster version of na.omit() using is_na. DT = data.table(x=sample(c(1:2, NA), 30, TRUE), y=sample(c(1:5, NA, NaN), 30, TRUE)) -test(1392.1, stats::na.omit(DT), DT[!is.na(x) & !is.na(y)]) +test(1392.1, na.omit(DT), DT[!is.na(x) & !is.na(y)]) # added 'invert = ', a logical argument which when TRUE returns rows that has any NAs instead. 
-test(1392.2, stats::na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)]) +test(1392.2, na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)]) # Fix for #899. Mix of ordered and normal factors where normal factors in more than 1 data.table has identical levels. DT1 = data.table(A = factor(INT(7,8,7,8,7)), B = factor(6:10), C = 0) @@ -6136,8 +6116,8 @@ DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as test_no = 1394 for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { - ans1 = stats::na.omit(DT, cols=cols) - ans2 = DT[stats::complete.cases(DT[, cols, with=FALSE])] + ans1 = na.omit(DT, cols=cols) + ans2 = DT[complete.cases(DT[, cols, with=FALSE])] test_no <<- test_no+.001 test(test_no, ans1, ans2) 0L @@ -6654,11 +6634,11 @@ test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT) # Test for na.omit with list, raw and complex types DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA)) -test(1460.1, stats::na.omit(DT, cols="w"), DT[c(1,3)]) -test(1460.2, stats::na.omit(DT, cols="v"), DT[1]) -test(1460.3, stats::na.omit(DT, cols=c("v", "y")), DT[0]) -test(1460.4, stats::na.omit(DT, cols=c("z", "v")), DT[1]) -test(1460.5, stats::na.omit(DT, cols=c("w", "v")), DT[1]) +test(1460.1, na.omit(DT, cols="w"), DT[c(1,3)]) +test(1460.2, na.omit(DT, cols="v"), DT[1]) +test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0]) +test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1]) +test(1460.5, na.omit(DT, cols=c("w", "v")), DT[1]) # Fix for #985 DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4)) @@ -6918,7 +6898,7 @@ test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L) # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) -test(1476.1, DT[, .(y=mean(y)), x], setDT(stats::aggregate(y ~ x, DT, mean))) +test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean))) # test 
for 'transpose' of a list ll = lapply(1:12, function(x) { @@ -9050,10 +9030,10 @@ test(1627.5, fread(testDir("utf16be.txt")), error="File is encoded in UTF-16") # uniqueN gains na.rm argument, #1455 set.seed(1L) dt = data.table(x=sample(c(1:3,NA),25,TRUE), y=sample(c(NA,"a", "b"), 25,TRUE), z=sample(2,25,TRUE)) -test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(stats::na.omit(dt[, .N, by=.(x,y)]))) -test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(stats::na.omit(dt[, .N, by=.(x,y,z)]))) -test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(stats::na.omit(y))), by=z]) -test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(stats::na.omit(.SD[, .N, by=.(x,y)])), by=z]) +test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y)]))) +test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)]))) +test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z]) +test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z]) # fix for long standing FR/bug, #495 # most likely I'm missing some tests, but we'll fix/add them as we go along. 
@@ -9632,7 +9612,7 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p test(1639.138, ans, split(as.data.table(df), by=c("product","year"))) test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE))) # test if split preallocate columns in results #1908 -dt = data.table(x=stats::rexp(100),y=rep(LETTERS[1:10], 10)) +dt = data.table(x=rexp(100),y=rep(LETTERS[1:10], 10)) dtL = split(dt, by = "y") test(1639.140, dim(dtL[[1]][, x2 := -x]), c(10L,3L)) test(1639.141, all(sapply(dtL, truelength) > 1000)) @@ -9741,8 +9721,8 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { dt1 = nq_fun(100L) # 400 reduced to 100, #5517 dt2 = nq_fun(50L) -x = stats::na.omit(dt1) -y = stats::na.omit(dt2) +x = na.omit(dt1) +y = na.omit(dt2) if (.Machine$sizeof.pointer>4) { @@ -10221,9 +10201,9 @@ test(1677.2, foverlaps(b, a), error="y has some duplicated column") # na.omit.data.table removes indices #1734 dt = data.table(a=4:1, b=c(letters[c(1L,NA,2:3)])) setindexv(dt, "a") -test(1678.1, indices(dt2 <- stats::na.omit(dt, cols="b")), NULL) +test(1678.1, indices(dt2 <- na.omit(dt, cols="b")), NULL) setindexv(dt2, "a") -test(1678.2, indices(stats::na.omit(dt2, cols="b")), "a") +test(1678.2, indices(na.omit(dt2, cols="b")), "a") # rleid gains `prefix` argument, similar to rowid x = sample(3,10,TRUE) @@ -12460,8 +12440,11 @@ for (i in 100:1) { test(1871.2 + (100-i)/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) } -# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 -# 1872.01 moved to S4.Rraw since it uses S4 +# miscellaneous missing tests uncovered by CodeCov difference +# in the process of PR #2573 +## data.table cannot recycle complicated types +short_s4_col = getClass("MethodDefinition") +test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") ## i must be a 
data.table when on is specified DT = data.table(a = 1:3) test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'") @@ -12610,11 +12593,11 @@ test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.tab # na.omit with invert & no NAs works, #2660 DT = data.table(a = 1:5) -test(1887.1, stats::na.omit(DT), DT) -test(1887.2, stats::na.omit(DT, invert=TRUE), DT[0L]) +test(1887.1, na.omit(DT), DT) +test(1887.2, na.omit(DT, invert=TRUE), DT[0L]) DT = fread(",2,3\n1,,3\n1,2,\n") # all rows contain an NA, #2784 -test(1887.3, stats::na.omit(DT), DT[0L]) -test(1887.4, stats::na.omit(DT, invert=TRUE), DT) +test(1887.3, na.omit(DT), DT[0L]) +test(1887.4, na.omit(DT, invert=TRUE), DT) x = runif(1e3) # 1e4 reduced to 1e3 in #5517 but really it was the 1e6 just after 1888.5 below which is now 1e3 too test(1888, fsort(x), base::sort(x)) @@ -12974,8 +12957,45 @@ for (col in c('b', 'c')) { test_no = test_no + 0.0001 test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } - -# Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 +# +# tests-S4.R (S4 Compatability) +# +suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" +suppressWarnings(setClass("S4Composition", representation(data="data.table"))) +# data.table can be a parent class +ids <- sample(letters[1:3], 10, replace=TRUE) +scores <- rnorm(10) +dt <- data.table(id=ids, score=scores) +dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) +test(1914.01, isS4(dt.s4)) +test(1914.02, inherits(dt.s4, 'data.table')) +# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1914.03, isS4(shallow(dt.s4))) +## pull out data from S4 as.list, and compare to list from dt +dt.s4.list <- dt.s4@.Data +names(dt.s4.list) <- names(dt.s4) +test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical +# simple S4 conversion-isms work +df = data.frame(a=sample(letters, 10), b=1:10) +dt = as.data.table(df) +test(1914.05, identical(methods::as(df, 'data.table'), dt)) +test(1914.06, identical(methods::as(dt, 'data.frame'), df)) +# data.table can be used in an S4 slot +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +test(1914.07, dt.comp@data, dt) +# S4 methods dispatch properly on data.table slots" +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) +setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) +setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) +test(1914.08, dtGet(dt.comp), dt) # actually +test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) +removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev +removeClass("S4Composition") +# END port of old testthat tests str = "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 5.1,3.5,1.4,0.2,setosa @@ -13019,13 +13039,13 @@ test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure( test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) # wrong bmerge result if character gets coerced to factor, i is keyed, and level order in i is different from x, #2881 -iris.dt = data.table(iris) -iris.dt$grp = rep(c('A','B'), 75L) -iris.dt[, Species1 := factor(Species, 
levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] +iris = data.table(iris) +iris$grp = rep(c('A','B'), 75L) +iris[, Species1 := factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] iSorted = data.table(Species1 = c('setosa','Virginica'), grp='B', key=c("grp","Species1")) i = setkey(copy(iSorted),NULL) -test(1920, iris.dt[iSorted, on = c("grp==grp", 'Species1==Species1')], - iris.dt[i, on = c("grp==grp", 'Species1==Species1')]) +test(1920, iris[iSorted, on = c("grp==grp", 'Species1==Species1')], + iris[i, on = c("grp==grp", 'Species1==Species1')]) # origin= ignored by as.IDate.numeric(), #2880 test(1921.1, as.IDate(1000, origin = "1930-01-01"), as.IDate("1932-09-27")) @@ -13179,7 +13199,7 @@ test(1944.3, DT[flag == 1, sum(x), keyby = group], # should not use index data.table(group=c("A","B"), V1=INT(1,8), key="group")) set.seed(123) N = 10 -DT = data.table(group = stats::rbinom(N, 5, 0.5), x = 1:N, flag = stats::rbinom(N, 1, 0.9)) +DT = data.table(group = rbinom(N, 5, 0.5), x = 1:N, flag = rbinom(N, 1, 0.9)) test(1944.4, DT[flag == 1 & group == 1, x], 6L) test(1944.5, indices(DT), "group__flag") test(1944.6, DT[flag == 1, sum(x), keyby = group], data.table(group=1:4, V1=INT(6,3,18,17), key="group")) @@ -13999,10 +14019,10 @@ DT = data.table(a=1:3, b=4:6, key="a") K = data.table(a=2:3, FOO=9L, BAR=12L) test(1973.1, DT[K, "FOO"], data.table(FOO=c(9L,9L))) test(1973.2, DT[K, "FOO", with=FALSE], data.table(FOO=c(9L,9L))) -col = "b" -test(1973.3, DT[K, c(col, "FOO")], c("b","FOO")) -test(1973.4, DT[K, c(..col, "FOO")], ans<-data.table(b=5:6, FOO=9L)) -test(1973.5, DT[K, c(col, "FOO"), with=FALSE], ans) +var = "b" +test(1973.3, DT[K, c(var, "FOO")], c("b","FOO")) +test(1973.4, DT[K, c(..var, "FOO")], ans<-data.table(b=5:6, FOO=9L)) +test(1973.5, DT[K, c(var, "FOO"), with=FALSE], ans) # no error when j is supplied but inherits missingness from caller DT = data.table(a=1:3, b=4:6) 
@@ -14101,10 +14121,10 @@ test(1984.20, dimnames(DT) <- list(NULL, 5), error = "Can't assign 1 names") dimnames(DT) <- list(NULL, 1:5) test(1984.21, names(DT), paste0(1:5)) DT = data.table(a = 1:10) -test(1984.22, stats::na.omit(DT, invert = 'a'), error="'invert' must be logical") -test(1984.23, stats::na.omit(DT, cols = 'b'), error="received non-existing column*.*b") -#test(1984.24, stats::na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency -test(1984.242, stats::na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 +test(1984.22, na.omit(DT, invert = 'a'), error="'invert' must be logical") +test(1984.23, na.omit(DT, cols = 'b'), error="received non-existing column*.*b") +#test(1984.24, na.omit(DT, cols = c('b', 'c')), error="Columns [b, c] don't") # only first non-existing col is now reported for efficiency +test(1984.242, na.omit(data.table(A=c(1,NA,2)), cols=character()), data.table(A=c(1,NA,2))) #2514 ### idcol = TRUE behavior of rbindlist test(1984.25, rbindlist(list(DT[1L], DT[2L]), idcol = TRUE), data.table(.id=1:2, a=1:2)) test(1984.26, setalloccol(`*tmp*`), error='setalloccol attempting to modify `*tmp*`') @@ -16394,7 +16414,7 @@ test(2119.17, data.table(a=1:2)[, newcol := list(2L, 3L)], ans) # i symbol fetch from calling scope; #3669 iDT = data.table(key = "i_id", i_id = c("A", "B", "C", "D"), - g = c("Alabama", "Alabama", "Alaska", "Arizona"), + g = state.name[c(1,1,2,3)], e_date = as.IDate(c("2019-01-20", "2019-01-20", "2019-01-01", "2019-01-01")), e_time = as.ITime(c("14:00", "20:00", "20:00", "20:00")) ) @@ -16660,7 +16680,12 @@ DT = data.table( y = list(list(x=1, y=c("yes", "no")), list(x=2, y=2))) test(2130.02, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) -# test 2130.03 moved to S4.Rraw +s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) +DT = data.table( + x = 1:2, + y = list(s4class(x=1L, y=c("yes", "no"), 
z=2.5), + s4class(x=2L, y="yes", z=1))) +test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) # format_col and format_list_item printing helpers/generics ## Use case: solve #2842 by defining format_col.POSIXct to have usetz = TRUE @@ -16701,7 +16726,7 @@ registerS3method("format", "myclass2130", format.default) registerS3method("format", "foo2130", format.default) DT = data.table(num = 1:2, - formula = list(mpg~cyl), + formula = list(as.formula("mpg~cyl")), model = list(lm(mpg~cyl, mtcars)), shallow = list(1:3, 4:6), nested = list(list(1:3), list(4:6))) @@ -16726,7 +16751,16 @@ dt = data.table(x = rep(1:3, each = 3), y = runif(9)) out = dt[, list(evaluated = list(f(copy(.SD)))), by = x] test(2131.2, class(out$evaluated[[1L]]), 'environment') -# 2132 tested S4 in fcase()/fifelse() moved to S4.Rraw +# S4 object not suported in fifelse and fcase, #4135 +class2132 = setClass("class2132", slots=list(x="numeric")) +s1 = class2132(x=20191231) +s2 = class2132(x=20191230) +test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") +test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") +rm(s1, s2, class2132) + # 2133 tested xts moved to other.Rraw 20, #5516 # friendlier error for common mistake of using := in i instead of j, #4227 @@ -16961,7 +16995,7 @@ DT = data.table(a = c("s", "x"), survmean = 1:2) test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.')) # list object with [[ method that returns itself (e.g. 
person) lead to infinite loop in copy(), #4620 -y = utils::person(given='Joel', family='Mossong') +y = person(given='Joel', family='Mossong') test(2152, copy(y), y) # .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655 @@ -17262,7 +17296,7 @@ measurev = list("foo", "bar")#measurev below should not use this since it is not test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword -iris.dt = data.table(iris) +iris.dt = data.table(datasets::iris) test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") @@ -17293,7 +17327,7 @@ test(2183.09, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), 
error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. # general measure errors. -iris.dt = data.table(iris) +iris.dt = data.table(datasets::iris) test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") # school example. schools.wide <- data.table( @@ -17332,7 +17366,7 @@ myfac = function(x)factor(x)#user-defined conversion function. test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) # measure errors. iris.i <- 1 -iris.num <- iris[iris.i, 1:4] +iris.num <- datasets::iris[iris.i, 1:4] iris.days <- data.table( day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning=base_messages$coerce_na) diff --git a/tests/S4.R b/tests/S4.R deleted file mode 100644 index 90ed742cf..000000000 --- a/tests/S4.R +++ /dev/null @@ -1,2 +0,0 @@ -library(data.table) -test.data.table(script="S4.Rraw") From a2213177283f0f15823e1ff823c1fdf63746da3d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 7 Apr 2024 11:35:14 -0700 Subject: [PATCH 040/106] Allow 1-D arrays in j=list() form while grouping (#6054) --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 13 +++++++++++++ src/dogroups.c | 13 +++++++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index d67cc3bc2..669b5a717 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ 7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. 
When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. +8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0aab587d..bd9bd54b6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18476,3 +18476,16 @@ DT = data.table(strrep(ja_ko, 1:3L), strrep(ja_n, 2:4L), strrep(accented_a, 3)) test(2253.17, options=list(datatable.prettyprint.char = 4L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんんん ááá")) test(2253.18, options=list(datatable.prettyprint.char = 3L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ んん ááá", "ここ んんん ááá", "こここ んんん... ááá")) test(2253.19, options=list(datatable.prettyprint.char = 1L), gsub(clean_regex, "", capture.output(print(DT))[-1L]), c("こ ん... á...", "こ... ん... á...", "こ... ん... á...")) + +# allow 1-D matrix in j for consistency, #783 +DT=data.table(a = rep(1:2, 3), b = 1:6) +test(2254.1, DT[, .(cbind(b, b)), by=a], error="Entry 1 for group 1.*2 dimensions > 1") +test(2254.2, DT[, .(replicate(.GRP, b)), by=a], error="Entry 1 for group 2.*2 dimensions > 1") +test(2254.3, DT[, .(b, cbind(b, b)), by=a], error="Entry 2 for group 1.*2 dimensions > 1") +test(2254.4, DT[, .(b, replicate(.GRP, b)), by=a], error="Entry 2 for group 2.*2 dimensions > 1") +test(2254.5, DT[, .(array(dim=2:4)), by=a], error="3 dimensions > 1") +test(2254.6, DT[, .(array(dim=rep(1:2, c(10L, 2L)))), by=a], error="2 dimensions > 1") +# but 1-D matrix is fine +test(2254.7, DT[, .(b = cbind(b)), by=a], DT[order(a)]) +test(2254.8, DT[, .(b = rbind(b)), by=a], DT[order(a)]) +test(2254.9, DT[, .(b = array(b, dim=rep(c(1L, .N), c(10L, 1L)))), by=a], DT[order(a)]) diff --git a/src/dogroups.c b/src/dogroups.c index 5ddd1f672..a72a7e8c5 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -275,8 +275,17 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } for (int j=0; j 1) ++nDimensions; + UNPROTECT(1); + if (nDimensions > 1) + error(_("Entry %d for group %d in j=list(...) 
is an array with %d dimensions > 1, which is disallowed. \"Break\" the array yourself with c() or as.vector() if that is intentional."), j+1, i+1, nDimensions); + } } } if (!isNull(lhs)) { From 79329c8a570028c73e9300143a1d3875236b7f58 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 7 Apr 2024 23:40:53 -0700 Subject: [PATCH 041/106] cc() works under R_DEFAULT_PACKAGES=NULL (#6057) * cc() works under R_DEFAULT_PACKAGES=NULL * comment * Reduce noise in normal usage --- .dev/cc.R | 24 ++++++++++++++++++++++++ R/AllS4.R | 16 ++++++++-------- R/data.table.R | 2 +- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index 6adf0372c..bfe4b0430 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -31,6 +31,29 @@ sourceDir = function(path=getwd(), trace = TRUE, ...) { if(trace) cat("\n") } +# NB: since we only import from default packages, this is rarely needed, but useful for truly minimal dev environments (#6056) +sourceImports = function(path=getwd(), quiet=FALSE) { + nsFile = file.path(path, "NAMESPACE") + if (!file.exists(nsFile)) { + if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly") + return(invisible()) + } + nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function + if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports))) + for (ii in seq_along(nsParsedImports)) { + entry = nsParsedImports[[ii]] + if (paste0("package:", entry[[1L]]) %in% search()) next # not strictly needed since a redundant 'require()' is just skipped, but helpful for reducing noise in !quiet case + if (length(entry) == 1L) { + if (!quiet) cat(sprintf(" Attaching full package %s\n", entry)) + require(entry, character.only=TRUE, quietly=TRUE) + } else { + if (!quiet) cat(sprintf(" Attaching %d objects from package %s: %s\n", length(entry[[2L]]), entry[[1L]], toString(entry[[2L]]))) + 
require(entry[[1L]], character.only=TRUE, include.only=entry[[2L]], quietly=TRUE) + } + } + return(invisible()) +} + cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) { if (!missing(cc_dir)) { warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead") @@ -81,6 +104,7 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys .GlobalEnv[[Call$name]] = Call$address for (Extern in xx$.External) .GlobalEnv[[Extern$name]] = Extern$address + sourceImports(path, quiet=quiet) sourceDir(file.path(path, "R"), trace=!quiet) if (base::getRversion()<"4.0.0") rm(list=c("rbind.data.table", "cbind.data.table"), envir=.GlobalEnv) # 3968 follow up .GlobalEnv$testDir = function(x) file.path(path,"inst/tests",x) diff --git a/R/AllS4.R b/R/AllS4.R index fc3db0fa0..89c2d3f81 100644 --- a/R/AllS4.R +++ b/R/AllS4.R @@ -3,21 +3,21 @@ if ("package:data.table" %in% search()) stopf("data.table package loaded. When d ## Allows data.table to be defined as an object of an S4 class, ## or even have data.table be a super class of an S4 class. 
-setOldClass(c('data.frame')) -setOldClass(c('data.table', 'data.frame')) +methods::setOldClass(c('data.frame')) +methods::setOldClass(c('data.table', 'data.frame')) ## as(some.data.frame, "data.table") -setAs("data.frame", "data.table", function(from) { +methods::setAs("data.frame", "data.table", function(from) { as.data.table(from) }) ## as(some.data.table, "data.frame") -setAs("data.table", "data.frame", function(from) { +methods::setAs("data.table", "data.frame", function(from) { as.data.frame(from) }) -setOldClass("IDate") -setOldClass("ITime") +methods::setOldClass("IDate") +methods::setOldClass("ITime") -setAs("character", "IDate", function(from) as.IDate(from)) -setAs("character", "ITime", function(from) as.ITime(from)) +methods::setAs("character", "IDate", function(from) as.IDate(from)) +methods::setAs("character", "ITime", function(from) as.ITime(from)) diff --git a/R/data.table.R b/R/data.table.R index d55132071..de28676b7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -12,7 +12,7 @@ dim.data.table = function(x) } .global = new.env() # thanks to: http://stackoverflow.com/a/12605694/403310 -setPackageName("data.table",.global) +methods::setPackageName("data.table",.global) .global$print = "" # NB: if adding to/editing this list, be sure to do the following: From 03cda7f2d44369db49ce508c86619a2010df95c6 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 8 Apr 2024 00:08:29 -0700 Subject: [PATCH 042/106] regression test on print issue for nested lists (#6058) --- inst/tests/tests.Rraw | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bd9bd54b6..1fca4bead 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18489,3 +18489,15 @@ test(2254.6, DT[, .(array(dim=rep(1:2, c(10L, 2L)))), by=a], error="2 dimensions test(2254.7, DT[, .(b = cbind(b)), by=a], DT[order(a)]) test(2254.8, DT[, .(b = rbind(b)), by=a], DT[order(a)]) test(2254.9, DT[, .(b = array(b, dim=rep(c(1L, .N), 
c(10L, 1L)))), by=a], DT[order(a)]) + +# regression test on issue reported with printing nested table, #1646 +DF <- structure( + list( + DF1=structure(list(V1=list(1:2, 3:4), V2=5:6), .Names=c("V1", "V2"), class="data.frame", row.names=c(NA, 2L)), + DF2=structure(list(V3=7:8, V4=9:10), .Names=c("V3", "V4"), class="data.frame", row.names=c(NA, 2L)), + V5=11:12 + ), + .Names=c("DF1", "DF2", "V5"), class="data.frame", row.names=c(NA, 2L) +) + +test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5") From e127f4834192973166b30350a2a7731b7d392ab4 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 8 Apr 2024 09:02:33 -0700 Subject: [PATCH 043/106] Update src/fmelt.c Co-authored-by: Michael Chirico --- src/fmelt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fmelt.c b/src/fmelt.c index 2b9707bb2..502e576e0 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -303,7 +303,7 @@ static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valna SEXPTYPE type; data->lmax = 0; data->totlen = 0; data->nrow = length(VECTOR_ELT(DT, 0)); SET_VECTOR_ELT(data->RCHK, 0, vars = checkVars(DT, id, measure, verbose)); - data->measure_is_list = !isNull(measure) && isNewList(measure); + data->measure_is_list = !isNull(measure) && isNewList(measure); // NB: NULL passes isNewList() hence !isNull() too data->idcols = VECTOR_ELT(vars, 0); data->valuecols = VECTOR_ELT(vars, 1); data->lids = length(data->idcols); From 2e1f5efa6d990b6113626856e1df5d65fa536d49 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 8 Apr 2024 09:04:56 -0700 Subject: [PATCH 044/106] mention length=1 --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 42823bfb2..c70b6c799 100644 --- a/NEWS.md +++ b/NEWS.md @@ -46,7 +46,7 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. 
Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` whenever `measure.vars` is a list, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. +7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. ## NOTES From d287ba84c4d1812f6d852e764aa9e1057769b7de Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 8 Apr 2024 09:15:21 -0700 Subject: [PATCH 045/106] comment about how to upgrade --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c70b6c799..7bfd9a416 100644 --- a/NEWS.md +++ b/NEWS.md @@ -46,7 +46,7 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. +7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on the previous behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name).
## NOTES From b8973498673be0918b182522ea4dfcac50ed508c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 8 Apr 2024 12:49:51 -0700 Subject: [PATCH 046/106] remove unneeded step to uninstall data.table under cc() (#6060) --- inst/tests/tests.Rraw | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5f6206c42..3de892294 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1,10 +1,6 @@ require(methods) if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { - if (!identical(suppressWarnings(packageDescription("data.table")), NA)) { - remove.packages("data.table") - stop("This is dev mode but data.table was installed. Uninstalled it. Please q() this R session and try cc() again. The installed namespace causes problems in dev mode for the S4 tests.\n") - } if ((tt<-compiler::enableJIT(-1))>0) cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") rm_all = function() {} From 0bd1b8dd43e3aac001a1bf0ed0942be3c6a300cf Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 8 Apr 2024 12:50:24 -0700 Subject: [PATCH 047/106] Assign imports instead of attaching (#6061) --- .dev/cc.R | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.dev/cc.R b/.dev/cc.R index bfe4b0430..f2031ca48 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -42,14 +42,11 @@ sourceImports = function(path=getwd(), quiet=FALSE) { if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports))) for (ii in seq_along(nsParsedImports)) { entry = nsParsedImports[[ii]] - if (paste0("package:", entry[[1L]]) %in% search()) next # not strictly needed since a redundant 'require()' is just skipped, but helpful for reducing noise in !quiet case - if (length(entry) == 1L) { - if (!quiet) cat(sprintf(" Attaching full package %s\n", entry)) - require(entry, 
character.only=TRUE, quietly=TRUE) - } else { - if (!quiet) cat(sprintf(" Attaching %d objects from package %s: %s\n", length(entry[[2L]]), entry[[1L]], toString(entry[[2L]]))) - require(entry[[1L]], character.only=TRUE, include.only=entry[[2L]], quietly=TRUE) - } + # getNamespaceExports includes weird objects but that's intentional, consider evalq(.__C__VIRTUAL, asNamespace("Rcpp")) due to import(methods) in that NAMESPACE + imported = if (length(entry) == 1L) getNamespaceExports(entry) else entry[[2L]] + # Assign directly to better imitate actual namespace imports. Earlier tried to require(include.only=) these objects, but R doesn't allow multiple such require, meaning we can't add more objects later in tests, see: + # https://stat.ethz.ch/pipermail/r-devel/2024-April/083319.html + for (import in imported) assign(import, getExportedValue(entry[[1L]], import), .GlobalEnv) } return(invisible()) } From 94d98dee033da0a8130637d125a72d4e3c2afbc9 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 8 Apr 2024 21:35:47 -0700 Subject: [PATCH 048/106] row separator "---" is no longer in the rownames column when row.names=FALSE (#6059) * separator is now in first column when row.names=FALSE * Changed old tests for new behavior * Change so "---" separator appears for every column, change tests * review suggestion --------- Co-authored-by: Michael Chirico --- R/print.data.table.R | 6 +++++- inst/tests/tests.Rraw | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/R/print.data.table.R b/R/print.data.table.R index 919c8aaed..dd641f946 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -111,7 +111,11 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = toprint_subset(toprint, cols_to_print) } if (printdots) { - toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) + if (isFALSE(row.names)) { + toprint = rbind(head(toprint, topn + isTRUE(class)), 
"---", tail(toprint, topn)) # 4083 + } else { + toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) + } rownames(toprint) = format(rownames(toprint), justify="right") if (col.names == "none") { cut_colnames(print(toprint, right=TRUE, quote=quote)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3de892294..9f558ef49 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3971,7 +3971,19 @@ test(1137.12, DT[, lapply(.SD, sum), by=x, .SDcols=-"y"], DT[, lapply(.SD, sum), DT <- data.table(x=1:5, y=6:10) test(1138.1, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10")) DT <- data.table(x=1:101, y=6:106) # bug described in #1307 -test(1138.2, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10", "--- ", " 97 102", " 98 103", " 99 104", " 100 105", " 101 106")) +test(1138.2, capture.output(print(DT, row.names=FALSE)), +c(" x y", + " 1 6", + " 2 7", + " 3 8", + " 4 9", + " 5 10", + " --- ---", + " 97 102", + " 98 103", + " 99 104", + " 100 105", + " 101 106")) # test for FR #2591 (format.data.table issue with column of class "formula") DT <- data.table(x=c(a~b, c~d+e), y=1:2) @@ -16492,18 +16504,18 @@ test(2125.02, capture.output(print(DT, trunc.cols=TRUE)), "102: 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable(s) not shown: [d]")) test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), - c(" a b c", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - "--- ", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", - " 0 bbbbbbbbbbbbb ccccccccccccc", + c(" a b c", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb 
ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " --- --- ---", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", + " 0 bbbbbbbbbbbbb ccccccccccccc", "1 variable(s) not shown: [d]" )) # also testing #4266 -- getting width of row #s register right # TODO: understand why 2 variables truncated here. a,b,c combined have width @@ -16512,7 +16524,7 @@ test(2125.03, capture.output(print(DT, trunc.cols=TRUE, row.names=FALSE)), test(2125.04, capture.output(print(DT, trunc.cols=TRUE, class=TRUE))[14L], "2 variable(s) not shown: [c , d ]") test(2125.05, capture.output(print(DT, trunc.cols=TRUE, class=TRUE, row.names=FALSE))[c(1,14)], - c(" a b c", + c(" a b c", "1 variable(s) not shown: [d ]" )) test(2125.06, capture.output(print(DT, trunc.cols=TRUE, col.names="none"))[c(1,12)], c(" 1: 0 bbbbbbbbbbbbb ccccccccccccc", From 454d7f5e7da31ec3c19b70fe9783cc1ba8c5036b Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 8 Apr 2024 22:59:38 -0700 Subject: [PATCH 049/106] Follow-up to PR #6059, added entry to NEWS.md (#6062) * followup to PR #6059, updated NEWS.md * tidy wording --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7bfd9a416..4fa8d699b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,6 +70,8 @@ 10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. +11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). 
Thanks @MichaelChirico for the report and @joshhwuu for the fix. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE From fe202ab6ee58f9f31e5397adf659d0424f113bd9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 07:26:03 -0700 Subject: [PATCH 050/106] Move S4 tests to own script (#6053) * Move S4 tests to own script * working on moving to library() calls instead of '::' * re-order attachment, use pos=, add a note * Move another batch of tests relying on S4 into S4.Rraw * tidying up * Clean-up case of methods not attached when running test.data.table() * ns-qualify capture.output() * tidying * na.omit needn't be qualified --- inst/tests/S4.Rraw | 104 +++++++++++++++++++++++++ inst/tests/tests.Rraw | 177 ++++++++++++++---------------------------- tests/S4.R | 6 ++ 3 files changed, 167 insertions(+), 120 deletions(-) create mode 100644 inst/tests/S4.Rraw create mode 100644 tests/S4.R diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw new file mode 100644 index 000000000..21a7d0a6c --- /dev/null +++ b/inst/tests/S4.Rraw @@ -0,0 +1,104 @@ +search_order <- match(c("package:data.table", "package:methods"), search(), 0L) +if (diff(search_order) < 0L) { + cat("'methods' must be attached before 'data.table' for dispatch to register correctly; quitting\n") + q("no") +} +library(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + library(data.table) + + is_utc = data.table:::is_utc + shallow = data.table:::shallow + test = data.table:::test +} + +tt = Sys.getenv("TZ", unset=NA) +TZnotUTC = !identical(tt,"") && !is_utc(tt) + +suppressWarnings({ + setClass("Data.Table", contains="data.table") # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" + setClass("S4Composition", 
representation(data="data.table")) +}) +# data.table can be a parent class +ids <- sample(letters[1:3], 10, replace=TRUE) +scores <- stats::rnorm(10) +dt <- data.table(id=ids, score=scores) +dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) +test(1.01, isS4(dt.s4)) +test(1.02, inherits(dt.s4, 'data.table')) +# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1.03, isS4(shallow(dt.s4))) +## pull out data from S4 as.list, and compare to list from dt +dt.s4.list <- dt.s4@.Data +names(dt.s4.list) <- names(dt.s4) +test(1.04, dt.s4.list, as.list(dt)) # Underlying data not identical +# simple S4 conversion-isms work +df = data.frame(a=sample(letters, 10), b=1:10) +dt = as.data.table(df) +test(1.05, identical(methods::as(df, 'data.table'), dt)) +test(1.06, identical(methods::as(dt, 'data.frame'), df)) +# data.table can be used in an S4 slot +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +test(1.07, dt.comp@data, dt) +# S4 methods dispatch properly on data.table slots" +dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=stats::rnorm(10)) +dt.comp <- new("S4Composition", data=dt) +setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) +setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) +setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) +test(1.08, dtGet(dt.comp), dt) # actually +test(1.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1.10, identical(dtGet(dt.comp, 'b'), dt$b)) +removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev +removeClass("S4Composition") +# END port of old testthat tests + +# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 [S4 portion, c.f. 
1872.* in tests.Rraw] +## data.table cannot recycle complicated types +short_s4_col = getClass("MethodDefinition") +test(2, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") + +# print dims in list-columns, #3671, c.f. 2130.* in tests.Rraw +s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) +DT = data.table( + x = 1:2, + y = list(s4class(x=1L, y=c("yes", "no"), z=2.5), + s4class(x=2L, y="yes", z=1))) +test(3, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) + +# S4 object not supported in fifelse and fcase, #4135 +class4 = setClass("class4", slots=list(x="numeric")) +s1 = class4(x=20191231) +s2 = class4(x=20191230) +test(4.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(4.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") +test(4.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see") +test(4.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. 
Please see") +rm(s1, s2, class4) + +# native reading of timestamp strings in fread +# NB: S4 required for methods::as() +test(5.1, options=c(datatable.old.fread.datetime.character=TRUE), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03"))) +ans_print = utils::capture.output(print(ans)) +if (TZnotUTC) { + test(5.2, options=list(datatable.old.fread.datetime.character=NULL), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""), + ans, output=ans_print) + test(5.3, options=list(datatable.old.fread.datetime.character=NULL), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""), + data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) +} else { + test(5.4, options=list(datatable.old.fread.datetime.character=NULL), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), + ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print) + test(5.5, options=list(datatable.old.fread.datetime.character=NULL), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), + ans, output=ans_print) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f558ef49..c09d43e90 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1,4 +1,10 @@ -require(methods) +# in order as they're attached in a normal R session, to match that if these actually have an effect, e.g. 
under R_DEFAULT_PACKAGES=NULL +# NB: pos= is required for these symbols to resolve searching 'upward' from data.table -- if these packages are not already attached, +# and we don't use pos=, they'll wind up 'below' data.table on the search() path --> their symbols won't resolve since, when running +# from the installed package, this is evaluated from data.table's namespace. +library(stats, include.only=c("lm", "median", "na.omit", "rnorm", "runif", "sd", "setNames", "var", "weighted.mean"), pos="package:base") +library(utils, include.only=c("capture.output", "combn", "head", "read.csv", "read.delim", "read.table", "tail", "type.convert", "write.csv", "write.table"), pos="package:base") +library(datasets, include.only=c("airquality", "BOD", "cars", "ChickWeight", "CO2", "iris", "mtcars"), pos="package:base") if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { if ((tt<-compiler::enableJIT(-1))>0) @@ -128,6 +134,9 @@ if (!test_longdouble) { # e.g. under valgrind, longdouble.digits==53; causing these to fail: 1262, 1729.04, 1729.08, 1729.09, 1729.11, 1729.13, 1830.7; #4639 } +tt = Sys.getenv("TZ", unset=NA) +TZnotUTC = !identical(tt,"") && !is_utc(tt) + # generate simple error messages from base that are checked against in our tests. this helps # protect us against these messages evolving in base in the future, and against these messages # potentially not being produced in English. 
@@ -160,7 +169,7 @@ base_messages = list( missing_coerce_method = get_msg(delim = '"', { old = options(useFancyQuotes = FALSE) # otherwise we get angled quotes, hard to match robustly on.exit(options(old)) - as(TRUE, 'foo') + methods::as(TRUE, 'foo') }), missing_dispatch_method = get_msg(conditionMessage(structure(1, class="foo")), '[\'"]'), invalid_arg_unary_operator = get_msg(-'a'), @@ -1675,8 +1684,8 @@ test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retain ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) test(538, DT[,sum(v),keyby="a"], ans) -var="a" -test(539, DT[,sum(v),keyby=eval(var)], ans) +byvar="a" +test(539, DT[,sum(v),keyby=eval(byvar)], ans) a=quote(a%%2L) test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) @@ -1779,7 +1788,7 @@ test(584, DT[a<1], output="Empty data.table (0 rows and 2 cols): a,v") test(585, DT[a<1,list(v)], output="Empty data.table (0 rows and 1 cols): v") test(586.1, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows and 2 cols): a,V1") env = environment() -data(iris, package='datasets', envir = env) # in case user has edited iris in their session +utils::data(iris, package='datasets', envir = env) # in case user has edited iris in their session test(586.2, print.data.table(iris[,FALSE]), output="Empty data.frame (150 rows and 0 cols)") #3363 # Test that .N is available in by on empty table, also in #1945 @@ -1889,7 +1898,7 @@ DT$time1 <- Sys.time() # recycle via *tmp* DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp* DT[,time3:=Sys.time()] # recycle DT[,time4:=rep(Sys.time(),5)] # plonk -test(625, all(sapply(DT,is,"POSIXct")[-1])) +test(625, all(sapply(DT, inherits, "POSIXct")[-1])) # unique on ITime doesn't lose attributes, #1719 t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")) @@ -3397,8 +3406,8 @@ test(1064, DT[integer(0), 
list(x2=x), by=x], output="Empty data.table (0 rows an # bug #2445 fix - := fails when subsetting yields NAs and with=FALSE X = data.table(A=1:3, B=1:6, key="A") -var <- "B" -test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) +col <- "B" +test(1065, X[J(2:5), (col):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) # fread single unnamed colClasses f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n" @@ -3468,7 +3477,7 @@ setkey(DT2, p,q) ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] # NB: DT2 contains duplicate key values so columns c ends up not being sorted test(1082.1, key(ans), c("a","b")) test(1082.2, setkeyv(ans, key(ans)), ans) # i.e. key is valid, otherwise re-built warning will be caught -check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b) +check <- setkey(as.data.table(stats::aggregate(r ~a+b+c, ans, length)), a, b) test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check" # Tests for #2531. `:=` loses POSIXct or ITime attribute: @@ -3561,8 +3570,9 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, "5" = as.Date(c(NA, "2014-06-15", "2014-05-18", NA)), "6" = as.Date(c(NA, NA, "2014-06-15", NA)), key="ID")) - names(ChickWeight) <- tolower(names(ChickWeight)) - DT = melt(as.data.table(ChickWeight), id.vars=2:4) # calls melt.data.table + DT = ChickWeight + names(DT) <- tolower(names(DT)) + DT = melt(as.data.table(DT), id.vars=2:4) # calls melt.data.table # changed 'mean' to 'sum' to avoid valgrind floating point precision based error. 
test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) @@ -5532,12 +5542,8 @@ setkey(X, val1) test(1354, X[Y, val2 := i.val2, allow.cartesian=TRUE][, val1 := NULL][order(id)], data.table(id=1:10, val2=as.integer(c(8,7,7,6,8,6,6,7,7,8)))) # Fix for #475, setDT(CO2) should error, as it's trying to modify the object whose binding is locked. -# CO2 is not locked in R 2.14.1 but is in R >= 3.1.0. R NEWS isn't clear when that change happened, so just test there is an error when it is locked. -if (bindingIsLocked("CO2",as.environment("package:datasets"))) { - test(1355.1, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") -} else { - test(1355.2, setDT(CO2), CO2) -} +# NB: requires datasets be attached -- no error thrown on datasets::CO2 or CO2=datasets::CO2 or get("CO2", asNamespace("CO2")) +test(1355, setDT(CO2), error="Cannot convert 'CO2' to data.table by reference because binding is locked.") # Fix for #698. not join doesn't need to check for allow.cartesian=TRUE. 
DT1 <- data.table(x=rep(1:3, each=3L), y=1:9, key="x") @@ -6125,7 +6131,7 @@ test_no = 1394 for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { ans1 = na.omit(DT, cols=cols) - ans2 = DT[complete.cases(DT[, cols, with=FALSE])] + ans2 = DT[stats::complete.cases(DT[, cols, with=FALSE])] test_no <<- test_no+.001 test(test_no, ans1, ans2) 0L @@ -6906,7 +6912,7 @@ test(1475.16, uniqueN(logical(), na.rm=TRUE), 0L) # preserve class attribute in GForce mean (and sum) DT <- data.table(x = rep(1:3, each = 3), y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day"))) -test(1476.1, DT[, .(y=mean(y)), x], setDT(aggregate(y ~ x, DT, mean))) +test(1476.1, DT[, .(y=mean(y)), x], setDT(stats::aggregate(y ~ x, DT, mean))) # test for 'transpose' of a list ll = lapply(1:12, function(x) { @@ -9620,7 +9626,7 @@ test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("p test(1639.138, ans, split(as.data.table(df), by=c("product","year"))) test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE))) # test if split preallocate columns in results #1908 -dt = data.table(x=rexp(100),y=rep(LETTERS[1:10], 10)) +dt = data.table(x=stats::rexp(100),y=rep(LETTERS[1:10], 10)) dtL = split(dt, by = "y") test(1639.140, dim(dtL[[1]][, x2 := -x]), c(10L,3L)) test(1639.141, all(sapply(dtL, truelength) > 1000)) @@ -11043,8 +11049,6 @@ test(1743.241, fread("a,b,c\n2,2,f", colClasses = list(character="c", integer="b test(1743.242, fread("a,b,c\n2,2,f", colClasses = c("integer", "integer", "factor"), drop="a"), data.table(b=2L, c=factor("f"))) ## POSIXct -tt = Sys.getenv("TZ", unset=NA) -TZnotUTC = !identical(tt,"") && !is_utc(tt) if (TZnotUTC) { # from v1.13.0 these tests work when running under non-UTC because they compare to as.POSIXct which reads these unmarked datetime in local # the new tests 2150.* cover more cases @@ -12448,11 +12452,8 @@ for (i in 100:1) { test(1871.2 + (100-i)/1000, 
fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) } -# miscellaneous missing tests uncovered by CodeCov difference -# in the process of PR #2573 -## data.table cannot recycle complicated types -short_s4_col = getClass("MethodDefinition") -test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") +# miscellaneous missing tests uncovered by CodeCov difference in the process of PR #2573 +# 1872.01 moved to S4.Rraw since it uses S4 ## i must be a data.table when on is specified DT = data.table(a = 1:3) test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'") @@ -12965,45 +12966,8 @@ for (col in c('b', 'c')) { test_no = test_no + 0.0001 test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } -# -# tests-S4.R (S4 Compatability) -# -suppressWarnings(setClass("Data.Table", contains="data.table")) # suppress "Created a package name, '2018-05-26 06:14:43.444', when none found" -suppressWarnings(setClass("S4Composition", representation(data="data.table"))) -# data.table can be a parent class -ids <- sample(letters[1:3], 10, replace=TRUE) -scores <- rnorm(10) -dt <- data.table(id=ids, score=scores) -dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1914.01, isS4(dt.s4)) -test(1914.02, inherits(dt.s4, 'data.table')) -# Test possible regression. 
shallow() needs to preserve the S4 bit to support S4 classes that contain data.table -test(1914.03, isS4(shallow(dt.s4))) -## pull out data from S4 as.list, and compare to list from dt -dt.s4.list <- dt.s4@.Data -names(dt.s4.list) <- names(dt.s4) -test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical -# simple S4 conversion-isms work -df = data.frame(a=sample(letters, 10), b=1:10) -dt = as.data.table(df) -test(1914.05, identical(methods::as(df, 'data.table'), dt)) -test(1914.06, identical(methods::as(dt, 'data.frame'), df)) -# data.table can be used in an S4 slot -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -test(1914.07, dt.comp@data, dt) -# S4 methods dispatch properly on data.table slots" -dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) -dt.comp <- new("S4Composition", data=dt) -setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) -setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) -setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1914.08, dtGet(dt.comp), dt) # actually -test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) -removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev -removeClass("S4Composition") -# END port of old testthat tests + +# Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 str = "Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 5.1,3.5,1.4,0.2,setosa @@ -13047,13 +13011,13 @@ test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure( test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) # wrong bmerge result if character gets coerced to factor, i is keyed, and level order in i is different from x, #2881 -iris = data.table(iris) -iris$grp = rep(c('A','B'), 75L) -iris[, Species1 := 
factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] +iris.dt = data.table(iris) +iris.dt$grp = rep(c('A','B'), 75L) +iris.dt[, Species1 := factor(Species, levels=c('setosa','versicolor','virginica'), labels=c('setosa','versicolor','Virginica'))] iSorted = data.table(Species1 = c('setosa','Virginica'), grp='B', key=c("grp","Species1")) i = setkey(copy(iSorted),NULL) -test(1920, iris[iSorted, on = c("grp==grp", 'Species1==Species1')], - iris[i, on = c("grp==grp", 'Species1==Species1')]) +test(1920, iris.dt[iSorted, on = c("grp==grp", 'Species1==Species1')], + iris.dt[i, on = c("grp==grp", 'Species1==Species1')]) # origin= ignored by as.IDate.numeric(), #2880 test(1921.1, as.IDate(1000, origin = "1930-01-01"), as.IDate("1932-09-27")) @@ -13207,7 +13171,7 @@ test(1944.3, DT[flag == 1, sum(x), keyby = group], # should not use index data.table(group=c("A","B"), V1=INT(1,8), key="group")) set.seed(123) N = 10 -DT = data.table(group = rbinom(N, 5, 0.5), x = 1:N, flag = rbinom(N, 1, 0.9)) +DT = data.table(group = stats::rbinom(N, 5, 0.5), x = 1:N, flag = stats::rbinom(N, 1, 0.9)) test(1944.4, DT[flag == 1 & group == 1, x], 6L) test(1944.5, indices(DT), "group__flag") test(1944.6, DT[flag == 1, sum(x), keyby = group], data.table(group=1:4, V1=INT(6,3,18,17), key="group")) @@ -14027,10 +13991,10 @@ DT = data.table(a=1:3, b=4:6, key="a") K = data.table(a=2:3, FOO=9L, BAR=12L) test(1973.1, DT[K, "FOO"], data.table(FOO=c(9L,9L))) test(1973.2, DT[K, "FOO", with=FALSE], data.table(FOO=c(9L,9L))) -var = "b" -test(1973.3, DT[K, c(var, "FOO")], c("b","FOO")) -test(1973.4, DT[K, c(..var, "FOO")], ans<-data.table(b=5:6, FOO=9L)) -test(1973.5, DT[K, c(var, "FOO"), with=FALSE], ans) +col = "b" +test(1973.3, DT[K, c(col, "FOO")], c("b","FOO")) +test(1973.4, DT[K, c(..col, "FOO")], ans<-data.table(b=5:6, FOO=9L)) +test(1973.5, DT[K, c(col, "FOO"), with=FALSE], ans) # no error when j is supplied but inherits missingness from caller DT = 
data.table(a=1:3, b=4:6) @@ -16422,7 +16386,7 @@ test(2119.17, data.table(a=1:2)[, newcol := list(2L, 3L)], ans) # i symbol fetch from calling scope; #3669 iDT = data.table(key = "i_id", i_id = c("A", "B", "C", "D"), - g = state.name[c(1,1,2,3)], + g = c("Alabama", "Alabama", "Alaska", "Arizona"), e_date = as.IDate(c("2019-01-20", "2019-01-20", "2019-01-01", "2019-01-01")), e_time = as.ITime(c("14:00", "20:00", "20:00", "20:00")) ) @@ -16688,12 +16652,7 @@ DT = data.table( y = list(list(x=1, y=c("yes", "no")), list(x=2, y=2))) test(2130.02, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) -s4class = setClass("ex_class", slots = list(x="integer", y="character", z="numeric")) -DT = data.table( - x = 1:2, - y = list(s4class(x=1L, y=c("yes", "no"), z=2.5), - s4class(x=2L, y="yes", z=1))) -test(2130.03, print(DT), output=c(" x y", "1: 1 ", "2: 2 ")) +# test 2130.03 moved to S4.Rraw # format_col and format_list_item printing helpers/generics ## Use case: solve #2842 by defining format_col.POSIXct to have usetz = TRUE @@ -16734,7 +16693,7 @@ registerS3method("format", "myclass2130", format.default) registerS3method("format", "foo2130", format.default) DT = data.table(num = 1:2, - formula = list(as.formula("mpg~cyl")), + formula = list(mpg~cyl), model = list(lm(mpg~cyl, mtcars)), shallow = list(1:3, 4:6), nested = list(list(1:3), list(4:6))) @@ -16759,16 +16718,7 @@ dt = data.table(x = rep(1:3, each = 3), y = runif(9)) out = dt[, list(evaluated = list(f(copy(.SD)))), by = x] test(2131.2, class(out$evaluated[[1L]]), 'environment') -# S4 object not suported in fifelse and fcase, #4135 -class2132 = setClass("class2132", slots=list(x="numeric")) -s1 = class2132(x=20191231) -s2 = class2132(x=20191230) -test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") -test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except 
nanotime) are not supported. Please see") -test(2132.4, fcase(FALSE, 1, TRUE, s1), error = "S4 class objects (except nanotime) are not supported. Please see") -rm(s1, s2, class2132) - +# 2132 tested S4 in fcase()/fifelse() moved to S4.Rraw # 2133 tested xts moved to other.Rraw 20, #5516 # friendlier error for common mistake of using := in i instead of j, #4227 @@ -16974,36 +16924,23 @@ test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2", tz=""), # no # some rows are date-only, some rows UTC-timestamp --> read the date-only in UTC too test(2150.14, fread("a,b\n2015-01-01,1.1\n2015-01-02T01:02:03Z,1.2"), data.table(a = .POSIXct(1420070400 + c(0, 90123), tz="UTC"), b = c(1.1, 1.2))) -old = options(datatable.old.fread.datetime.character=TRUE) -test(2150.15, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03T01:02:03Z"), - data.table(a="2015-01-01", b="2015-01-02", c="2015-01-03T01:02:03Z")) -test(2150.16, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), - ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03"))) -ans_print = capture.output(print(ans)) -options(datatable.old.fread.datetime.character=NULL) -if (TZnotUTC) { - test(2150.17, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""), - ans, output=ans_print) - test(2150.18, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""), - data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) -} else { - test(2150.19, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), - ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print) - test(2150.20, fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), - ans, output=ans_print) -} 
+test(2150.15, options=c(datatable.old.fread.datetime.character=TRUE), + fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03T01:02:03Z"), + data.table(a="2015-01-01", b="2015-01-02", c="2015-01-03T01:02:03Z")) +# tests 2150.{16,17,18,19,20} moved to S4.Rraw because they rely on S4 to dispatch as(., "IDate") correctly # fread single row single column datetime field, #2609 -test(2150.21, fread("c1\n2018-01-31 03:16:57"), data.table(V1=as.IDate("2018-01-31"), c1="03:16:57"), - warning="Detected 1 column names but the data has 2 columns") -test(2150.22, fread("c1\n2018-01-31 03:16:57", sep=""), data.table(c1=as.POSIXct("2018-01-31 03:16:57", tz="UTC"))) -options(old) +test(2150.21, options=list(datatable.old.fread.datetime.character=NULL), + fread("c1\n2018-01-31 03:16:57"), data.table(V1=as.IDate("2018-01-31"), c1="03:16:57"), + warning="Detected 1 column names but the data has 2 columns") +test(2150.22, options=list(datatable.old.fread.datetime.character=NULL), + fread("c1\n2018-01-31 03:16:57", sep=""), data.table(c1=as.POSIXct("2018-01-31 03:16:57", tz="UTC"))) # 1 is treated as . in dcast formula, #4615 DT = data.table(a = c("s", "x"), survmean = 1:2) test(2151, dcast(DT, 1 ~ a, value.var='survmean'), data.table('.'='.', s=1L, x=2L, key='.')) # list object with [[ method that returns itself (e.g. 
person) lead to infinite loop in copy(), #4620 -y = person(given='Joel', family='Mossong') +y = utils::person(given='Joel', family='Mossong') test(2152, copy(y), y) # .N and .GRP special statics copied correctly when placed as a vector in a list column; part of PR#4655 @@ -17310,7 +17247,7 @@ measurev = list("foo", "bar")#measurev below should not use this since it is not test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging") test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2))) test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword -iris.dt = data.table(datasets::iris) +iris.dt = data.table(iris) test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]") test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]") @@ -17341,7 +17278,7 @@ test(2183.09, melt(DTid, measure.vars=structure(1:3, variable_table=data.table(x test(2183.10, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=data.table(x=1))), error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2") test(2183.11, melt(DTid, measure.vars=structure(list(a=1, b=2:3), variable_table=list(x=1:2, y=1))), 
error="variable_table attribute of measure.vars should be a data table with same number of rows as max length of measure.vars vectors =2")#make sure to check each list element, not just the first. # general measure errors. -iris.dt = data.table(datasets::iris) +iris.dt = data.table(iris) test(2183.20, melt(iris.dt, measure.vars=measure(value.name, dim, sep=".", pattern="foo")), error="both sep and pattern arguments used; must use either sep or pattern (not both)") # school example. schools.wide <- data.table( @@ -17380,7 +17317,7 @@ myfac = function(x)factor(x)#user-defined conversion function. test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1))) # measure errors. iris.i <- 1 -iris.num <- datasets::iris[iris.i, 1:4] +iris.num <- iris[iris.i, 1:4] iris.days <- data.table( day1=iris.num, day2=iris.num, Species=iris$Species[iris.i]) test(2183.61, melt(iris.days, measure.vars=measure(before=as.integer, value.name, dim, sep=".")), error="before conversion function returned vector of all NA", warning=base_messages$coerce_na) diff --git a/tests/S4.R b/tests/S4.R new file mode 100644 index 000000000..a2be96984 --- /dev/null +++ b/tests/S4.R @@ -0,0 +1,6 @@ +# NB: methods _has_ to be attached before data.table in order for methods::as() to +# find the right dispatch when trying as(x, "IDate"). This might be an R bug, but +# even running library(methods, pos="package:base") after attaching data.table doesn't work. 
+library(methods) +library(data.table) +test.data.table(script="S4.Rraw") From 3fbef50c9719870c64467ef10b7f1f78e483c5e7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 07:33:16 -0700 Subject: [PATCH 051/106] Suggest Toby as owner for dcast paths too --- CODEOWNERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 23f78180a..fd67c8c08 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,9 +1,12 @@ # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners * @jangorecki @michaelchirico -# melt +# reshaping +/R/fcast.R @tdhock /R/fmelt.R @tdhock +/src/fcast.c @tdhock /src/fmelt.c @tdhock +/man/dcast.data.table.Rd @tdhock /man/melt.data.table.Rd @tdhock /vignettes/datatable-reshape.Rmd @tdhock From b09649bed6b6da1bec6e29378e0418a43155aa78 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 11:08:31 -0700 Subject: [PATCH 052/106] Suppress output of computing 'y' (#6064) --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index e2efe29d9..7cb573748 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -418,7 +418,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } } if (!fail && !length(error) && (!length(output) || !missing(y))) { # TODO test y when output=, too - y = try(y,TRUE) + capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it if (identical(x,y)) return(invisible(TRUE)) all.equal.result = TRUE if (is.data.frame(x) && is.data.frame(y)) { From ac60ca2d21ea950e1eade5c1929ff2037bdacdc2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 13:04:45 -0700 Subject: [PATCH 053/106] Changes to ensure test(number instead of test(name (#6041) Related to #6040 -- it will be good to keep to a pattern where `test()` always has a numeric literal in 
the `num=` argument, even if it's a dynamic test where the base number is incremented by a variable amount. Doing so will make the `testPattern=` argument to `test.data.table()` more usable. We can add a linter for this (#5908) to prevent regression later.
Linter to find these:

```r
l=make_linter_from_xpath(
  "//SYMBOL_FUNCTION_CALL[text() = 'test']/parent::expr/following-sibling::expr[1][SYMBOL or expr[1]/SYMBOL]",
  "xxx")
lint("inst/tests/tests.Rraw", l())
```
--- inst/tests/tests.Rraw | 250 +++++++++++++++++++++--------------------- 1 file changed, 127 insertions(+), 123 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c09d43e90..f1670dd6a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2662,13 +2662,13 @@ for (ne in seq_along(eols)) { lines = capture.output(fwrite(headDT, verbose=FALSE)) cat(paste(lines,collapse=eol), file=f, sep="") # so last line abruptly ends (missing last eol) to test that, otherwise could just pass eol to fwrite # on unix we simulate Windows too. On Windows \n will write \r\n (and \r\n will write \r\r\n) - num = 894 + nr/100 + nc/1000 + ne/10000 + num_major = nr/100 + nc/1000 + ne/10000 # if (isTRUE(all.equal(testIDtail, 0.4103))) browser() - test(num+0.00001, fread(f,na.strings=""), headDT) + test(894+num_major+0.00001, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # now a normal file properly ending with final \n - test(num+0.00002, fread(f,na.strings=""), headDT) + test(894+num_major+0.00002, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # extra \n should be ignored other than for single columns where it is significant - test(num+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) + test(894+num_major+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) unlink(f) }}} if (test_bit64) { @@ -4466,13 +4466,13 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1223.0 +test_no = 0L oldnfail = nfail for (nvars in seq_along(names(DT))) { signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 for (i in seq_len(nrow(signs))) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { if 
(signs[i,j] == 1L) @@ -4485,7 +4485,7 @@ for (nvars in seq_along(names(DT))) { } }) )) - test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) } integer() }) @@ -4617,16 +4617,16 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1246.0 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4645,11 +4645,11 @@ oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + 
test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4734,13 +4734,13 @@ setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") ans = vector("list", length(names(DT))) -test_no = 1252 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) ans[[i]] = combn(names(DT), i, function(x) { tmp = apply(cj, 1, function(y) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("base_order"), lapply(seq_along(x), function(j) { if (y[j] == 1L) @@ -4754,11 +4754,11 @@ for (i in seq_along(names(DT))) { }) )) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA - test(test_no, ans1, with(DT, eval(ll))) - test_no <<- signif(test_no+.001, 7) + test(1252.0 + test_no*0.001, ans1, with(DT, eval(ll))) + test_no <<- test_no + 1L ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. - test(test_no, ans1[ans1 != 0], with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1[ans1 != 0], with(DT, eval(ll))) }) dim(tmp)=NULL list(tmp) @@ -4885,13 +4885,13 @@ setNumericRounding(old_rounding) # distinguishing small numbers from 0.0 as from v1.9.2, test from Rick # http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2 old_rounding = getNumericRounding() -test_no = 1278.001 +test_no = 0L for (dround in c(0,2)) { setNumericRounding(dround) # rounding should not affect the result here because although small, it's very accurace (1 s.f.) 
for (i in c(-30:-1,1:30)) { DT = data.table(c(1 * (10^i),2,9999,-1,0,1)) - test(test_no, nrow(DT[, .N, by=V1]), 6L) - test_no = test_no + 0.001 + test_no = test_no + 1L + test(1278.0 + test_no*0.001, nrow(DT[, .N, by=V1]), 6L) } } setNumericRounding(old_rounding) @@ -5697,7 +5697,7 @@ dt = data.table(AA=sample(c(-2:2), 50, TRUE), DD=sample(c(-2:2), 50, TRUE), EE=sample(as.logical(c(-2:2)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1368.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] for (j in list(TRUE, FALSE, "keep")) { @@ -5716,10 +5716,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=j) r4 = frankv(col, order=-1L, ties.method=k, na.last=j) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r2, r4) } } } @@ -5730,7 +5730,7 @@ dt = data.table(AA=sample(c(-2:2, NA), 50, TRUE), DD=sample(c(-2:2, NA), 50, TRUE), EE=sample(as.logical(c(-2:2, NA)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1369.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] # ensure consistency with base::rank ties.methods as advertised @@ -5748,10 +5748,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=NA) r4 = frankv(col, order=-1L, ties.method=k, na.last=NA) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r2, r4) } } @@ -5767,20 +5767,20 @@ dt = list(AA=sample(c(NA,-2:2), 50, TRUE), DD=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE)) if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]]) -test_no = 1370.0 +test_no = 0L ans = as.list(na.omit(as.data.table(dt))) for (i in seq_along(dt)) { combn(names(dt), i, function(cols) { 
ans1 = is_na(dt[cols]) ans2 = rowSums(is.na(as.data.table(dt[cols]))) > 0L - test_no <<- test_no+.0001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, ans1, ans2) # update: tests for any_na - test_no <<- test_no+.0001 - test(test_no, any_na(dt[cols]), TRUE) - test_no <<- test_no+.0001 - test(test_no, any_na(ans[cols]), FALSE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(dt[cols]), TRUE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(ans[cols]), FALSE) TRUE }) } @@ -5874,7 +5874,7 @@ types=c("any", "within", "start", "end", "equal") # add 'equal' as well mults=c("all", "first", "last") maxgap=-1L; minoverlap=0L # default has changed in IRanges/GenomicRanges :: findOverlaps verbose=FALSE; which=TRUE -test_no = 1372.0 +test_no = 0L load(testDir("test1372.Rdata")) # Regenerated on 17/02/2019 to include type = 'equal'. Var 'ans' has all the results saved by running GenomicRanges separately using code above, is a list with names of the format type_mult_run set.seed(123) this = 1L @@ -5903,11 +5903,11 @@ for (run in seq_len(times)) { # data.table overlap join nomatch = if(mult == "all") NULL else NA_integer_ thisans = foverlaps(i, x, mult=mult, type=type, nomatch=nomatch, which=which, verbose=verbose) - test_no = test_no+.01 + test_no = test_no + 1L # cat("test =", test_no, ", run = ", run, ", type = ", type, ", mult = ", mult, "\n", sep="") idx = paste(type, mult, run, sep="_") # ans[[idx]] contains fo(gr(i), gr(x), type=type, select=mult) - test(test_no, thisans, ans[[idx]]) + test(1372.0 + test_no*0.01, thisans, ans[[idx]]) this = this+1L } } @@ -6127,13 +6127,13 @@ DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as # if (test_bit64) { # DT[, e := as.integer64(sample(col,20,TRUE))] # } -test_no = 1394 +test_no = 0L for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { ans1 = na.omit(DT, cols=cols) ans2 = DT[stats::complete.cases(DT[, cols, 
with=FALSE])] - test_no <<- test_no+.001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1394.0 + test_no*0.001, ans1, ans2) 0L }) } @@ -6509,15 +6509,15 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery))] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery))] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with 'which = TRUE' options("datatable.optimize" = 3L) ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" @@ -6526,8 +6526,8 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) } } } @@ -12905,10 +12905,10 @@ M <- merge(x, y) m <- merge(as.data.frame(x), as.data.frame(y), by="a") test(1913.09, is.data.table(M) && !is.data.table(m)) test(1913.10, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.11 +test_no = 0L for (name in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.11 + test_no*0.0001, M[[name]], m[[name]]) } # # Original example that smoked out the bug @@ -12923,10 +12923,10 @@ for (i in 1:3) { } test(1913.12, is.data.table(M) && !is.data.table(m)) test(1913.13, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.14 +test_no = 0L for (name 
in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.14 + test_no*0.0001, M[[name]], m[[name]]) } # # simple subset maintains keys @@ -12961,10 +12961,10 @@ t2 <- transform(dt, d=c+4, a=sample(c('x', 'y', 'z'), 20, replace=TRUE)) test(1913.23, is.null(key(t2))) # transforming a key column nukes the key ## This is probably not necessary, but let's just check that transforming ## a key column doesn't twist around the rows in the result. -test_no = 1913.24 +test_no = 0L for (col in c('b', 'c')) { - test_no = test_no + 0.0001 - test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns + test_no = test_no + 1L + test(1913.24 + test_no*0.0001, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } # Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 @@ -14695,18 +14695,18 @@ test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data. f = tempfile() for (nNUL in 0:3) { writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f) - test_no = 2025 + (1+nNUL)/10 - test(test_no + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) - test(test_no + .02, fread(f), ans) # auto detect skip and header works too + num_major = (1+nNUL)/10 + test(2025 + num_major + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) + test(2025 + num_major + .02, fread(f), ans) # auto detect skip and header works too writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f) - test(test_no + .03, fread(f, skip=1, header=TRUE), ans) - test(test_no + .04, fread(f), ans) + test(2025 + num_major + .03, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .04, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f) - test(test_no + .05, fread(f, skip=1, header=TRUE), ans) - test(test_no 
+ .06, fread(f), ans) + test(2025 + num_major + .05, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .06, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f) - test(test_no + .07, fread(f, skip=1, header=TRUE), ans) - test(test_no + .08, fread(f), ans) + test(2025 + num_major + .07, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .08, fread(f), ans) } makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, con=f)} # "*" (42) represents NUL makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") @@ -17614,19 +17614,21 @@ EVAL = function(...) { # cat(e,"\n") # uncomment to check the queries tested eval(parse(text=e)) } -testnum = 2211.0 +testnum = 0L for (col in c("a","b","c")) { - testnum = testnum+0.1 + testnum = testnum + 100L for (fi in seq_along(funs)) { if (col=="c" && fi<=6L) next # first 6 funs don't support type character f = funs[fi] - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i - EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i + EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first if (fi<=8L) { - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), - EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), + EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) } } } @@ -17730,7 +17732,7 @@ DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) # shift integer64 when fill isn't integer32, #4865 
-testnum = 2218 +testnum = 0L funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) # when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too # add grouping tests for #5205 @@ -17739,32 +17741,32 @@ options(datatable.optimize = 2L) for (f1 in funs) { DT = data.table(x=f1(1:4), g=g) for (f2 in funs) { - testnum = testnum + 0.001 - test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x)], f1(c(NA, 1:3))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" - test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) + testnum = testnum + 1L if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0")], f1(0:3)) + test(2218.0 + testnum*0.001, DT[, shift(x, fill="0")], f1(0:3)) } else { - test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) } - testnum = testnum + 0.001 - test(testnum, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" f = f2(NA) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) + testnum = testnum + 1L if 
(identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) + test(2218.0 + testnum*0.001, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) } else { f = f2(0) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) } } } @@ -17778,14 +17780,15 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") -testnum = 2220 +testnum = 0L for (fun in funs) { - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } -test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +testnum = testnum + 1L +test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) 
tables())(), output = "No objects of class data.table exist") @@ -18067,19 +18070,18 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -testnum = 2233.39 for (opt in c(0,Inf)) { options(datatable.optimize=opt) out = if (opt) "GForce.*gsum" else "GForce FALSE" B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle - test(testnum+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= + num_bump = (opt>0)/100 + test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) - test(testnum+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) - test(testnum+0.003, any(A[,t1!=t2]), FALSE) - test(testnum+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) - test(testnum+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) - testnum = 2233.40 + test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) + test(2233.39+num_bump+0.003, any(A[,t1!=t2]), FALSE) + test(2233.39+num_bump+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) + test(2233.39+num_bump+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) } # test from #5337 n=4; k=2 @@ -18099,22 +18101,24 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -old = options(datatable.verbose=TRUE) -testnum = 2233.43 -for (opt in c(0,Inf)) { - options(datatable.optimize=opt) - out = if (opt) "GForce.*gsum" else "GForce FALSE" - test(testnum, - copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") - ][, n_idT :=dim(.SD)[[1]], by=list(t, id) - ][, sum_v2_id :=sum(v2), by=.(id) - ][, sum_v1_idT:=sum(v1), by=c("id", "t") - ][, sum_v1_id :=sum(v1), 
by=c("id")], - ans, - output=out) - testnum = 2233.44 -} -options(old) +test(2233.43, + options = list(datatable.verbose=TRUE, datatable.optimize=0), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce FALSE") +test(2233.44, + options = list(datatable.verbose=TRUE, datatable.optimize=Inf), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce.*gsum") # optimized := with gforce functions that can return lists #5403 old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) From 585ec52e28e173c59ae1879d9dc3ade5f9477d95 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 13:06:57 -0700 Subject: [PATCH 054/106] assume PROJ_PATH=. if unset in cc() (#6042) * assume PROJ_PATH=. if unset * ? 
--- .dev/cc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev/cc.R b/.dev/cc.R index f2031ca48..a51021ac7 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -51,7 +51,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { return(invisible()) } -cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) { +cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) { if (!missing(cc_dir)) { warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead") path = cc_dir From 26c558d395a5d700f6b7c49c7af1e46e35c54978 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 18:58:51 -0700 Subject: [PATCH 055/106] Retain earlier dcast(fill=list(...)) behavior relying on base coercion behavior for lists (#6051) * Retain fill=list(...) behavior * refactor to unclutter line for typical usage * test list->int64 coercion too --- inst/tests/tests.Rraw | 7 ++++++- src/fcast.c | 13 ++++++++----- src/utils.c | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f1670dd6a..fc3b1163c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18314,7 +18314,12 @@ test(2247.4, split(dt, ~y+z), list("a.c"=dt[1], "b.c"=dt[2], "a.d"=dt[3], "b.d"= if (test_bit64) { i64v = as.integer64(c(12345678901234, 70, 20, NA)) apple = data.table(id = c("a", "b", "b"), time = c(1L, 1L, 2L), y = i64v[1:3]) - test(2248, dcast(apple, id ~ time, value.var = "y"), data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + test(2248.1, dcast(apple, id ~ time, value.var = "y"), ans<-data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + # associated regression test: downstreams used fill=list() which is not directly supported by coerceAs() + DT = data.table(a=1:2, b=2:3, c=3) + test(2248.2, dcast(DT, a ~ 
b, value.var='c', fill=list(0L)), data.table(a=1:2, `2`=c(3, 0), `3`=c(0, 3), key='a')) + # also ensure list() gets coerced to integer64 correctly + test(2248.3, dcast(apple, id ~ time, value.var = "y", fill=list(NA)), ans) } # Unit tests for DT[, .SD] retaining secondary indices, #1709 diff --git a/src/fcast.c b/src/fcast.c index d049711bf..334dfd7e8 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -21,14 +21,17 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; const SEXPTYPE thistype = TYPEOF(thiscol); int nprotect = 0; - if(some_fill){ + if (some_fill) { if (isNull(fill)) { - if (LOGICAL(is_agg)[0]) { - thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; - } else thisfill = VECTOR_ELT(fill_d, i); + if (LOGICAL(is_agg)[0]) { + thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; + } else + thisfill = VECTOR_ELT(fill_d, i); } if (isVectorAtomic(thiscol)) { // defer error handling to below, but also skip on list - thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; + // #5980: some callers used fill=list(...) and relied on R's coercion mechanics for lists, which are nontrivial, so just dispatch and double-coerce. 
+ if (isNewList(thisfill)) { thisfill = PROTECT(coerceVector(thisfill, TYPEOF(thiscol))); nprotect++; } + thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; } } switch (thistype) { diff --git a/src/utils.c b/src/utils.c index 1fba47cac..e59cc8208 100644 --- a/src/utils.c +++ b/src/utils.c @@ -322,7 +322,7 @@ SEXP coerceUtf8IfNeeded(SEXP x) { return(ans); } -// class1 is used by coerseAs only, which is used by frollR.c and nafill.c only +// class1 is used by coerceAs only, which is used by frollR.c and nafill.c only const char *class1(SEXP x) { SEXP cl = getAttrib(x, R_ClassSymbol); if (length(cl)) From 9aacf3a32857b2c1cb20d8e11d254e8acf9cdc58 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 9 Apr 2024 21:50:05 -0700 Subject: [PATCH 056/106] Allow running a subset of tests by pattern (#6040) * Changes to ensure test(number instead of test(name * remaining tests * swapped outputs * don't need verbose setting anymore * more tests leading with symbol, now literal * Allow running a subset of tests by pattern * working version with static analysis * R CMD check fixes * nocov * Add an escape to back up to the full suite. 
--- R/test.data.table.R | 56 ++++++++++++++++++++++++++++++++++++++++-- man/test.data.table.Rd | 2 ++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 7cb573748..aa1c2c2ea 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,4 +1,4 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, testPattern=NULL, memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) memtest = as.integer(memtest) @@ -38,7 +38,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scripts = scripts[!grepl("bench|other", scripts)] scripts = gsub("[.]bz2$","",scripts) return(sapply(scripts, function(fn) { - err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) + err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress, testPattern=testPattern)) cat("\n"); isTRUE(err) })) @@ -140,6 +140,58 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") } + # nocov start: only used interactively -- "production" suites should always run in full + if (!is.null(testPattern)) { + # due to how non-hermetic our tests are, the simple approach (pass this to test(), return early if 'numStr' matches testPattern) + # does not work, or at least getting it to work is not much more efficient (see initial commit of #6040). 
so instead, + # here we parse the file, extract the tests that match the pattern to a new file, and include other setup lines likely required + # to run the tests successfully. two major drawbacks (1) we can only take a guess which lines are required, so this approach + # can't work (or at least, may need a lot of adjustment) for _every_ test, though not working is also a good sign that test + # should be refactored to be more hermetic (2) not all tests have literal test numbers, meaning we can't always match the + # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the + # full test expression string, i.e., not just limited to numeric literal test numbers. + arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check + pd = setDT(utils::getParseData(parse(fn))) + file_lines = readLines(fn) + # NB: a call looks like (with id/parent tracking) + # + # name + # ( + # ... + # ... + # ) + # + ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call + test_calls = pd[pd[pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], list(call_lhs_id = id, call_id = x.parent), on=c(id='parent')], .(line1, id), on=c(id='call_id')] + ## all the arguments for each call to test() + test_call_args = test_calls[pd[token == 'expr'], .(call_id = parent, arg_line = i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] + ## 2nd argument is the num= argument + test_num_expr = test_call_args[ , .SD[2L], by="call_id"] + # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so + keep_test_ids = test_num_expr[grepl(testPattern, substring(file_lines[arg_line], col1, col2)), call_id] + # Now find all tests just previous to the keep tests; we want to keep non-test setup lines between them, e.g. + # test(drop, ...) 
+ # setup_line1 # retain + # setup_line2 # retain + # test(keep, ...) # retain + intertest_ranges = test_calls[!id %in% keep_test_ids][test_calls[id %in% keep_test_ids], .(preceding_line = x.line1, test_start_line = i.line1), on='line1', roll=TRUE] + # TODO(michaelchirico): this doesn't do well with tests inside control statements. + # those could be included by looking for tests with parent!=0, i.e., not-top-level tests, + # and including the full parent for such tests. omitting for now until needed. + keep_lines = intertest_ranges[, sort(unique(unlist(Map(function(l, u) l:u, preceding_line+1L, test_start_line))))] + header_lines = seq_len(test_calls$line1[1L]-1L) + + tryCatch(error = function(c) warningf("Attempt to subset to %d tests matching '%s' failed, running full suite.", length(keep_test_ids), testPattern), { + new_script = file_lines[c(header_lines, keep_lines)] + parse(text = new_script) # as noted above the static approach is not fool-proof (yet?), so force the script to at least parse before continuing. + fn = tempfile() + on.exit(unlink(fn), add=TRUE) + catf("Running %d of %d tests matching '%s'\n", length(keep_test_ids), nrow(test_calls), testPattern) + writeLines(new_script, fn) + }) + } + # nocov end + err = try(sys.source(fn, envir=env), silent=silent) options(oldOptions) diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index c36e5f9d4..37496fddd 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -8,6 +8,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, showProgress = interactive() && !silent, + testPattern = NULL, memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id = NULL) } @@ -17,6 +18,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. 
Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{testPattern}{ When present, a regular expression tested against the number of each test for inclusion. Useful for running only a small portion of a large test script. } \item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } \item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } From a7a12a93f1c56eae339d74ee82f2f7f2a2ccd6d7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Apr 2024 21:08:34 -0700 Subject: [PATCH 057/106] New option env= for test() (#6072) --- R/test.data.table.R | 14 ++++++++- inst/tests/tests.Rraw | 69 +++++++++++++------------------------------ man/test.Rd | 3 +- 3 files changed, 35 insertions(+), 51 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index aa1c2c2ea..4908f7718 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -303,7 +303,19 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { + if (!is.null(env)) { + old = Sys.getenv(names(env), names=TRUE, unset=NA) + to_unset = !lengths(env) + # NB: Sys.setenv() (no arguments) errors + if (!all(to_unset)) do.call(Sys.setenv, as.list(env[!to_unset])) + Sys.unsetenv(names(env)[to_unset]) + on.exit(add=TRUE, { + is_preset = !is.na(old) + if (any(is_preset)) do.call(Sys.setenv, as.list(old[is_preset])) + Sys.unsetenv(names(old)[!is_preset]) + }) + } if (!is.null(options)) { 
old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience on.exit(options(old_options), add=TRUE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fc3b1163c..078a7d173 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14290,35 +14290,24 @@ test(1997.06, setDTthreads(percent=NULL), error="but is length 0") test(1997.07, setDTthreads(percent=1:2), error="but is length 2") test(1997.08, setDTthreads(restore_after_fork=21), error="must be TRUE, FALSE, or NULL") old = getDTthreads() # (1) -oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") -oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") -Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") -test(1997.09, setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") +test(1997.09, env = c(R_DATATABLE_NUM_THREADS="", R_DATATABLE_NUM_PROCS_PERCENT="3.0"), setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. 
test(1997.10, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1") -test(1997.11, setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") +test(1997.11, env=c(R_DATATABLE_NUM_PROCS_PERCENT="1"), setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") test(1997.12, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -test(1997.13, setDTthreads(), new) +test(1997.13, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75"), setDTthreads(), new) new = getDTthreads() setDTthreads(percent=75) test(1997.14, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="100") -setDTthreads() +test(1997.15, env=c(R_DATATABLE_NUM_PROCS_PERCENT="100"), setDTthreads(), new) allcpu = getDTthreads() -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -Sys.setenv(R_DATATABLE_NUM_THREADS=allcpu) -setDTthreads() -test(1997.15, getDTthreads(), allcpu) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv1) -Sys.setenv(R_DATATABLE_NUM_THREADS=oldenv2) -test(1997.16, setDTthreads(old), allcpu) -test(1997.17, getDTthreads(), old) -test(1997.18, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") +test(1997.16, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75", R_DATATABLE_NUM_THREADS=allcpu), setDTthreads(), allcpu) +test(1997.17, getDTthreads(), allcpu) +test(1997.18, setDTthreads(old), allcpu) +test(1997.19, getDTthreads(), old) +test(1997.20, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") setDTthreads(throttle=65536) -test(1997.19, getDTthreads(TRUE), output="throttle==65536") +test(1997.21, getDTthreads(TRUE), output="throttle==65536") setDTthreads(throttle=1024) # test that a copy is being made and output is printed, #3385 after partial revert of #3281 @@ -16429,14 +16418,9 @@ test(2122.2, DT, data.table(V3=5:6)) dt = data.table(SomeNumberA=c(1,1,1),SomeNumberB=c(1,1,1)) test(2123, dt[, .(.N, TotalA=sum(SomeNumberA), TotalB=sum(SomeNumberB)), 
by=SomeNumberA], data.table(SomeNumberA=1, N=3L, TotalA=1, TotalB=3)) -# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085 -oldtz=Sys.getenv('TZ', unset=NA) -Sys.setenv(TZ='Asia/Jakarta') # UTC+7 -t0 = as.POSIXct('2019-10-01') -test(2124.1, format(as.ITime(t0)), '00:00:00') -test(2124.2, format(as.IDate(t0)), '2019-10-01') -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) -# careful to unset because TZ="" means UTC whereas unset TZ means local, #4261 and #4464 +# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085, #4261, #4464 +test(2124.1, env=c(TZ='Asia/Jakarta'), format(as.ITime(as.POSIXct('2019-10-01'))), '00:00:00') +test(2124.2, env=c(TZ='Asia/Jakarta'), format(as.IDate(as.POSIXct('2019-10-01'))), '2019-10-01') # trunc.cols in print.data.table, #4074 old_width = options("width" = 40L) @@ -16799,20 +16783,12 @@ if (.Platform$OS.type=="windows") local({ ) x_old = Map(Sys.getlocale, names(x)) invisible(Map(Sys.setlocale, names(x), x)) - old = Sys.getenv('LANGUAGE') - Sys.setenv('LANGUAGE' = 'zh_CN') - on.exit({ - if (nzchar(old)) - Sys.setenv('LANGUAGE' = old) - else - Sys.unsetenv('LANGUAGE') - invisible(Map(Sys.setlocale, names(x_old), x_old)) - }, add = TRUE) + on.exit(Map(Sys.setlocale, names(x_old), x_old)) # triggered segfault here in #4402, Windows-only under translation. # test that the argument order changes correctly (the 'item 2' moves to the beginning of the message) # since the argument order changes in this example (and that was the crash) we don't need to test # the display of the Chinese characters here. Thanks to @shrektan for all his help on this. 
- test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") + test(2143, env=c(LANGUAGE='zh_CN'), rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") }) # test back to English (the argument order is back to 1,c,2,1) test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") @@ -16871,18 +16847,13 @@ tmp = tempfile() fwrite(DT, tmp) test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker -oldtz = Sys.getenv("TZ", unset=NA) -Sys.unsetenv("TZ") -test(2150.021, sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character -test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC -Sys.setenv(TZ="UTC") -test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized +test(2150.021, env=list(TZ=NULL), sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character +test(2150.022, env=list(TZ=NULL), fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC +test(2150.023, env=c(TZ='UTC'), fread(tmp), DT) # TZ environment variable is also recognized if (.Platform$OS.type!="windows") { - Sys.setenv(TZ="") # on Windows this unsets TZ, see ?Sys.setenv - test(2150.024, fread(tmp), DT) + test(2150.024, env=c(TZ=''), fread(tmp), DT) # on Windows this unsets TZ, see ?Sys.setenv # blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local } -Sys.unsetenv("TZ") # Notes: # - from v1.14.0 tz="" needed # - as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) @@ -16891,11 +16862,11 @@ Sys.unsetenv("TZ") # as.POSIXct() failure means 'times' 
is returned as a character, hence no 'tzone' attribute. # fread() will also throw a warning, one substring of which will be the reproduced base R error. test(2150.025, + env=list(TZ=NULL), attr(fread(tmp, colClasses=list(POSIXct="times"), tz="")$times, "tzone"), if (is.null(base_messages$maybe_invalid_old_posixct)) "" else NULL, warning=base_messages$maybe_invalid_old_posixct) # the times will be different though here because as.POSIXct read them as local time. -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp) test(2150.03, fread(tmp), DT) fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp) diff --git a/man/test.Rd b/man/test.Rd index ddf1198bf..d264d98af 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL) + options = NULL, env = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -21,6 +21,7 @@ test(num, x, y = TRUE, \item{notOutput}{ Or if you are testing that a feature does \emph{not} print particular console output. 
Case insensitive (unlike output) so that the test does not incorrectly pass just because the string is not found due to case. } \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to `test()` (usually, `x`, or maybe `y`) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } +\item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. 
From fa6204d35cc77731291a1a0b4574cf5e8e695f60 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Apr 2024 21:34:12 -0700 Subject: [PATCH 058/106] Clean up getRversion overwrite to avoid error (#6073) --- .dev/cc.R | 1 + 1 file changed, 1 insertion(+) diff --git a/.dev/cc.R b/.dev/cc.R index a51021ac7..28f398f16 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly") return(invisible()) } + suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports))) for (ii in seq_along(nsParsedImports)) { From 94e4be689733ea50b0a0cae5e13e97efd0d4255b Mon Sep 17 00:00:00 2001 From: Anirban Date: Wed, 10 Apr 2024 22:58:22 -0700 Subject: [PATCH 059/106] Added my workflow (Marketplace version) and the two tests I used in the examples --- .github/workflows/autocomment.yml | 21 ++++++++++ inst/atime/tests.R | 69 +++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 .github/workflows/autocomment.yml create mode 100644 inst/atime/tests.R diff --git a/.github/workflows/autocomment.yml b/.github/workflows/autocomment.yml new file mode 100644 index 000000000..94a906a36 --- /dev/null +++ b/.github/workflows/autocomment.yml @@ -0,0 +1,21 @@ +name: Autocomment atime-based performance regression analysis on PRs + +on: + pull_request: + branches: + - '*' + types: + - opened + - reopened + - synchronize + +jobs: + comment: + runs-on: ubuntu-latest + container: ghcr.io/iterative/cml:0-dvc2-base1 + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + repo_token: ${{ 
secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + steps: + - uses: Anirban166/Autocomment-atime-results@v1.1.6 \ No newline at end of file diff --git a/inst/atime/tests.R b/inst/atime/tests.R new file mode 100644 index 000000000..7095ae350 --- /dev/null +++ b/inst/atime/tests.R @@ -0,0 +1,69 @@ +pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { + pkg_find_replace <- function(glob, FIND, REPLACE) { + atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) + } + Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE) + Package_ <- gsub(".", "_", old.Package, fixed = TRUE) + new.Package_ <- paste0(Package_, "_", sha) + pkg_find_replace( + "DESCRIPTION", + paste0("Package:\\s+", old.Package), + paste("Package:", new.Package)) + pkg_find_replace( + file.path("src", "Makevars.*in"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + sprintf('packageVersion\\("%s"\\)', old.Package), + sprintf('packageVersion\\("%s"\\)', new.Package)) + pkg_find_replace( + file.path("src", "init.c"), + paste0("R_init_", Package_regex), + paste0("R_init_", gsub("[.]", "_", new.Package_))) + pkg_find_replace( + "NAMESPACE", + sprintf('useDynLib\\("?%s"?', Package_regex), + paste0('useDynLib(', new.Package_)) + } + +test.list <- list( + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 + "Test regression fixed in #4440" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3,8), + setup = quote({ + set.seed(1L) + dt <- data.table(a = sample(N, N)) + setindex(dt, a) + }), + expr = quote(data.table:::shallow(dt)), + "Before" = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This should be changed later. Currently, the source of regression (or the particular commit that led to it) is not clear. 
In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + "Regression" = "752012f577f8e268bb6d0084ca39a09fa7fbc1c4", # A commit that is affected by the regression: https://github.com/Rdatatable/data.table/commit/752012f577f8e268bb6d0084ca39a09fa7fbc1c4 + "Fixed" = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # The merge commit in #4440, the PR that fixed the regression: https://github.com/Rdatatable/data.table/commit/9d3b9202fddb980345025a4f6ac451ed26a423be + + # Test based on: https://github.com/Rdatatable/data.table/issues/5424 + # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + "Test regression fixed in #5463" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3, 8), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + setup = quote({ + n <- N/100 + set.seed(1L) + dt <- data.table( + g = sample(seq_len(n), N, TRUE), + x = runif(N), + key = "g") + dt_mod <- copy(dt) + }), + "Before" = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # The commit in PR #4491 that comes before the regression introducting commit: https://github.com/Rdatatable/data.table/pull/4491/commits/be2f72e6f5c90622fe72e1c315ca05769a9dc854 + "Regression" = "e793f53466d99f86e70fc2611b708ae8c601a451", # The commit in #4491 that introduced the regression: https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451 + "Fixed" = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in #5463, the PR that fixed the regression: https://github.com/Rdatatable/data.table/pull/5463/commits/58409197426ced4714af842650b0cc3b9e2cb842 +) \ No newline at end of file From 2de34e729b355614c6742ad0cb5fc5311d8bf779 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: 
Thu, 11 Apr 2024 07:25:29 -0700 Subject: [PATCH 060/106] Add a deep-but-infrequent GHA (#6076) --- .github/workflows/R-CMD-check-occasional.yaml | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 .github/workflows/R-CMD-check-occasional.yaml diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml new file mode 100644 index 000000000..1358f0538 --- /dev/null +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -0,0 +1,96 @@ +on: + schedule: + - cron: '18 13 8 * *' # 8th of month at 13:18 UTC + +# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release +name: R-CMD-check-occasional + +jobs: + R-CMD-check-occasional: + runs-on: ${{ matrix.os }} + + name: ${{ matrix.os }} (${{ matrix.r }}) + + strategy: + matrix: + os: [macOS-latest, windows-latest, ubuntu-latest] + r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3'] + locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502) + exclude: + - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu + locale: ['zh_CN.utf8', 'lv_LV.utf8'] + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Set locale + if: matrix.locale == 'en_US.utf8' + run: | + sudo locale-gen en_US + echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'zh_CN.utf8' + run: | + sudo locale-gen zh_CN + echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV + echo "LANGUAGE=zh_CN" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'lv_LV.utf8' + run: | + sudo locale-gen lv_LV + echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV + echo "LANGUAGE=lv_LV" >> $GITHUB_ENV + + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r }} + + + - name: Query dependencies + run: | + 
install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.r }}-results + path: check From 317139752a219140709b0250ef27df9e824bfb15 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Thu, 11 Apr 2024 07:55:48 -0700 Subject: [PATCH 061/106] Use %s in 'should be TRUE or FALSE' messages (#6075) * refactor should be TF messages * revert R-level message * consistency * changed msg in nafill, changed test 2003.2 to align with consistency --------- Co-authored-by: Michael Chirico --- inst/tests/tests.Rraw | 2 +- src/fastmean.c | 2 +- src/nafill.c | 2 +- src/rbindlist.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 078a7d173..405ccd0a0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14366,7 +14366,7 @@ test(2002.12, rbind(DT1, DT2, 
idcol='id'), data.table(id=integer(), a=logica #rbindlist coverage test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") -test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE") +test(2003.2, rbindlist(list(), fill=1), error="fill should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4))) test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), diff --git a/src/fastmean.c b/src/fastmean.c index 2fcc6ebd2..1c9b3eb64 100644 --- a/src/fastmean.c +++ b/src/fastmean.c @@ -36,7 +36,7 @@ SEXP fastmean(SEXP args) if (length(args)>2) { tmp = CADDR(args); if (!isLogical(tmp) || LENGTH(tmp)!=1 || LOGICAL(tmp)[0]==NA_LOGICAL) - error(_("narm should be TRUE or FALSE")); // # nocov ; [.data.table should construct the .External call correctly + error(_("%s should be TRUE or FALSE"), "narm"); // # nocov ; [.data.table should construct the .External call correctly narm=LOGICAL(tmp)[0]; } PROTECT(ans = allocNAVector(REALSXP, 1)); diff --git a/src/nafill.c b/src/nafill.c index 03aa6d091..5fe81933d 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -100,7 +100,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S bool binplace = LOGICAL(inplace)[0]; if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error(_("nan_is_na must be TRUE or FALSE")); // # nocov + error(_("%s must be TRUE or FALSE"), "nan_is_na"); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; SEXP x = R_NilValue; diff --git a/src/rbindlist.c b/src/rbindlist.c index ba19d2c38..d8cd32476 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -5,7 +5,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) { if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL) - error(_("fill= should be TRUE or FALSE")); + error(_("%s should be 
TRUE or FALSE"), "fill"); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA if (!length(l)) return(l); From 0711e4004d48db749fa39dd407c87bb06726b1c0 Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Thu, 11 Apr 2024 22:02:00 +0530 Subject: [PATCH 062/106] Added skip_absent argument to colnamesInt() (#6068) * Added skip_absent argument to colnamesInt() * Update NEWS.md * Update NEWS.md * Update utils.c * Update utils.c * Update utils.c * Update utils.c * Update utils.c * added test * Update src/nafill.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Update src/utils.c Co-authored-by: Michael Chirico * Implemented suggestions * small fix * Update utils.c * minor issues * restore comment for now * Update nafill.Rraw * adjusted any colno. > ncol to 0L * Added test and changed reference to deep copy * annotate test purpose * More careful about when duplicate() is needed * refine comment * whitespace * Add a new test against duplicates for numeric input * update last test number --------- Co-authored-by: nitish jha Co-authored-by: Michael Chirico --- R/wrappers.R | 2 +- inst/tests/nafill.Rraw | 16 +++++++++++++++- src/data.table.h | 2 +- src/nafill.c | 2 +- src/utils.c | 27 +++++++++++++++++++-------- 5 files changed, 37 insertions(+), 12 deletions(-) diff --git a/R/wrappers.R b/R/wrappers.R index dcf8ba08e..a018b91ae 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -8,7 +8,7 @@ setcoalesce = function(...) 
.Call(Ccoalesce, list(...), TRUE) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) -colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) +colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index b72c0b506..cf65f61bf 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -149,8 +149,22 @@ test(4.20, colnamesInt(dt, integer()), integer()) test(4.21, colnamesInt(dt, NULL), seq_along(dt)) test(4.22, colnamesInt("asd", 1), error="must be data.table compatible") test(4.23, colnamesInt(dt, 1, check_dups="a"), error="check_dups") +test(4.24, colnamesInt(dt, c("a", "e"), skip_absent=TRUE), c(1L,0L)) +test(4.25, colnamesInt(dt, c(1L, 4L), skip_absent=TRUE), c(1L,0L)) +test(4.26, colnamesInt(dt, c(1, 4), skip_absent=TRUE), c(1L,0L)) +test(4.27, colnamesInt(dt, c("a", NA), skip_absent=TRUE), c(1L,0L)) +test(4.28, colnamesInt(dt, c(1L, 0L), skip_absent=TRUE), error="received non-existing column*.*0") +test(4.29, colnamesInt(dt, c(1, -5), skip_absent=TRUE), error="received non-existing column*.*-5") +test(4.30, colnamesInt(dt, c(1, 4), skip_absent=NULL), error="skip_absent must be TRUE or FALSE") +test(4.31, colnamesInt(dt, c(1L, 1000L), skip_absent=TRUE), c(1L,0L)) +cols=c(1L,100L) +test(4.32, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.33, cols, c(1L, 100L)) # ensure input was not overwritten with output 0 +cols=c(1,100) +test(4.34, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.35, cols, c(1, 100)) # ensure input was not overwritten with output 0 names(dt) <- NULL -test(4.24, colnamesInt(dt, "a"), error="has 
no names") +test(4.36, colnamesInt(dt, "a"), error="has no names") # verbose dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) diff --git a/src/data.table.h b/src/data.table.h index 21b7e30e0..297167d46 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -238,7 +238,7 @@ bool isRealReallyInt(SEXP x); SEXP isRealReallyIntR(SEXP x); SEXP isReallyReal(SEXP x); bool allNA(SEXP x, bool errorForBadType); -SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups); +SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); SEXP copyAsPlain(SEXP x); void copySharedColumns(SEXP x); diff --git a/src/nafill.c b/src/nafill.c index 5fe81933d..8d50f32ea 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -114,7 +114,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list SET_VECTOR_ELT(obj, 0, obj1); } - SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) + SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; int *icols = INTEGER(ricols); for (int i=0; inx) || (icols[i]<1)) + for (int i=0; inx) || (icols[i]<1)) error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also + else if(bskip_absent && icols[i]>nx) + icols[i] = 0L; } } else if (isString(cols)) { SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++; @@ -133,9 +142,11 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { error(_("'x' argument data.table has no names")); ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++; int *icols = INTEGER(ricols); - for (int i=0; i Date: Thu, 11 Apr 2024 09:40:56 -0700 Subject: 
[PATCH 063/106] Automatic detection of dec (. or ,) (#4482) * initial progress on automatic dec=, detection * if sep=, detected, turn off auto-dec * first pass at NEWS and man * add comments, tests * improve man * add verbose output, tests --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/fread.R | 5 +++-- inst/tests/tests.Rraw | 36 +++++++++++++++++++++++-------- man/fread.Rd | 8 +++---- src/fread.c | 50 ++++++++++++++++++++++++++++++++++++++----- src/fread.h | 2 ++ src/freadR.c | 5 +++-- 7 files changed, 86 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4fa8d699b..27c35e385 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. +9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. 
Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fread.R b/R/fread.R index b2e55403d..66bda3fb1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ fread = function( -input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", +input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, @@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (sep=="auto") sep="" # sep=="" at C level means auto sep else stopifnot( nchar(sep)==1L ) # otherwise an actual character to use as sep } - stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L ) + stopifnot( is.character(dec), length(dec)==1L) + if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L) # handle encoding, #563 if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 405ccd0a0..d3a0e37e8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2681,15 +2681,13 @@ if (test_bit64) { test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) - DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] - DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] - DT[,r:=a/100][,r2:=gsub(" ","",format(r))] - DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 - DT[113, a3:=3.14] - DT[114, a4:="123A"] - DT[115, b2:=1234567890123.45] - DT[116, 
b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) - DT[117, r2:="3.14A"] + DT[ , a2 := as.integer64(a)][112L, a2 := as.integer64(12345678901234)] # start on row 112 to avoid the first 100 + DT[ , a3 := as.double(a) ][113L, a3 := 3.14] + DT[ , a4 := as.character(a)][114L, a4 := "123A"] + DT[ , b2 := as.double(b) ][115L, b2 := 1234567890123.45] + DT[ , b3 := as.character(b)][116L, b3 := "12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) + DT[ , r := a/100] + DT[ , r2 := as.character(r)][117L, r2 := "3.14A"] fwrite(DT,f<-tempfile()) test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE), @@ -18432,3 +18430,23 @@ DF <- structure( ) test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5") + +# automatic detection of dec=',' for #2431 +DT = data.table(a = letters, b = 1:26/6, c = 1:26) +## auto-detect dec=',' +fwrite(DT, f <- tempfile(), dec=',', sep=';') +test(2256.1, fread(f), DT) + +fwrite(DT, f, dec=',', sep='|') +test(2256.2, fread(f), DT) + +## auto-detect dec='.' 
+fwrite(DT, f) +test(2256.3, fread(f), DT) + +## verbose output +test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") + +fwrite(DT, f, dec=',', sep=';') +test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") +test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") diff --git a/man/fread.Rd b/man/fread.Rd index 49b187364..d397a441d 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -9,7 +9,7 @@ \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. } \usage{ -fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"", +fread(input, file, text, cmd, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), # due to change to ""; see NEWS stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), @@ -47,7 +47,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code. } \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } - \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." 
(default) then usually ",". See details. } + \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } @@ -79,9 +79,9 @@ If an empty line is encountered then reading stops there with warning if any tex \bold{Line endings:} All known line endings are detected automatically: \code{\\n} (*NIX including Mac), \code{\\r\\n} (Windows CRLF), \code{\\r} (old Mac) and \code{\\n\\r} (just in case). There is no need to convert input files first. \code{fread} running on any architecture will read a file from any architecture. Both \code{\\r} and \code{\\n} may be embedded in character strings (including column names) provided the field is quoted. -\bold{Decimal separator and locale:} \code{fread(\dots,dec=",")} should just work. \code{fread} uses C function \code{strtod} to read numeric data; e.g., \code{1.23} or \code{1,23}. \code{strtod} retrieves the decimal separator (\code{.} or \code{,} usually) from the locale of the R session rather than as an argument passed to the \code{strtod} function. 
So for \code{fread(\dots,dec=",")} to work, \code{fread} changes this (and only this) R session's locale temporarily to a locale which provides the desired decimal separator. +\bold{Decimal separator:} \code{dec} is used to parse numeric fields as the separator between integral and fractional parts. When \code{dec='auto'}, during column type detection, when a field is a candidate for being numeric (i.e., parsing as lower types has already failed), \code{dec='.'} is tried, and, if it fails to create a numeric field, \code{dec=','} is tried. At the end of the sample lines, if more were successfully parsed with \code{dec=','}, \code{dec} is set to \code{','}; otherwise, \code{dec} is set to \code{'.'}. -On Windows, "French_France.1252" is tried which should be available as standard (any locale with comma decimal separator would suffice) and on unix "fr_FR.utf8" (you may need to install this locale on unix). \code{fread()} is very careful to set the locale back again afterwards, even if the function fails with an error. The choice of locale is determined by \code{options()$datatable.fread.dec.locale}. This may be a \emph{vector} of locale names and if so they will be tried in turn until the desired \code{dec} is obtained; thus allowing more than two different decimal separators to be selected. This is a new feature in v1.9.6 and is experimental. In case of problems, turn it off with \code{options(datatable.fread.dec.experiment=FALSE)}. +Automatic detection of \code{sep} occurs \emph{prior} to column type detection -- as such, it is possible that \code{sep} has been inferred to be \code{','}, in which case \code{dec} is set to \code{'.'}. 
\bold{Quotes:} diff --git a/src/fread.c b/src/fread.c index a1521fb37..e2602e596 100644 --- a/src/fread.c +++ b/src/fread.c @@ -33,6 +33,7 @@ static const char *sof, *eof; static char sep; static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t') static char quote, dec; +static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.' static bool eol_one_r; // only true very rarely for \r-only files // Quote rule: @@ -1206,11 +1207,16 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped skip_white(&ch); if (eol(&ch)) return 0; // empty line int field=0; + const bool autoDec = dec == '\0'; while (field>(%d)"), strlim(ch,20), quoteRule); skip_white(&ch); const char *fieldStart = ch; while (tmpType[field]<=CT_STRING) { + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '\0') { // guess . first + dec = '.'; + } + fun[tmpType[field]](&fctx); if (end_of_field(ch)) break; skip_white(&ch); @@ -1234,9 +1240,19 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped } } ch = fieldStart; + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try , + dec = ','; + continue; + } while (++tmpType[field]=eof) break; // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want. 
bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines int jumpLine = 0; // line from this jump point start + linesForDecDot = 0; while(ch0, apply the bumps (if any) at the end of the successfully completed jump sample ASSERT(jump>0, "jump(%d)>0", jump); @@ -1906,7 +1936,17 @@ int freadMain(freadMainArgs _args) { if (args.header==NA_BOOL8) { for (int j=0; j0) for (int j=0; jCT_EMPTY) { args.header=true; diff --git a/src/fread.h b/src/fread.h index 1e2783643..89dea2592 100644 --- a/src/fread.h +++ b/src/fread.h @@ -36,6 +36,8 @@ typedef enum { NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds } colType; +#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters + extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; extern const long double pow10lookup[301]; diff --git a/src/freadR.c b/src/freadR.c index 97fbfadac..035c76eda 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -102,9 +102,10 @@ SEXP freadR( error(_("Internal error: freadR sep not a single character. R level catches this.")); // # nocov args.sep = CHAR(STRING_ELT(sepArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level - if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1)) + if (!isString(decArg) || LENGTH(decArg)!=1 || strlen(CHAR(STRING_ELT(decArg,0)))>1) { error(_("Internal error: freadR dec not a single character. 
R level catches this.")); // # nocov - args.dec = CHAR(STRING_ELT(decArg,0))[0]; + } + args.dec = CHAR(STRING_ELT(decArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level if (IS_FALSE(quoteArg)) { args.quote = '\0'; From 523e3cc90553060f072e647a4ac68e921deb89b3 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 12:12:03 -0700 Subject: [PATCH 064/106] Renamed workflow as per Toby's suggestion --- .github/workflows/{autocomment.yml => performance-tests.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{autocomment.yml => performance-tests.yml} (87%) diff --git a/.github/workflows/autocomment.yml b/.github/workflows/performance-tests.yml similarity index 87% rename from .github/workflows/autocomment.yml rename to .github/workflows/performance-tests.yml index 94a906a36..027854faa 100644 --- a/.github/workflows/autocomment.yml +++ b/.github/workflows/performance-tests.yml @@ -18,4 +18,4 @@ jobs: repo_token: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: - - uses: Anirban166/Autocomment-atime-results@v1.1.6 \ No newline at end of file + - uses: Anirban166/Autocomment-atime-results@v1.1.6 From e07565ceb79fa2153b903aa3440dd0e27adf57c0 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 17:24:46 -0700 Subject: [PATCH 065/106] Made the suggested changes --- inst/atime/tests.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 7095ae350..94844b7b3 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -43,9 +43,9 @@ test.list <- list( setindex(dt, a) }), expr = quote(data.table:::shallow(dt)), - "Before" = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This should be changed later. Currently, the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. 
(This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) - "Regression" = "752012f577f8e268bb6d0084ca39a09fa7fbc1c4", # A commit that is affected by the regression: https://github.com/Rdatatable/data.table/commit/752012f577f8e268bb6d0084ca39a09fa7fbc1c4 - "Fixed" = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # The merge commit in #4440, the PR that fixed the regression: https://github.com/Rdatatable/data.table/commit/9d3b9202fddb980345025a4f6ac451ed26a423be + Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression + Fixed = "769f02c6fbbb031391a79f46c6042de99f1ea712"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440/commits) # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 @@ -63,7 +63,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - "Before" = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # The commit in PR #4491 that comes before the regression introducting commit: https://github.com/Rdatatable/data.table/pull/4491/commits/be2f72e6f5c90622fe72e1c315ca05769a9dc854 - 
"Regression" = "e793f53466d99f86e70fc2611b708ae8c601a451", # The commit in #4491 that introduced the regression: https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451 - "Fixed" = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in #5463, the PR that fixed the regression: https://github.com/Rdatatable/data.table/pull/5463/commits/58409197426ced4714af842650b0cc3b9e2cb842 -) \ No newline at end of file + Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Fixed = "1e03fe7b890e63da9651d997ea52548c90b3ae32") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5493/commits) +) From a3d5cf938d6831cda483c79d7b6e4194865dde7c Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 18:10:32 -0700 Subject: [PATCH 066/106] Reverted changes to the 'Fixed' commit SHA for the first test case since the last commit of #4440 failed to check out --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 94844b7b3..e023eec21 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -45,7 +45,7 @@ test.list <- list( expr = quote(data.table:::shallow(dt)), Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. 
In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression - Fixed = "769f02c6fbbb031391a79f46c6042de99f1ea712"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440/commits) + Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 From 3093a35db700441e0cb2b3fca5a67f36a17ddce1 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:11:03 -0700 Subject: [PATCH 067/106] Reverted changes to the 'Fixed' commit SHA for the second test case as well since the newly provided commit SHA is wrong --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index e023eec21..6f38660b4 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -65,5 +65,5 @@ test.list <- list( }), Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue 
(https://github.com/Rdatatable/data.table/pull/5493/commits) - Fixed = "1e03fe7b890e63da9651d997ea52548c90b3ae32") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5493/commits) + Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 48b2dddc551b7a2708cc3f6de061bcabf26513b9 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:29:09 -0700 Subject: [PATCH 068/106] Added the suggested path filters --- .github/workflows/performance-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 027854faa..0473dcbc2 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -8,6 +8,9 @@ on: - opened - reopened - synchronize + paths: + - 'R/**' + - 'src/**' jobs: comment: From fb8ca6f696624cee3a13e02cf69356341c6d7763 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 19:42:50 -0700 Subject: [PATCH 069/106] Don't need R to retain the source code attributes when parsing and saving functions --- .github/workflows/performance-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 0473dcbc2..9c8cc664f 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -19,6 +19,5 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} repo_token: ${{ secrets.GITHUB_TOKEN }} - R_KEEP_PKG_SOURCE: yes steps: - uses: Anirban166/Autocomment-atime-results@v1.1.6 From cd0331ff7942e3dce4b39a6c68deefd8e406fbfb Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 21:28:50 -0700 Subject: [PATCH 070/106] Added pseudo-roxygen style comments for the pkg.edit.fun function. 
--- inst/atime/tests.R | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 6f38660b4..cdbc7326e 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -1,3 +1,26 @@ +# A function to customize R package metadata and source files to facilitate version-specific installation and testing. +# +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in `Makevars` and version checking code in `onLoad.R`) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. +# It appends a SHA1 hash to the package name (`PKG.SHA`), ensuring each version can be installed and tested separately. +# +# @param old.Package Current name of the package. +# @param new.Package New name of the package, including a SHA hash. +# @param sha SHA1 hash used for differentiating versions. +# @param new.pkg.path Path to the package files. +# +# @details +# The function modifies: +# - DESCRIPTION, updating the package name. +# - Makevars, customizing the shared object file name and adjusting the build settings. +# - R/onLoad.R, adapting custom version checking for package loading operations. +# - NAMESPACE, changing namespace settings for dynamic linking. +# +# @examples +# pkg.edit.fun("data.table", "data.table.some_40_digit_SHA1_hash", "some_40_digit_SHA1_hash", "/path/to/data.table") +# +# @return None (performs in-place file modifications) +# @note This setup is typically unnecessary for most packages but essential for `data.table` due to its unique configuration. 
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { pkg_find_replace <- function(glob, FIND, REPLACE) { atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) From a11a2e2840c4a3b8e84c639b24ae777ea62ae74c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Apr 2024 22:49:00 -0700 Subject: [PATCH 071/106] spell out "significant figures" (#6081) --- man/setNumericRounding.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/setNumericRounding.Rd b/man/setNumericRounding.Rd index 87ce2256b..f9e00de27 100644 --- a/man/setNumericRounding.Rd +++ b/man/setNumericRounding.Rd @@ -18,8 +18,8 @@ Computers cannot represent some floating point numbers (such as 0.6) precisely, using base 2. This leads to unexpected behaviour when joining or grouping columns of type 'numeric'; i.e. 'double', see example below. In cases where this is undesirable, data.table allows rounding such data up to -approximately 11 s.f. which is plenty of digits for many cases. This is -achieved by rounding the last 2 bytes off the significand. Other possible +approximately 11 significant figures which is plenty of digits for many cases. +This is achieved by rounding the last 2 bytes off the significand. Other possible values are 1 byte rounding, or no rounding (full precision, default). 
It is bytes rather than bits because it is tied in with the radix sort From 716da67b6045bf57a8049102020d6cd2f2a7e033 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 11 Apr 2024 22:55:33 -0700 Subject: [PATCH 072/106] Made the suggested changes (and reverted to the correct commits for the second test case) --- inst/atime/tests.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index cdbc7326e..a02f07b52 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -86,7 +86,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - Before = "19b7866112614db53eb3e909c097407d91cd6738", # Parent of the regression commit (https://github.com/Rdatatable/data.table/commit/0895fa247afcf6b38044bd5f56c0d209691ddb31), which is the parent of the first commit in the PR that causes the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) - Regression = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # The regression commit is the parent of the first commit in the PR that fixed the issue (https://github.com/Rdatatable/data.table/pull/5493/commits) + Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit (https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 047c90f0d6674a18a2a31b33d4a0521f182bf83a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 00:03:06 -0700 Subject: [PATCH 073/106] regression test for #1873 (#6080) --- inst/tests/tests.Rraw | 4 
++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d3a0e37e8..d33bd72a6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18450,3 +18450,7 @@ test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") fwrite(DT, f, dec=',', sep=';') test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") + +# helpful error about deleting during grouping, #1873 +DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) +test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") From b5e1bc79353838ec0810626b25a946d0e89ee3ce Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 08:09:30 -0700 Subject: [PATCH 074/106] clean up style, R CMD check issues in testPattern= code (#6084) --- R/test.data.table.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 4908f7718..748e09512 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -151,7 +151,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the # full test expression string, i.e., not just limited to numeric literal test numbers. 
arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check - pd = setDT(utils::getParseData(parse(fn))) + pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE))) file_lines = readLines(fn) # NB: a call looks like (with id/parent tracking) # @@ -162,9 +162,15 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # ) # ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call - test_calls = pd[pd[pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], list(call_lhs_id = id, call_id = x.parent), on=c(id='parent')], .(line1, id), on=c(id='call_id')] + test_calls = pd[ + pd[ + pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], + list(call_lhs_id=id, call_id=x.parent), + on=c(id='parent')], + list(line1, id), + on=c(id='call_id')] ## all the arguments for each call to test() - test_call_args = test_calls[pd[token == 'expr'], .(call_id = parent, arg_line = i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] + test_call_args = test_calls[pd[token == 'expr'], list(call_id=parent, arg_line=i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] ## 2nd argument is the num= argument test_num_expr = test_call_args[ , .SD[2L], by="call_id"] # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so @@ -174,7 +180,11 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # setup_line1 # retain # setup_line2 # retain # test(keep, ...) 
# retain - intertest_ranges = test_calls[!id %in% keep_test_ids][test_calls[id %in% keep_test_ids], .(preceding_line = x.line1, test_start_line = i.line1), on='line1', roll=TRUE] + intertest_ranges = test_calls[!id %in% keep_test_ids][ + test_calls[id %in% keep_test_ids], + list(preceding_line=x.line1, test_start_line=i.line1), + on='line1', + roll=TRUE] # TODO(michaelchirico): this doesn't do well with tests inside control statements. # those could be included by looking for tests with parent!=0, i.e., not-top-level tests, # and including the full parent for such tests. omitting for now until needed. From fde7f43fbf7a8a6b52aee56e2dd41303bd2239f5 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 12 Apr 2024 09:02:31 -0700 Subject: [PATCH 075/106] link measure from the list of valid measure.vars values --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 6dd74291d..53919d359 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -27,7 +27,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. 
} + \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } From 90abe53b9f29e9f28e2891337d0c5a05e083fa34 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 09:11:33 -0700 Subject: [PATCH 076/106] missing '}' --- man/melt.data.table.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 53919d359..44954e34c 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -27,7 +27,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). 
+ \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). } } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } From e78341e53934850d65054386d418e57bdbe4f280 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 13:54:51 -0700 Subject: [PATCH 077/106] Use options= to avoid failing to reset verbose (#6088) --- inst/tests/tests.Rraw | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d33bd72a6..886c1e635 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18232,17 +18232,20 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -old = options(datatable.optimize=1L) -test(2243.41, dt[, .I[TRUE], x]$V1, 1:4) -test(2243.42, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) -options(datatable.optimize=2L, datatable.verbose=TRUE) -test(2243.51, dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.52, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.54, dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.55, dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") -options(old) +test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) +test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], 
data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) +test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") From 79a7f3555aafd827a2e8f41902daa642cd661eab Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 12 Apr 2024 16:58:35 -0700 Subject: [PATCH 078/106] measure supports cols arg (#6077) * measure supports cols arg * line break to avoid too wide NOTE * measure.vec.i for patterns * test numbering * test measure(pattern,cols) together * rm lcols * measure supports cols arg --- NEWS.md | 2 ++ R/fmelt.R | 18 ++++++++++-------- inst/tests/tests.Rraw | 7 +++++++ man/melt.data.table.Rd | 5 +++++ 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 27c35e385..3d5b2f81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ 9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. 
See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. +10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fmelt.R b/R/fmelt.R index 23f07c552..5c50ca26c 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("pattern must be character string") } match.vec = regexpr(pattern, cols, perl=TRUE) - measure.vec = which(0 < match.vec) - if (length(measure.vec) == 0L) { + measure.vec.i = which(0 < match.vec) + if (length(measure.vec.i) == 0L) { stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern") } - start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE] if (is.null(start)) { stopf("pattern must contain at least one capture group (parenthesized sub-pattern)") } err.args.groups("number of capture groups in pattern", ncol(start)) - end = attr(match.vec, "capture.length")[measure.vec,]+start-1L - names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) + end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L + measure.vec <- cols[measure.vec.i] + names.mat = matrix(measure.vec, nrow(start), 
ncol(start)) substr(names.mat, start, end) } else { #pattern not specified, so split using sep. if (!is.character(sep)) { @@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } err.args.groups("max number of items after splitting column names", n.groups) - measure.vec = which(vector.lengths==n.groups) - do.call(rbind, list.of.vectors[measure.vec]) + measure.vec.i = which(vector.lengths==n.groups) + measure.vec = cols[measure.vec.i] + do.call(rbind, list.of.vectors[measure.vec.i]) } - err.names.unique("measured columns", cols[measure.vec]) + err.names.unique("measured columns", measure.vec) uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 886c1e635..977b29b5c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3157,6 +3157,13 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) + # measure supports cols arg, #5063 + expected_without_value = data.table(num_1=1,num_2=2,prefix="list",char=c("1","3"),value=list(1,3)) + test(1035.0188, melt(DT_missing_l_2, measure.vars=measure(prefix, char, sep="_", 
cols=c("list_1","list_3"))), expected_without_value) + test(1035.0189, melt(DT_missing_l_2, measure.vars=measure(prefix, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_without_value) + expected_with_value = data.table(num_1=1,num_2=2,char=c("1","3"),list=list(1,3)) + test(1035.0190, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_", cols=c("list_1","list_3"))), expected_with_value) + test(1035.0191, melt(DT_missing_l_2, measure.vars=measure(value.name, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_with_value) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 44954e34c..ad4dfd8dd 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -154,6 +154,11 @@ melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, sep="_ # specifying columns to melt via regex. melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="(.)_(.)")) +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="([dc])_(.)")) + +# cols arg of measure can be used if you do not want to use regex +melt(DT.missing.cols, measure.vars=measure( + value.name, number=as.integer, sep="_", cols=c("d_1","d_2","c_1"))) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} From 33736725e8d1e48552248f5f8d63628b88ae913a Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:24:27 -0700 Subject: [PATCH 079/106] Documented test.list, made some formatting edits to what I documented yesterday (tick removals), added a link to the related atime vignette, tried to write in as much detail as I understand and included optional parameters for test.list --- inst/atime/tests.R | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index a02f07b52..3ef0f8bd7 100644 --- 
a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -1,8 +1,8 @@ # A function to customize R package metadata and source files to facilitate version-specific installation and testing. # -# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in `Makevars` and version checking code in `onLoad.R`) +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) # to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. -# It appends a SHA1 hash to the package name (`PKG.SHA`), ensuring each version can be installed and tested separately. +# It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. # # @param old.Package Current name of the package. # @param new.Package New name of the package, including a SHA hash. @@ -17,10 +17,10 @@ # - NAMESPACE, changing namespace settings for dynamic linking. # # @examples -# pkg.edit.fun("data.table", "data.table.some_40_digit_SHA1_hash", "some_40_digit_SHA1_hash", "/path/to/data.table") +# pkg.edit.fun("data.table", "data.table.some_SHA1_hash", "some_SHA1_hash", "/path/to/data.table") # # @return None (performs in-place file modifications) -# @note This setup is typically unnecessary for most packages but essential for `data.table` due to its unique configuration. +# @note This setup is typically unnecessary for most packages but essential for data.table due to its unique configuration. 
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { pkg_find_replace <- function(glob, FIND, REPLACE) { atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) @@ -54,6 +54,22 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { paste0('useDynLib(', new.Package_)) } +# A list of performance tests. +# +# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: +# - N: A numeric sequence of data sizes to vary. +# - setup: An expression evaluated for every data size before measuring time/memory. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# This must call a function from data.table using a syntax with double or triple colon prefix. +# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) +# +# Optional parameters that may be useful to configure tests: +# - times: Number of times each expression is evaluated (default is 10). +# - seconds.limit: The maximum median timing (in seconds) of an expression. No timings for larger N are computed past that threshold. +# - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. +# For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). +# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. 
test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 From a65b08873d3145a0576a545a358d0e1e9f8b69d0 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:25:47 -0700 Subject: [PATCH 080/106] Temporarily removing the path filters to run a final check for the current commits to be working (tested locally, but just to ensure..) --- .github/workflows/performance-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 9c8cc664f..5750bf209 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -7,10 +7,7 @@ on: types: - opened - reopened - - synchronize - paths: - - 'R/**' - - 'src/**' + - synchronize jobs: comment: From 1d151e0cd63970fcd6db2a1e31f2a575a8c46459 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 20:46:15 -0700 Subject: [PATCH 081/106] Added back the path filters now that I've confirmed the commit SHAs are working as expected.
--- .github/workflows/performance-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 5750bf209..2fc3a76f5 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -7,7 +7,10 @@ on: types: - opened - reopened - - synchronize + - synchronize + paths: + - 'R/**' + - 'src/**' jobs: comment: From 7268eff60180dac38ecc4b079021a020d3e82db3 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Fri, 12 Apr 2024 21:16:56 -0700 Subject: [PATCH 082/106] Add "na.print" as a new argument to "print.data.table" (#6087) * Added naprint argument to print.data.table * Added corresponding documentation * Simple tests * changed tests, added for when quote=true * Update man/print.data.table.Rd Co-authored-by: Michael Chirico * updated NEWS.md * added tests * review suggestions --------- Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ R/print.data.table.R | 9 +++++---- inst/tests/tests.Rraw | 17 +++++++++++++++++ man/print.data.table.Rd | 2 ++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3d5b2f81c..bc5147107 100644 --- a/NEWS.md +++ b/NEWS.md @@ -76,6 +76,8 @@ 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. +12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. 
+ # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/print.data.table.R b/R/print.data.table.R index dd641f946..7f351fd8d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), print.keys=getOption("datatable.print.keys"), trunc.cols=getOption("datatable.print.trunc.cols"), quote=FALSE, + na.print=NULL, timezone=FALSE, ...) { # topn - print the top topn and bottom topn rows with '---' inbetween (5) # nrows - under this the whole (small) table is printed, unless topn is provided (100) @@ -118,9 +119,9 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), } rownames(toprint) = format(rownames(toprint), justify="right") if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) } else { - print(toprint, right=TRUE, quote=quote) + print(toprint, right=TRUE, quote=quote, na.print=na.print) } if (trunc.cols && length(not_printed) > 0L) # prints names of variables not shown in the print @@ -133,9 +134,9 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # option to shut this off per request of Oleg Bondar on SO, #1482 toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) + cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) } else { - print(toprint, right=TRUE, quote=quote) + print(toprint, right=TRUE, quote=quote, na.print=na.print) } if (trunc.cols && length(not_printed) > 0L) # prints names of variables not shown in the print diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 977b29b5c..28532eb59 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18464,3 +18464,20 @@ test(2256.6, 
fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), outpu # helpful error about deleting during grouping, #1873 DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") + +# testing printing data.tables with na.print, #3152 +DT = data.table(x=c(NA, "a", "b")) +test(2258.1, capture.output(print(DT, na.print=".")), c(" x", "1: .", "2: a", "3: b")) +test(2258.2, capture.output(print(DT, na.print="_")), c(" x", "1: _", "2: a", "3: b")) +test(2258.3, capture.output(print(DT, na.print="NA")), c(" x", "1: NA", "2: a", "3: b")) +test(2258.4, capture.output(print(DT, na.print=TRUE)), error="invalid 'na.print' specification") +test(2258.5, capture.output(print(DT, na.print=".", quote=TRUE)), c(' "x"', "1: .", '2: "a"', '3: "b"')) +test(2258.6, capture.output(print(DT, na.print=".", right=TRUE)), c(" x", "1: .", "2: a", "3: b")) +# tests for other call sites +# col.names="none" +test(2258.7, capture.output(print(DT, na.print=".", col.names="none")), c("1: .", "2: a", "3: b")) +# table requires splitting, col.names="none" +DT = data.table(x = c(NA, "e", "b", "j", "w", NA)) +test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), c(" 1: .", " 2: e", "--- ", " 5: w", " 6: .")) +# table requires splitting, col.names!="none" +test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index a39c8c446..f740de9d9 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -26,6 +26,7 @@ print.keys=getOption("datatable.print.keys"), # default: TRUE trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, + na.print=NULL, timezone=FALSE, \dots) format_col(x, \dots) @@ -47,6 +48,7 @@ \item{trunc.cols}{ If \code{TRUE}, only the columns that can be printed in the console without wrapping the 
columns to new lines will be printed (similar to \code{tibbles}). } \item{quote}{ If \code{TRUE}, all output will appear in quotes, as in \code{print.default}. } \item{timezone}{ If \code{TRUE}, time columns of class POSIXct or POSIXlt will be printed with their timezones (if attribute is available). } + \item{na.print}{ The string to be printed in place of \code{NA} values, as in \code{print.default}. } \item{\dots}{ Other arguments ultimately passed to \code{format}. } } \value{ From 27adaad7fb55687df4920a458564817d50f30564 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 22:29:06 -0700 Subject: [PATCH 083/106] Made the changes Michael suggested --- inst/atime/tests.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 3ef0f8bd7..83bb34ecf 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -79,7 +79,7 @@ test.list <- list( setup = quote({ set.seed(1L) dt <- data.table(a = sample(N, N)) - setindex(dt, a) + setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. 
(This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) @@ -92,16 +92,16 @@ test.list <- list( "Test regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), - expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), setup = quote({ n <- N/100 - set.seed(1L) + set.seed(2L) dt <- data.table( g = sample(seq_len(n), N, TRUE), x = runif(N), key = "g") dt_mod <- copy(dt) }), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit (https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) From eaa70106493be5ad4b82052af82f64321889e652 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 12 Apr 2024 22:30:25 -0700 Subject: [PATCH 084/106] Oops forgot one Co-authored-by: Michael Chirico --- inst/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 83bb34ecf..68770347f 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -78,7 +78,7 @@ test.list <- list( N = 10^seq(3,8), setup = quote({ set.seed(1L) - dt <- data.table(a = sample(N, N)) + dt <- data.table(a = sample.int(N)) setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), From c823c615dd31237e3525da75ff8f3e9fef9c9016 Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 16 Apr 2024 13:37:48 -0700 Subject: [PATCH 085/106] Made the suggested 
changes that Toby and I discussed this morning --- inst/atime/tests.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/atime/tests.R b/inst/atime/tests.R index 68770347f..a0635d063 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -82,7 +82,7 @@ test.list <- list( setindexv(dt, "a") }), expr = quote(data.table:::shallow(dt)), - Before = "9d3b9202fddb980345025a4f6ac451ed26a423be", # This needs to be changed later. Currently assigned to the merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) as the source of regression (or the particular commit that led to it) is not clear. In addition, older versions of data.table are having problems when being installed in this manner. (This includes commits from before Mar 20, 2020 or when the issue that discovered or first mentioned the regression was created) + # Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created) Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) @@ -102,7 +102,7 @@ test.list <- list( dt_mod <- copy(dt) }), expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), - Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Commit preceding the regression causing commit 
(https://github.com/Rdatatable/data.table/pull/4491/commits/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) From 47a7f52222ee1a7062701d637a346480d838ef37 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Tue, 16 Apr 2024 17:44:57 -0700 Subject: [PATCH 086/106] Refactor calls to "print.default" within "print.data.table" (#6091) * refactor calls to print.default * better approach? 
* refactor prints using internal helper * review change * brace not needed --------- Co-authored-by: Michael Chirico --- R/print.data.table.R | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/R/print.data.table.R b/R/print.data.table.R index 7f351fd8d..9e33e0c4d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -110,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # When nrow(toprint) = 1, attributes get lost in the subset, # function below adds those back when necessary toprint = toprint_subset(toprint, cols_to_print) + trunc.cols <- length(not_printed) > 0L + } + print_default = function(x) { + if (col.names != "none") cut_colnames = identity + cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print)) + # prints names of variables not shown in the print + if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names) } if (printdots) { if (isFALSE(row.names)) { @@ -118,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) } rownames(toprint) = format(rownames(toprint), justify="right") - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote, na.print=na.print)) - } else { - print(toprint, right=TRUE, quote=quote, na.print=na.print) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + print_default(toprint) return(invisible(x)) } if (nrow(toprint)>20L && col.names == "auto") # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them # option to shut this off per request of Oleg Bondar on SO, #1482 - toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, 
quote=quote, na.print=na.print)) - } else { - print(toprint, right=TRUE, quote=quote, na.print=na.print) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 + print_default(toprint) invisible(x) } From c639a003454ad25b69824d9edd4b327bfec50be7 Mon Sep 17 00:00:00 2001 From: Anirban Date: Thu, 18 Apr 2024 12:48:38 -0700 Subject: [PATCH 087/106] Move tests to .ci, updated workflow version to accommodate that change, added a test, renamed test case titles --- {inst => .ci}/atime/tests.R | 20 ++++++++++++++++---- .github/workflows/performance-tests.yml | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) rename {inst => .ci}/atime/tests.R (86%) diff --git a/inst/atime/tests.R b/.ci/atime/tests.R similarity index 86% rename from inst/atime/tests.R rename to .ci/atime/tests.R index a0635d063..8092a6a9c 100644 --- a/inst/atime/tests.R +++ b/.ci/atime/tests.R @@ -73,7 +73,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "Test regression fixed in #4440" = list( + "Test performance regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3,8), setup = quote({ @@ -88,8 +88,8 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 - # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "Test regression fixed in #5463" = list( + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + "Test performance regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), setup = 
quote({ @@ -104,5 +104,17 @@ test.list <- list( expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) - Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) + Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) + + # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 + # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 + "Test performance improvement implemented in #5427" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(1, 7), + setup = quote({ + DT = replicate(N, 1, simplify = FALSE) + }), + expr = quote(data.table:::setDT(DT)), + Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) + Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) ) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 2fc3a76f5..15b5ece28 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -20,4 +20,4 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} repo_token: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: 
Anirban166/Autocomment-atime-results@v1.1.6 + - uses: Anirban166/Autocomment-atime-results@v1.2.0 From 0c0ce61c9a0c70e787e53b52cf326bb07a623d3f Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 18 Apr 2024 13:58:49 -0700 Subject: [PATCH 088/106] Added to the path filter to trigger the action on changes to the tests --- .github/workflows/performance-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 15b5ece28..1cbc38f9e 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -10,7 +10,8 @@ on: - synchronize paths: - 'R/**' - - 'src/**' + - 'src/**' + - '.ci/atime/tests.R' jobs: comment: From 28483e8f8f184ab04dbf3804ef0d70c8ebf0143a Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 18 Apr 2024 14:01:15 -0700 Subject: [PATCH 089/106] Avoid repetition in test case titles --- .ci/atime/tests.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 8092a6a9c..b16eb2204 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -73,7 +73,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "Test performance regression fixed in #4440" = list( + "Regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3,8), setup = quote({ @@ -89,7 +89,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "Test performance regression fixed in #5463" = list( + "Regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), setup = quote({ @@ 
-108,7 +108,7 @@ test.list <- list( # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 - "Test performance improvement implemented in #5427" = list( + "Improvement implemented in #5427" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(1, 7), setup = quote({ From a59ad449e67208efbdb53ea59140e1a39927cff1 Mon Sep 17 00:00:00 2001 From: Ani Date: Thu, 18 Apr 2024 14:49:31 -0700 Subject: [PATCH 090/106] Update .github/workflows/performance-tests.yml Co-authored-by: Michael Chirico --- .github/workflows/performance-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml index 1cbc38f9e..cb30e7625 100644 --- a/.github/workflows/performance-tests.yml +++ b/.github/workflows/performance-tests.yml @@ -11,7 +11,7 @@ on: paths: - 'R/**' - 'src/**' - - '.ci/atime/tests.R' + - '.ci/atime/**' jobs: comment: From 304aed5697aaf30fc0bd20ca5e35fe8b285eff82 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 09:17:36 -0700 Subject: [PATCH 091/106] sep= works with by= approach to splitting (#6028) --- NEWS.md | 2 ++ R/data.table.R | 6 ++++-- inst/tests/tests.Rraw | 5 +++++ man/split.Rd | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc5147107..e30849114 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,8 @@ 10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. +11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). + ## BUG FIXES 1. 
`unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 89309e58b..e0cddb38f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2452,9 +2452,11 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR dtq[["i"]] = quote(levs) join = TRUE } + dots = list(...) + if (!"sep" %chin% names(dots)) dots$sep = "." dtq[["j"]] = substitute( - list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=".")), - list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD")) + list(.ll.tech.split=list(.expr), .ll.tech.split.names=paste(lapply(.BY, as.character), collapse=.sep)), + list(.expr = if (join) quote(if(.N == 0L) .SD[0L] else .SD) else as.name(".SD"), .sep = dots$sep) ) dtq[["by"]] = substitute( # retain order, for `join` and `sorted` it will use order of `i` data.table instead of `keyby`. .expr, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 28532eb59..e00c4ac6e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18481,3 +18481,8 @@ DT = data.table(x = c(NA, "e", "b", "j", "w", NA)) test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), c(" 1: .", " 2: e", "--- ", " 5: w", " 6: .")) # table requires splitting, col.names!="none" test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) + +# split(by = ., sep = ..) 
works like split(f= ., sep = ..), #5417 +x = data.table(rep(1:2, each=5L), 1:5, 1:10) +test(2259.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) +test(2259.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) diff --git a/man/split.Rd b/man/split.Rd index 687771f0c..eedbe7d67 100644 --- a/man/split.Rd +++ b/man/split.Rd @@ -18,7 +18,7 @@ \item{sorted}{When default \code{FALSE} it will retain the order of groups we are splitting on. When \code{TRUE} then sorted list(s) are returned. Does not have effect for \code{f} argument.} \item{keep.by}{logical default \code{TRUE}. Keep column provided to \code{by} argument.} \item{flatten}{logical default \code{TRUE} will unlist nested lists of data.tables. When using \code{f} results are always flattened to list of data.tables.} - \item{\dots}{passed to data.frame way of processing when using \code{f} argument.} + \item{\dots}{When using \code{f}, passed to \code{\link[base:split]{split.data.frame}}. When using \code{by}, \code{sep} is recognized as with the default method.} \item{verbose}{logical default \code{FALSE}. When \code{TRUE} it will print to console data.table split query used to split data.} } \details{ From b9d51f1034cc0fdd06d324e62bdd05b9dcb84cef Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 09:32:26 -0700 Subject: [PATCH 092/106] Tests robust to locale sorting (#6074) * Tests robust to locale sorting * NEWS * Comment describing helper * add a TODO for our future selves --- NEWS.md | 4 +++- inst/tests/tests.Rraw | 53 +++++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index e30849114..b14b9491c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -74,7 +74,9 @@ 9. 
`print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. -10. `test.data.table()` runs correctly in more sessions, in particular those where the `digits` or `warn` settings are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. +10. `test.data.table()` runs robustly: + + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. + + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e00c4ac6e..e2791ed5d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -192,6 +192,16 @@ base_messages = list( NULL ) +# Ensure an operation uses C-locale sorting (#3502). For test set-ups/comparisons that use base operations, which are +# susceptible to locale-specific sorting issues, but shouldn't be needed for data.table code, which always uses C sorting. 
+# TODO(R>=3.3.0): use order(method="radix") as a way to avoid needing this helper +with_c_collate = function(expr) { + old = Sys.getlocale("LC_COLLATE") + on.exit(Sys.setlocale("LC_COLLATE", old)) + Sys.setlocale("LC_COLLATE", "C") + expr +} + ########################## .do_not_rm = ls() # objects that exist at this point should not be removed by rm_all(); e.g. test_*, base_messages, Ctest_dt_win_snprintf, prevtest, etc ########################## @@ -1834,10 +1844,10 @@ test(609, chorder(character()), base::order(character())) test(610, chorder(""), base::order("")) # Extra tests of chorder and chgroup x = sample(LETTERS) -test(610.1, chorder(x), base::order(x)) +test(610.1, chorder(x), with_c_collate(base::order(x))) test(610.2, chgroup(x), seq_along(x)) x = sample(LETTERS,1000,replace=TRUE) -test(610.3, chorder(x), base::order(x)) +test(610.3, chorder(x), with_c_collate(base::order(x))) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group @@ -3612,34 +3622,37 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.12, dcast(DT, "a ~ c ", value.var="b"), error="not found or of unknown type") test(1102.13, dcast(DT, a ~ a, value.var="c"), error="are not found in 'data'") + # NB: for 1102.{14,15,16}, always supply levels for letters in setup data for locale robustness (#3502) + # fix for #47 - issue when factor columns on formula LHS along with `drop=FALSE` set.seed(1L) - DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[1:5]), + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.14, dcast(DT, a~b, drop=FALSE, fun.aggregate=length, value.var="b"), - data.table(a=factor(letters[1:5]), v=INT(0,1,0,0,0), w=INT(1,1,1,0,0), x=INT(0,0,1,0,0), y=INT(2,1,1,0,0), z=INT(0,1,0,0,0), key="a")) + data.table(a=factor(letters[1:5], 
levels=letters[1:5]), v=INT(0,1,0,0,0), w=INT(1,1,1,0,0), x=INT(0,0,1,0,0), y=INT(2,1,1,0,0), z=INT(0,1,0,0,0), key="a")) # reverse the levels set.seed(1L) - DT = data.table(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[5:1]), - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[5:1]), + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.15, dcast(DT, a~b, drop=FALSE, value.var="b", fun.aggregate=length), - data.table(a=factor(c("e","d","c","b","a"),levels=levels(DT$a)), v=INT(0,0,0,1,0), w=INT(0,0,1,1,1), x=INT(0,0,1,0,0), y=INT(0,0,1,1,2), z=INT(0,0,0,1,0), key="a")) + data.table(a=factor(c("e","d","c","b","a"), levels=levels(DT$a)), v=INT(0,0,0,1,0), w=INT(0,0,1,1,1), x=INT(0,0,1,0,0), y=INT(0,0,1,1,2), z=INT(0,0,0,1,0), key="a")) # more factor cols set.seed(1L) - DT = data.table(a1=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), # factor col 1 - a2=factor(sample(letters[6:10], 10, replace=TRUE), letters[6:10]), # factor col 2 - a3=sample(letters[1:3], 10, TRUE), # no factor - b=factor(sample(tail(letters, 5), 10, replace=TRUE))) + DT = data.table(a1=factor(sample(letters[1:3], 10L, replace=TRUE), levels=letters[1:5]), # factor col 1 + a2=factor(sample(letters[6:10], 10L, replace=TRUE), levels=letters[6:10]), # factor col 2 + a3=sample(letters[1:3], 10L, TRUE), # no factor + b=factor(sample(letters[22:26], 10L, replace=TRUE), levels=letters[22:26])) test(1102.16, dcast(DT, a1+a2+a3~b, drop=FALSE, value.var="b")[c(1,21,.N)], - data.table(a1=factor(c("a","b","e"),levels=letters[1:5]), + data.table(a1=factor(c("a","b","e"), levels=letters[1:5]), a2=factor(c("f","g","j"), levels=letters[6:10]), a3=c("a","c","c"), - v=factor(NA, levels=tail(letters,5)), - x=factor(NA, levels=tail(letters,5)), - y=factor(c(NA,"y",NA), levels=tail(letters,5)), - z=factor(NA, levels=tail(letters,5)), key=c("a1", "a2", "a3"))) + v=factor(NA, 
levels=letters[22:26]), + w=factor(NA, levels=letters[22:26]), + x=factor(NA, levels=letters[22:26]), + y=factor(c(NA,"y",NA), levels=letters[22:26]), + z=factor(NA, levels=letters[22:26]), key=c("a1", "a2", "a3"))) # dcast bug fix for 'subset' argument (it doesn't get key set before to run C-fcast): DT = data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA)) @@ -4490,7 +4503,7 @@ for (nvars in seq_along(names(DT))) { } }) )) - test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with_c_collate(with(DT, eval(ll)))) } integer() }) @@ -4759,11 +4772,11 @@ for (i in seq_along(names(DT))) { }) )) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA - test(1252.0 + test_no*0.001, ans1, with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1, with_c_collate(with(DT, eval(ll)))) test_no <<- test_no + 1L ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. 
- test(1252.0 + test_no*0.001, ans1[ans1 != 0], with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1[ans1 != 0], with_c_collate(with(DT, eval(ll)))) }) dim(tmp)=NULL list(tmp) From 6f3fc8dcd37ac6976050b290e37f85762d0bccb5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 11:04:15 -0700 Subject: [PATCH 093/106] New regression test re: column plonk in magrittr pipeline (#6090) * new regression test re: column plonk in magrittr pipeline * simplify --- inst/tests/other.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 88593bcdf..99169809f 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -722,3 +722,8 @@ if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure } } +if (loaded[["dplyr"]]) { + # regression test for converting character->list column in a magrittr (dplyr) pipe, #2651 + DT = data.table(a = 1, b = 2, c = '1,2,3,4]', d = 4) + test(30, DT[, c := strsplit(c, ',', fixed = TRUE) %>% lapply(as.integer) %>% as.list]$c, list(1:4)) +} From 54d6b966ada6950d8977fa80628e2c0dc727c31d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 11:14:04 -0700 Subject: [PATCH 094/106] Expand exclusion array (which isn't valid) (#6096) --- .github/workflows/R-CMD-check-occasional.yaml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml index 1358f0538..cd0fec0bd 100644 --- a/.github/workflows/R-CMD-check-occasional.yaml +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -16,9 +16,15 @@ jobs: os: [macOS-latest, windows-latest, ubuntu-latest] r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3'] locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502) - exclude: - - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI 
on Ubuntu - locale: ['zh_CN.utf8', 'lv_LV.utf8'] + exclude: # only run non-English locale CI on Ubuntu + - os: macOS-latest + locale: 'zh_CN.utf8' + - os: macOS-latest + locale: 'lv_LV.utf8' + - os: windows-latest + locale: 'zh_CN.utf8' + - os: windows-latest + locale: 'lv_LV.utf8' env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 8ae1b2d343258e22bd81dfb5bb411eb28b387d5c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 13:12:25 -0700 Subject: [PATCH 095/106] Be sure to test 'other' in occasional suite (#6095) --- .github/workflows/R-CMD-check-occasional.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml index cd0fec0bd..9a5f48277 100644 --- a/.github/workflows/R-CMD-check-occasional.yaml +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -28,6 +28,7 @@ jobs: env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + TEST_DATA_TABLE_WITH_OTHER_PACKAGES: true GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: From d420afe916aa78c41f8d9857630f8c4de8abf537 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Apr 2024 18:44:47 -0700 Subject: [PATCH 096/106] class= argument for condition calls (#5914) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 1.15.0 on CRAN. 
Bump to 1.15.99 * Fix transform slowness (#5493) * Fix 5492 by limiting the costly deparse to `nlines=1` * Implementing PR feedbacks * Added inside * Fix typo in name * Idiomatic use of inside * Separating the deparse line limit to a different PR --------- Co-authored-by: Michael Chirico * Improvements to the introductory vignette (#5836) * Added my improvements to the intro vignette * Removed two lines I added extra as a mistake earlier * Requested changes * Vignette typo patch (#5402) * fix typos and grammatical mistakes * fix typos and punctuation * remove double spaces where it wasn't necessary * fix typos and adhere to British English spelling * fix typos * fix typos * add missing closing bracket * fix typos * review fixes * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Apply suggestions from code review benchmarking Co-authored-by: Michael Chirico * remove unnecessary [ ] from datatable-keys-fast-subset.Rmd * Update vignettes/datatable-programming.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-reshape.Rmd Co-authored-by: Michael Chirico * One last batch of fine-tuning --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico * Improved handling of list columns with NULL entries (#4250) * Updated documentation for rbindlist(fill=TRUE) * Print NULL entries of list as NULL * Added news item * edit NEWS, use '[NULL]' not 'NULL' * fix test * split NEWS item * add example --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger * clarify that list input->unnamed list output (#5383) * clarify that list input->unnamed list output * Add example where make.names is used * mention role of make.names * fix subsetting issue in split.data.table (#5368) * fix subsetting issue in split.data.table * add a test * drop=FALSE on inner [ * switch to 3.2.0 R dep (#5905) * Allow early 
exit from check for eval/evalq in cedta (#5660) * Allow early exit from check for eval/evalq in cedta Done in the browser+untested, please take a second look :) * Use %chin% * nocov new code * frollmax1: frollmax, frollmax adaptive, left adaptive support (#5889) * frollmax exact, buggy fast, no fast adaptive * frollmax fast fixing bugs * frollmax man to fix CRAN check * frollmax fast adaptive non NA, dev * froll docs, adaptive left * no frollmax fast adaptive * frollmax adaptive exact NAs handling * PR summary in news * copy-edit changes from reviews Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * comment requested by Michael * update NEWS file * Apply suggestions from code review Co-authored-by: Michael Chirico * Apply suggestions from code review Co-authored-by: Michael Chirico * add comment requested by Michael * add comment about int iterator for loop over k-1 obs * extra comments * Revert "extra comments" This reverts commit 03af0e30f1a6a9e75f82b5827c1078f42db48e45. * add comments to frollmax and frollsum * typo fix --------- Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * Friendlier error in assignment with trailing comma (#5467) * friendlier error in assignment with trailing comma e.g. `DT[, `:=`(a = 1, b = 2,)`. WIP. Need to add tests and such, but editing from browser before I forget. 
* Another pass * include unnamed indices on RHS too * tests * NEWS * test numbering * explicit example in NEWS * Link to ?read.delim in ?fread to give a closer analogue of expected behavior (#5635) * fread is similar to read.delim (#5634) * Use ?read.csv / ?read.delim --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico * Run GHA jobs on 1-15-99 dev branch (#5909) * Make declarations static for covr (#5910) * class= argument for condition calls * Unify logic with helper * Add tests * Use call.=FALSE where possible * correct caught class * strip call=/call.= handling * botched merge --------- Co-authored-by: Ofek Co-authored-by: Ani Co-authored-by: David Budzynski <56514985+davidbudzynski@users.noreply.github.com> Co-authored-by: Scott Ritchie Co-authored-by: Benjamin Schwendinger Co-authored-by: Jan Gorecki Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Co-authored-by: Manuel López-Ibáñez <2620021+MLopez-Ibanez@users.noreply.github.com> --- R/translation.R | 30 ++++++++++++++++++++++-------- inst/tests/tests.Rraw | 27 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/R/translation.R b/R/translation.R index 66faa9fe8..42073aced 100644 --- a/R/translation.R +++ b/R/translation.R @@ -4,18 +4,32 @@ catf = function(fmt, ..., sep=" ", domain="R-data.table") { cat(gettextf(fmt, ..., domain=domain), sep=sep) } -stopf = function(fmt, ..., domain="R-data.table") { - stop(gettextf(fmt, ..., domain=domain), domain=NA, call. = FALSE) +raise_condition = function(signal, message, classes, immediate=FALSE, appendLF=FALSE) { + obj = list(message=message, call=sys.call(2)) + # NB: append _after_ translation + if (appendLF) obj$message = paste0(obj$message, "\n") + setattr(obj, "class", classes) + # cannot set immediate.=TRUE through warning(), so use the description in ?warning to replicate this behavior ourselves. tested manually. 
+ if (immediate) { + old = options(warn=1) + on.exit(options(old)) + } + signal(obj) } -warningf = function(fmt, ..., immediate.=FALSE, noBreaks.=FALSE, domain="R-data.table") { - warning(gettextf(fmt, ..., domain=domain), domain=NA, call.=FALSE, immediate.=immediate., noBreaks.=noBreaks.) +stopf = function(fmt, ..., class=NULL, domain="R-data.table") { + raise_condition(stop, gettextf(fmt, ..., domain=domain), c(class, "simpleError", "error", "condition")) } -messagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { - message(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +warningf = function(fmt, ..., immediate.=FALSE, class=NULL, domain="R-data.table") { + raise_condition(warning, gettextf(fmt, ..., domain=domain), c(class, "simpleWarning", "warning", "condition"), immediate=immediate.) } -packageStartupMessagef = function(fmt, ..., appendLF=TRUE, domain="R-data.table") { - packageStartupMessage(gettextf(fmt, ..., domain=domain), domain=NA, appendLF=appendLF) +messagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { + raise_condition(message, gettextf(fmt, ..., domain=domain), c(class, "simpleMessage", "message", "condition"), appendLF=appendLF) +} + +packageStartupMessagef = function(fmt, ..., appendLF=TRUE, class=NULL, domain="R-data.table") { + # NB: packageStartupMessage() itself calls message(.packageStartupMessage(...)) + messagef(fmt, ..., appendLF=appendLF, class=c(class, "packageStartupMessage"), domain=domain) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e2791ed5d..5a7d8b7a3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -53,7 +53,9 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { isRealReallyInt = data.table:::isRealReallyInt is_utc = data.table:::is_utc melt.data.table = data.table:::melt.data.table # for test 1953.4 + messagef = data.table:::messagef null.data.table = data.table:::null.data.table + packageStartupMessagef = 
data.table:::packageStartupMessagef print.data.table = data.table:::print.data.table replace_dot_alias = data.table:::replace_dot_alias rollup.data.table = data.table:::rollup.data.table @@ -66,9 +68,11 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { .shallow = data.table:::.shallow split.data.table = data.table:::split.data.table if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith + stopf = data.table:::stopf test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist + warningf = data.table:::warningf which_ = data.table:::which_ which.first = data.table:::which.first which.last = data.table:::which.last @@ -18499,3 +18503,26 @@ test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: ." x = data.table(rep(1:2, each=5L), 1:5, 1:10) test(2259.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) test(2259.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) + +# custom signaling functions +## basics: default signals with/without formats +test(2260.01, tryCatch(stopf("%s", "abc"), error=function(x) conditionMessage(x)), "abc") +test(2260.02, tryCatch(stopf("abc"), error=function(x) conditionMessage(x)), "abc") +test(2260.03, tryCatch(warningf("%s", "abc"), warning=function(x) conditionMessage(x)), "abc") +test(2260.04, tryCatch(warningf("abc"), warning=function(x) conditionMessage(x)), "abc") +test(2260.05, tryCatch(messagef("%s", "abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2260.06, tryCatch(messagef("abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2260.07, tryCatch(messagef("abc", appendLF=FALSE), message=function(x) conditionMessage(x)), "abc") +test(2260.08, tryCatch(packageStartupMessagef("%s", "abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2260.09, 
tryCatch(packageStartupMessagef("abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2260.10, tryCatch(packageStartupMessagef("abc", appendLF=FALSE), packageStartupMessage=function(x) conditionMessage(x)), "abc") + +## custom signal classes +test(2260.11, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "test_error")) +test(2260.12, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "error")) +test(2260.13, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "test_warning")) +test(2260.14, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "warning")) +test(2260.15, inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "test_message")) +test(2260.16, inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "message")) +test(2260.17, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "test_psm")) +test(2260.18, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "packageStartupMessage")) From f4c5d28df39fc5f45e602b15b20a53b5ce52329c Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 19 Apr 2024 19:28:12 -0700 Subject: [PATCH 097/106] rm whitespace --- .ci/atime/tests.R | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index b16eb2204..dcbf950ce 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -1,7 +1,7 @@ # A function to customize R package metadata and source files to facilitate version-specific installation and testing. # -# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) -# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) 
of the package. +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. # It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. # # @param old.Package Current name of the package. @@ -29,7 +29,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { Package_ <- gsub(".", "_", old.Package, fixed = TRUE) new.Package_ <- paste0(Package_, "_", sha) pkg_find_replace( - "DESCRIPTION", + "DESCRIPTION", paste0("Package:\\s+", old.Package), paste("Package:", new.Package)) pkg_find_replace( @@ -55,13 +55,13 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { } # A list of performance tests. -# +# # Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: # - N: A numeric sequence of data sizes to vary. # - setup: An expression evaluated for every data size before measuring time/memory. -# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. # This must call a function from data.table using a syntax with double or triple colon prefix. -# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. 
# (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) # # Optional parameters that may be useful to configure tests: @@ -70,10 +70,11 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { # - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. # For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. +# nolint start: undesirable_operator_linter. ':::' needed+appropriate here. test.list <- list( - # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "Regression fixed in #4440" = list( + "Test regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3,8), setup = quote({ @@ -89,7 +90,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "Regression fixed in #5463" = list( + "Test regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), setup = quote({ @@ -101,7 +102,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue 
(https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) @@ -109,12 +110,12 @@ test.list <- list( # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 "Improvement implemented in #5427" = list( - pkg.edit.fun = pkg.edit.fun, - N = 10^seq(1, 7), - setup = quote({ - DT = replicate(N, 1, simplify = FALSE) - }), - expr = quote(data.table:::setDT(DT)), - Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) - Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(1, 7), + setup = quote({ + DT = replicate(N, 1, simplify = FALSE) + }), + expr = quote(data.table:::setDT(DT)), + Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) + Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) ) From a2e91348fb8c37fb63be5ff1a52607ca5cd60140 Mon Sep 17 00:00:00 2001 From: Ani Date: Fri, 19 Apr 2024 19:29:18 -0700 Subject: [PATCH 098/106] Titles --- .ci/atime/tests.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index dcbf950ce..166dbfe08 100644 
--- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -74,7 +74,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "Test regression fixed in #4440" = list( + "Regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3,8), setup = quote({ @@ -90,7 +90,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "Test regression fixed in #5463" = list( + "Regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), setup = quote({ From 20126b118bfbf1f19a018bf0f22a4ce6adc1cf39 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 22 Apr 2024 09:39:21 -0700 Subject: [PATCH 099/106] Fix other.Rraw tests to run locally (#6097) --- inst/tests/other.Rraw | 15 +++++++++------ tests/other.R | 12 ++---------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 99169809f..087b3bada 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -27,7 +27,7 @@ f = function(pkg) suppressWarnings(suppressMessages(isTRUE( ))) loaded = sapply(pkgs, f) if (any(!loaded)) { - stop("test.data.table('other.Rraw') is missing required package(s): ", paste(names(loaded)[!loaded], collapse=", "), ". If you can't install them and this is R CMD check, please set environment variable TEST_DATA_TABLE_WITH_OTHER_PACKAGES back to the default, false.") + stop("test.data.table('other.Rraw') is missing required package(s): ", toString(names(loaded)[!loaded]), ". 
If you can't install them and this is R CMD check, please set environment variable TEST_DATA_TABLE_WITH_OTHER_PACKAGES back to the default, false.") # Would like to install them now for convenience but gitlab-ci.yml seems to install to bus/mirror-other-packages/cran. # If that's a cache, that's nice, but we don't know at this point whether this script is being run by GLCI or by a user or in dev. # We don't allow skipping (e.g. if _R_CHECK_FORCE_SUGGESTS_ is FALSE) to keep things simple and to keep things strict; i.e. @@ -35,7 +35,7 @@ if (any(!loaded)) { } cat("\n") -print(data.table(pkg=pkgs, loaded)[loaded==TRUE, version:=as.character(sapply(pkg, function(p) format(packageVersion(p))))][]) +print(data.table(pkg=pkgs, loaded)[loaded==TRUE, .(pkg, version=sapply(pkg, function(p) format(packageVersion(p))))]) cat("\n") print(sessionInfo()) cat("\n") @@ -50,7 +50,9 @@ if (loaded[["ggplot2"]]) { test(1.2, DT[,print(ggplot(.SD,aes(b,f))+geom_point()),by=list(grp%%2L)],data.table(grp=integer())) # %%2 to reduce time needed for ggplot2 to plot if (loaded[["hexbin"]]) { # Test reported by C Neff on 11 Oct 2011 - test(1.3, names(print(ggplot(DT) + geom_hex(aes(b, f)) + facet_wrap(~grp)))[c(1,3)], c("data","scales")) + # TODO(r-lib/gtable#94): don't suppressWarnings() here. + x <- suppressWarnings(print(ggplot(DT) + geom_hex(aes(b, f)) + facet_wrap(~grp))) + test(1.3, names(x)[c(1L, 3L)], c("data", "scales")) } # Test plotting ITime with ggplot2 which seems to require an as.data.frame method for ITime, #1713 datetimes = c("2011 NOV18 09:29:16", "2011 NOV18 10:42:40", "2011 NOV18 23:47:12", @@ -97,7 +99,8 @@ if (loaded[["caret"]]) { # So I put the win-builder fail down to resource issues and moved this test into test.data.table("other.Rraw"). 
DT = data.table(x = rnorm(10), y = rnorm(10)) cv.ctrl = trainControl(method = 'repeatedcv', number = 5, repeats = 1) - fit = train(y ~ x, data = DT, 'lm', trControl = cv.ctrl) + # TODO(topepo/caret#1361): remove suppressWarnings() for partially matched args internal to caret + fit = suppressWarnings(train(y ~ x, data = DT, 'lm', trControl = cv.ctrl)) test(4, names(DT), c("x", "y")) } @@ -209,7 +212,7 @@ test(14.1, {example(':=', package='data.table', local=TRUE, echo=FALSE); TRUE}) test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE}) if (loaded[["sf"]]) { #2273 - DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"))) + DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"), quiet=TRUE)) test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4") dsf = sf::st_as_sf(data.table(x=1:10, y=1:10, s=sample(1:2, 10, TRUE)), coords=1:2) @@ -724,6 +727,6 @@ if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure if (loaded[["dplyr"]]) { # regression test for converting character->list column in a magrittr (dplyr) pipe, #2651 - DT = data.table(a = 1, b = 2, c = '1,2,3,4]', d = 4) + DT = data.table(a = 1, b = 2, c = '1,2,3,4', d = 4) test(30, DT[, c := strsplit(c, ',', fixed = TRUE) %>% lapply(as.integer) %>% as.list]$c, list(1:4)) } diff --git a/tests/other.R b/tests/other.R index 5b2969bbf..34ced2327 100644 --- a/tests/other.R +++ b/tests/other.R @@ -1,15 +1,7 @@ -require(data.table) +library(data.table) if (!as.logical(Sys.getenv("TEST_DATA_TABLE_WITH_OTHER_PACKAGES", "FALSE"))) { + cat("Skipping tests in 'other' and quitting, set TEST_DATA_TABLE_WITH_OTHER_PACKAGES to proceed.\n") q('no') } -options(warn=1) -# test.data.table() turns on R's warnPartial* options and currently there -# are partial argument names used in base and other packages. Without the -# options(warn=1), other.Rout just contains "There were 16 warnings (use -# warnings() to see them)". 
However, a print(warnings()) after test.data.table() -# just results in NULL in other.Rout. Hence options(warn=1) because that -# worked to display the warnings, not because we want them displayed at the -# time per se. - test.data.table(script="other.Rraw") From 51e4615d8ed9dbef6e600485d33f82ad616c72ba Mon Sep 17 00:00:00 2001 From: Ani Date: Mon, 22 Apr 2024 12:47:46 -0700 Subject: [PATCH 100/106] Reverted whitespace changes --- .ci/atime/tests.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 166dbfe08..27094ae53 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -1,7 +1,7 @@ # A function to customize R package metadata and source files to facilitate version-specific installation and testing. # -# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) -# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. # It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. # # @param old.Package Current name of the package. 
@@ -29,7 +29,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { Package_ <- gsub(".", "_", old.Package, fixed = TRUE) new.Package_ <- paste0(Package_, "_", sha) pkg_find_replace( - "DESCRIPTION", + "DESCRIPTION", paste0("Package:\\s+", old.Package), paste("Package:", new.Package)) pkg_find_replace( @@ -55,13 +55,13 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { } # A list of performance tests. -# +# # Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: # - N: A numeric sequence of data sizes to vary. # - setup: An expression evaluated for every data size before measuring time/memory. -# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. # This must call a function from data.table using a syntax with double or triple colon prefix. -# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. # (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) # # Optional parameters that may be useful to configure tests: @@ -72,7 +72,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. # nolint start: undesirable_operator_linter. ':::' needed+appropriate here. 
test.list <- list( - # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 "Regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, @@ -89,7 +89,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 - # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 "Regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), @@ -102,7 +102,7 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) From 1d8c69973256996beced9d66e0f54e46a0dc1c5e Mon Sep 17 00:00:00 2001 From: Ani Date: Mon, 22 Apr 2024 13:08:28 -0700 Subject: [PATCH 101/106] Removed the linter comment added from #5908 --- .ci/atime/tests.R | 1 - 1 file changed, 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 27094ae53..134f968e2 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ 
-70,7 +70,6 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { # - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. # For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. -# nolint start: undesirable_operator_linter. ':::' needed+appropriate here. test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 From 6db0eda711bb59ad9b6009208584c56da1abb915 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 23 Apr 2024 12:12:12 -0700 Subject: [PATCH 102/106] Add a GHA for linting code (#5908) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 1.15.0 on CRAN. 
Bump to 1.15.99 * Fix transform slowness (#5493) * Fix 5492 by limiting the costly deparse to `nlines=1` * Implementing PR feedbacks * Added inside * Fix typo in name * Idiomatic use of inside * Separating the deparse line limit to a different PR --------- Co-authored-by: Michael Chirico * Improvements to the introductory vignette (#5836) * Added my improvements to the intro vignette * Removed two lines I added extra as a mistake earlier * Requested changes * Vignette typo patch (#5402) * fix typos and grammatical mistakes * fix typos and punctuation * remove double spaces where it wasn't necessary * fix typos and adhere to British English spelling * fix typos * fix typos * add missing closing bracket * fix typos * review fixes * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-benchmarking.Rmd Co-authored-by: Michael Chirico * Apply suggestions from code review benchmarking Co-authored-by: Michael Chirico * remove unnecessary [ ] from datatable-keys-fast-subset.Rmd * Update vignettes/datatable-programming.Rmd Co-authored-by: Michael Chirico * Update vignettes/datatable-reshape.Rmd Co-authored-by: Michael Chirico * One last batch of fine-tuning --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico * Improved handling of list columns with NULL entries (#4250) * Updated documentation for rbindlist(fill=TRUE) * Print NULL entries of list as NULL * Added news item * edit NEWS, use '[NULL]' not 'NULL' * fix test * split NEWS item * add example --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger * clarify that list input->unnamed list output (#5383) * clarify that list input->unnamed list output * Add example where make.names is used * mention role of make.names * fix subsetting issue in split.data.table (#5368) * fix subsetting issue in split.data.table * add a test * drop=FALSE on inner [ * switch to 3.2.0 R dep (#5905) * Allow early 
exit from check for eval/evalq in cedta (#5660) * Allow early exit from check for eval/evalq in cedta Done in the browser+untested, please take a second look :) * Use %chin% * nocov new code * frollmax1: frollmax, frollmax adaptive, left adaptive support (#5889) * frollmax exact, buggy fast, no fast adaptive * frollmax fast fixing bugs * frollmax man to fix CRAN check * frollmax fast adaptive non NA, dev * froll docs, adaptive left * no frollmax fast adaptive * frollmax adaptive exact NAs handling * PR summary in news * copy-edit changes from reviews Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * comment requested by Michael * update NEWS file * Apply suggestions from code review Co-authored-by: Michael Chirico * Apply suggestions from code review Co-authored-by: Michael Chirico * add comment requested by Michael * add comment about int iterator for loop over k-1 obs * extra comments * Revert "extra comments" This reverts commit 03af0e30f1a6a9e75f82b5827c1078f42db48e45. * add comments to frollmax and frollsum * typo fix --------- Co-authored-by: Michael Chirico Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * Friendlier error in assignment with trailing comma (#5467) * friendlier error in assignment with trailing comma e.g. `DT[, `:=`(a = 1, b = 2,)`. WIP. Need to add tests and such, but editing from browser before I forget. 
* Another pass * include unnamed indices on RHS too * tests * NEWS * test numbering * explicit example in NEWS * Link to ?read.delim in ?fread to give a closer analogue of expected behavior (#5635) * fread is similar to read.delim (#5634) * Use ?read.csv / ?read.delim --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico * Run GHA jobs on 1-15-99 dev branch (#5909) * overhauled linter * revert code changes * Initial commit of {lintr} approach * first pass at personalization * first custom linter * delint vignettes * delint tests * delint R sources * rm empty * re-merge * Move config to .ci directory * Use endsWithAny * Make declarations static for covr (#5910) * restore lint on branch * extension needed after all? * set option in R * debug printing * Exact file name in option * really hacky approach * skip more linters * One more round of deactivation * FIx whitespace issues (again??) * botched merge * obsolete branch ref * restore simple CI script thanks to upstream fix * more delint * just disable unused_import_linter() everywhere for now * rm whitespace from atime tests * comment about comment --------- Co-authored-by: Ofek Co-authored-by: Ani Co-authored-by: David Budzynski <56514985+davidbudzynski@users.noreply.github.com> Co-authored-by: Scott Ritchie Co-authored-by: Benjamin Schwendinger Co-authored-by: Jan Gorecki Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Co-authored-by: Manuel López-Ibáñez <2620021+MLopez-Ibanez@users.noreply.github.com> --- .ci/.lintr.R | 105 +++++++++++++++++++++++ .github/workflows/lint.yaml | 35 ++++++++ R/bmerge.R | 1 - R/data.table.R | 2 +- R/duplicated.R | 1 - R/foverlaps.R | 1 - R/fwrite.R | 3 +- R/onAttach.R | 2 +- R/openmp-utils.R | 1 - R/print.data.table.R | 3 +- R/setkey.R | 1 - R/setops.R | 1 - R/tables.R | 1 - R/test.data.table.R | 4 +- R/timetaken.R | 1 - R/transpose.R | 3 +- R/uniqlist.R | 1 - R/utils.R | 1 - inst/atime/tests.R | 22 ++--- 
vignettes/datatable-faq.Rmd | 4 +- vignettes/datatable-intro.Rmd | 2 +- vignettes/datatable-keys-fast-subset.Rmd | 6 +- vignettes/datatable-programming.Rmd | 2 +- vignettes/datatable-sd-usage.Rmd | 4 +- 24 files changed, 168 insertions(+), 39 deletions(-) create mode 100644 .ci/.lintr.R create mode 100644 .github/workflows/lint.yaml diff --git a/.ci/.lintr.R b/.ci/.lintr.R new file mode 100644 index 000000000..09a0db3dc --- /dev/null +++ b/.ci/.lintr.R @@ -0,0 +1,105 @@ +for (f in list.files('ci/linters', full.names=TRUE)) source(f) +rm(f) + +linters = all_linters( + packages = "lintr", # TODO(lintr->3.2.0): Remove this. + # eq_assignment_linter(), + brace_linter(allow_single_line = TRUE), + # TODO(michaelchirico): Activate these incrementally. These are the + # parameterizations that match our style guide. + # implicit_assignment_linter(allow_lazy = TRUE, allow_scoped = TRUE), + # implicit_integer_linter(allow_colon = TRUE), + # system_time_linter = undesirable_function_linter(c( + # system.time = "Only run timings in benchmark.Rraw" + # )), + # undesirable_function_linter(modify_defaults( + # default_undesirable_functions, + # ifelse = "Use fifelse instead.", + # Sys.setenv = NULL, + # library = NULL, + # options = NULL, + # par = NULL, + # setwd = NULL + # )), + undesirable_operator_linter(modify_defaults( + default_undesirable_operators, + `<<-` = NULL + )), + # TODO(lintr#2441): Use upstream implementation. + assignment_linter = NULL, + # TODO(lintr#2442): Use this once x[ , j, by] is supported. + commas_linter = NULL, + commented_code_linter = NULL, + # TODO(linter->3.2.0): Activate this. + consecutive_assertion_linter = NULL, + cyclocomp_linter = NULL, + function_argument_linter = NULL, + indentation_linter = NULL, + infix_spaces_linter = NULL, + # TODO(R>3.2.0): Activate this, extending to recognize vapply_1i(x, length). 
+ lengths_linter = NULL, + line_length_linter = NULL, + missing_package_linter = NULL, + namespace_linter = NULL, + nonportable_path_linter = NULL, + object_name_linter = NULL, + object_usage_linter = NULL, + quotes_linter = NULL, + semicolon_linter = NULL, + spaces_inside_linter = NULL, + spaces_left_parentheses_linter = NULL, + # TODO(michaelchirico): Only exclude from vignettes, not sure what's wrong. + strings_as_factors_linter = NULL, + # TODO(lintr->3.2.0): Fix on a valid TODO style, enforce it, and re-activate. + todo_comment_linter = NULL, + # TODO(michaelchirico): Enforce these and re-activate them one-by-one. Also stop using '<<-'. + brace_linter = NULL, + condition_call_linter = NULL, + conjunct_test_linter = NULL, + fixed_regex_linter = NULL, + function_left_parentheses_linter = NULL, + if_not_else_linter = NULL, + implicit_assignment_linter = NULL, + implicit_integer_linter = NULL, + keyword_quote_linter = NULL, + length_levels_linter = NULL, + matrix_apply_linter = NULL, + missing_argument_linter = NULL, + nzchar_linter = NULL, + object_overwrite_linter = NULL, + paren_body_linter = NULL, + redundant_equals_linter = NULL, + rep_len_linter = NULL, + repeat_linter = NULL, + return_linter = NULL, + sample_int_linter = NULL, + scalar_in_linter = NULL, + seq_linter = NULL, + undesirable_function_linter = NULL, + unnecessary_concatenation_linter = NULL, + unnecessary_lambda_linter = NULL, + unnecessary_nesting_linter = NULL, + unreachable_code_linter = NULL, + unused_import_linter = NULL +) +# TODO(lintr#2172): Glob with lintr itself. +exclusions = local({ + exclusion_for_dir <- function(dir, exclusions) { + files = list.files(dir, pattern = "\\.(R|Rmd)$") + stats::setNames(rep(list(exclusions), length(files)), files) + } + c( + exclusion_for_dir("tests", list( + quotes_linter = Inf, + # TODO(michaelchirico): Enforce these and re-activate them one-by-one. 
+ implicit_integer_linter = Inf, + infix_spaces_linter = Inf, + undesirable_function_linter = Inf + )), + exclusion_for_dir("vignettes", list( + quotes_linter = Inf + # strings_as_factors_linter = Inf + # system_time_linter = Inf + )) + ) +}) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..7170016b1 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,35 @@ +on: + push: + branches: + - master + pull_request: + branches: + - master + +name: lint + +jobs: + lint: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: | + r-lib/lintr + local::. + needs: lint + + - name: Lint + run: lintr::lint_package() + shell: Rscript {0} + env: + LINTR_ERROR_ON_LINT: true + R_LINTR_LINTER_FILE: .ci/.lintr diff --git a/R/bmerge.R b/R/bmerge.R index ddaedc1b3..ff40fddb4 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -187,4 +187,3 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos ans$xo = xo # for further use by [.data.table return(ans) } - diff --git a/R/data.table.R b/R/data.table.R index e0cddb38f..5513fb276 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2300,7 +2300,7 @@ transform.data.table = function (`_data`, ...) 
{ if (!cedta()) return(NextMethod()) # nocov `_data` = copy(`_data`) - e = eval(substitute(list(...)), `_data`, parent.frame()) + e = eval(substitute(list(...)), `_data`, parent.frame()) set(`_data`, ,names(e), e) `_data` } diff --git a/R/duplicated.R b/R/duplicated.R index 27b903812..0aad2ebdd 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -118,4 +118,3 @@ uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) length(starts) } } - diff --git a/R/foverlaps.R b/R/foverlaps.R index 9a0cd5580..54dc61f93 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -247,4 +247,3 @@ foverlaps = function(x, y, by.x=if (!is.null(key(x))) key(x) else key(y), by.y=k # Tests are added to ensure we cover these aspects (to my knowledge) to ensure that any undesirable changes in the future breaks those tests. # Conclusion: floating point manipulations are hell! - diff --git a/R/fwrite.R b/R/fwrite.R index b13b0afb7..37968d4ea 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -64,7 +64,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", length(nThread)==1L && !is.na(nThread) && nThread>=1L ) - is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) + is_gzip = compress == "gzip" || (compress == "auto" && endsWithAny(file, ".gz")) file = path.expand(file) # "~/foo/bar" if (append && (file=="" || file.exists(file))) { @@ -122,4 +122,3 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } haszlib = function() .Call(Cdt_has_zlib) - diff --git a/R/onAttach.R b/R/onAttach.R index 6ff17972b..7a4a9be3e 100644 --- a/R/onAttach.R +++ b/R/onAttach.R @@ -21,7 +21,7 @@ nth = getDTthreads(verbose=FALSE) if (dev) packageStartupMessagef("data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads). ", v, d, g, nth, appendLF=FALSE) - else + else packageStartupMessagef("data.table %s using %d threads (see ?getDTthreads). 
", v, nth, appendLF=FALSE) packageStartupMessagef("Latest news: r-datatable.com") if (gettext("TRANSLATION CHECK") != "TRANSLATION CHECK") diff --git a/R/openmp-utils.R b/R/openmp-utils.R index f19120724..85f6b3257 100644 --- a/R/openmp-utils.R +++ b/R/openmp-utils.R @@ -13,4 +13,3 @@ setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, thr getDTthreads = function(verbose=getOption("datatable.verbose")) { .Call(CgetDTthreads, verbose) } - diff --git a/R/print.data.table.R b/R/print.data.table.R index 9e33e0c4d..f80a5833c 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -230,7 +230,7 @@ format_list_item.default = function(x, ...) { char.trunc = function(x, trunc.char = getOption("datatable.prettyprint.char")) { trunc.char = max(0L, suppressWarnings(as.integer(trunc.char[1L])), na.rm=TRUE) if (!is.character(x) || trunc.char <= 0L) return(x) - nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 + nchar_width = nchar(x, 'width') # Check whether string is full-width or half-width, #5096 nchar_chars = nchar(x, 'char') is_full_width = nchar_width > nchar_chars idx = pmin(nchar_width, nchar_chars) > trunc.char @@ -272,4 +272,3 @@ trunc_cols_message = function(not_printed, abbs, class, col.names){ n, brackify(paste0(not_printed, classes)) ) } - diff --git a/R/setkey.R b/R/setkey.R index 84488a803..62da9ebe8 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -353,4 +353,3 @@ CJ = function(..., sorted = TRUE, unique = FALSE) } l } - diff --git a/R/setops.R b/R/setops.R index 1034b0f0f..23dd6ec8f 100644 --- a/R/setops.R +++ b/R/setops.R @@ -290,4 +290,3 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu } TRUE } - diff --git a/R/tables.R b/R/tables.R index e47a1a42e..6a0209c86 100644 --- a/R/tables.R +++ b/R/tables.R @@ -60,4 +60,3 @@ tables = function(mb=type_size, order.col="NAME", width=80, } invisible(info) } - diff --git a/R/test.data.table.R b/R/test.data.table.R index 
748e09512..43486e278 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -7,7 +7,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (length(memtest.id)) { if (length(memtest.id)==1L) memtest.id = rep(memtest.id, 2L) # for convenience of supplying one id rather than always a range stopifnot(length(memtest.id)<=2L, # conditions quoted to user when false so "<=2L" even though following conditions rely on ==2L - !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) + !anyNA(memtest.id), memtest.id[1L]<=memtest.id[2L]) if (memtest==0L) memtest=1L # using memtest.id implies memtest } if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { @@ -134,7 +134,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F owd = setwd(tempdir()) # ensure writeable directory; e.g. tests that plot may write .pdf here depending on device option and/or batch mode; #5190 on.exit(setwd(owd)) - + if (memtest) { catf("\n***\n*** memtest=%d. This should be the first call in a fresh R_GC_MEM_GROW=0 R session for best results. Ctrl-C now if not.\n***\n\n", memtest) if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") diff --git a/R/timetaken.R b/R/timetaken.R index daa52c9f1..ae4b384fc 100644 --- a/R/timetaken.R +++ b/R/timetaken.R @@ -12,4 +12,3 @@ timetaken = function(started.at) tt = proc.time()-started.at # diff all 3 times paste0(format(tt[3L])," elapsed (", format(tt[1L]), " cpu)") } - diff --git a/R/transpose.R b/R/transpose.R index 684b135d4..6d6da6779 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -56,7 +56,7 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { if (!(sum(!is_named) == 1L && !is_named[n] && is.function(type.convert[[n]]))) stopf("When the argument 'type.convert' contains an unnamed element, it is expected to be the last element and should be a function. 
More than one unnamed element is not allowed unless all elements are functions with length equal to %d (the length of the transpose list or 'keep' argument if it is specified).", length(keep)) else { - fothers = type.convert[[n]] + fothers = type.convert[[n]] type.convert = type.convert[-n] } } @@ -90,4 +90,3 @@ tstrsplit = function(x, ..., fill=NA, type.convert=FALSE, keep, names=FALSE) { setattr(ans, 'names', names) ans } - diff --git a/R/uniqlist.R b/R/uniqlist.R index 2a610ab1a..4f3600f83 100644 --- a/R/uniqlist.R +++ b/R/uniqlist.R @@ -21,4 +21,3 @@ uniqlengths = function(x, len) { ans = .Call(Cuniqlengths, as.integer(x), as.integer(len)) ans } - diff --git a/R/utils.R b/R/utils.R index a78e5450f..feacd2b00 100644 --- a/R/utils.R +++ b/R/utils.R @@ -165,4 +165,3 @@ rss = function() { #5515 #5517 round(ans / 1024, 1L) # return MB # nocov end } - diff --git a/inst/atime/tests.R b/inst/atime/tests.R index a0635d063..19c1b27a8 100644 --- a/inst/atime/tests.R +++ b/inst/atime/tests.R @@ -1,7 +1,7 @@ # A function to customize R package metadata and source files to facilitate version-specific installation and testing. # -# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) -# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. # It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. # # @param old.Package Current name of the package. 
@@ -29,7 +29,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { Package_ <- gsub(".", "_", old.Package, fixed = TRUE) new.Package_ <- paste0(Package_, "_", sha) pkg_find_replace( - "DESCRIPTION", + "DESCRIPTION", paste0("Package:\\s+", old.Package), paste("Package:", new.Package)) pkg_find_replace( @@ -55,13 +55,13 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { } # A list of performance tests. -# +# # Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: # - N: A numeric sequence of data sizes to vary. # - setup: An expression evaluated for every data size before measuring time/memory. -# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. # This must call a function from data.table using a syntax with double or triple colon prefix. -# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. # (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) # # Optional parameters that may be useful to configure tests: @@ -70,8 +70,9 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { # - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. # For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. +# nolint start: undesirable_operator_linter. ':::' needed+appropriate here. 
test.list <- list( - # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 "Test regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, @@ -88,7 +89,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 - # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 "Test regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), @@ -101,8 +102,9 @@ test.list <- list( key = "g") dt_mod <- copy(dt) }), - expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) - Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) + Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) ) +# nolint end: undesirable_operator_linter. 
diff --git a/vignettes/datatable-faq.Rmd b/vignettes/datatable-faq.Rmd index 97c11aeba..1501497bc 100644 --- a/vignettes/datatable-faq.Rmd +++ b/vignettes/datatable-faq.Rmd @@ -373,7 +373,7 @@ Yes: The general form is: -```{r, eval = FALSE} +```r DT[where, select|update, group by][order by][...] ... [...] ``` @@ -619,4 +619,4 @@ Please see [this answer](https://stackoverflow.com/a/10529888/403310). ```{r, echo=FALSE} setDTthreads(.old.th) -``` \ No newline at end of file +``` diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd index c783c3fa7..e63caee5d 100644 --- a/vignettes/datatable-intro.Rmd +++ b/vignettes/datatable-intro.Rmd @@ -652,4 +652,4 @@ We will see how to *add/update/delete* columns *by reference* and how to combine ```{r, echo=FALSE} setDTthreads(.old.th) -``` \ No newline at end of file +``` diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd index 3ec50640c..d85f69ad8 100644 --- a/vignettes/datatable-keys-fast-subset.Rmd +++ b/vignettes/datatable-keys-fast-subset.Rmd @@ -157,7 +157,7 @@ Once you *key* a *data.table* by certain columns, you can subset by querying tho flights[.("JFK")] ## alternatively -# flights[J("JFK")] (or) +# flights[J("JFK")] (or) # flights[list("JFK")] ``` @@ -464,7 +464,7 @@ Now let us look at binary search approach (method 2). Recall from [Properties of Here's a very simple illustration. 
Let's consider the (sorted) numbers shown below: -```{r eval = FALSE} +``` 1, 5, 10, 19, 22, 23, 30 ``` @@ -499,4 +499,4 @@ Key based subsets are **incredibly fast** and are particularly useful when the t ```{r, echo=FALSE} setDTthreads(.old.th) -``` \ No newline at end of file +``` diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 0536f11d6..3ec1f57d5 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -420,4 +420,4 @@ DT[, cl, env = list(cl = cl)] ```{r cleanup, echo=FALSE} options(.opts) registerS3method("print", "data.frame", base::print.data.frame) -``` \ No newline at end of file +``` diff --git a/vignettes/datatable-sd-usage.Rmd b/vignettes/datatable-sd-usage.Rmd index 09243c820..bd2618d53 100644 --- a/vignettes/datatable-sd-usage.Rmd +++ b/vignettes/datatable-sd-usage.Rmd @@ -169,7 +169,7 @@ lm_coef = sapply(models, function(rhs) { }) barplot(lm_coef, names.arg = sapply(models, paste, collapse = '/'), main = 'Wins Coefficient\nWith Various Covariates', - col = col16, las = 2L, cex.names = .8) + col = col16, las = 2L, cex.names = 0.8) ``` The coefficient always has the expected sign (better pitchers tend to have more wins and fewer runs allowed), but the magnitude can vary substantially depending on what else we control for. 
@@ -254,4 +254,4 @@ The above is just a short introduction of the power of `.SD` in facilitating bea ```{r, echo=FALSE} setDTthreads(.old.th) -``` \ No newline at end of file +``` From f23abda7f11bb35ca078a35e55cd4237f2372172 Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 23 Apr 2024 15:07:29 -0700 Subject: [PATCH 103/106] Fixed the third test --- .ci/atime/tests.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 77f9df75a..977821e3c 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -109,14 +109,18 @@ test.list <- list( # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 - "Improvement implemented in #5427" = list( - pkg.edit.fun = pkg.edit.fun, - N = 10^seq(1, 7), - setup = quote({ - DT = replicate(N, 1, simplify = FALSE) - }), - expr = quote(data.table:::setDT(DT)), - Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) - Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + "Test performance improvement implemented in #5427" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(1, 7), + setup = quote({ + L <- replicate(N, 1, simplify = FALSE) + setDT(L) + }), + expr = quote({ + data.table:::setattr(L, "class", NULL) + data.table:::setDT(L) + }), + Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) + Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) ) # nolint end: 
undesirable_operator_linter. From 91b5de314e9674eaa5e23ca77e3408019fe75365 Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 23 Apr 2024 15:12:03 -0700 Subject: [PATCH 104/106] Titles --- .ci/atime/tests.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 977821e3c..13011eb1c 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -74,7 +74,7 @@ pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { test.list <- list( # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 - "Regression fixed in #4440" = list( + "shallow regression fixed in #4440" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3,8), setup = quote({ @@ -90,7 +90,7 @@ test.list <- list( # Test based on: https://github.com/Rdatatable/data.table/issues/5424 # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 - "Regression fixed in #5463" = list( + "memrecycle regression fixed in #5463" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(3, 8), setup = quote({ @@ -109,7 +109,7 @@ test.list <- list( # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426 # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 - "Test performance improvement implemented in #5427" = list( + "setDT improvement implemented in #5427" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(1, 7), setup = quote({ From 6c7157d4277931f972a5b2c3f6bb24da4accece2 Mon Sep 17 00:00:00 2001 From: Ani Date: Tue, 23 Apr 2024 15:33:26 -0700 Subject: [PATCH 105/106] Better wording --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 13011eb1c..c5d2e3f25 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -109,7 +109,7 @@ test.list <- list( # Issue 
reported in: https://github.com/Rdatatable/data.table/issues/5426 # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427 - "setDT improvement implemented in #5427" = list( + "setDT improved in #5427" = list( pkg.edit.fun = pkg.edit.fun, N = 10^seq(1, 7), setup = quote({ From 65212df877af58a9f935e13615a70e40778e8866 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Wed, 24 Apr 2024 08:47:57 +0200 Subject: [PATCH 106/106] Update CODEOWNERS for not to be *jangorecki (#6101) --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index fd67c8c08..eeb8ab98c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,5 @@ # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners -* @jangorecki @michaelchirico +* @michaelchirico # reshaping /R/fcast.R @tdhock