From 654b44bb5637af6c4f24fe4a91bdb40d98be0b53 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 23 Feb 2018 07:59:48 +0530 Subject: [PATCH 1/5] CJ fails with proper error msg. Closes #2636 --- NEWS.md | 3 ++- R/setkey.R | 3 +++ inst/tests/tests.Rraw | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 28f578015..eadb202d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -139,6 +139,8 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing. +34. `CJ` now fails with proper error message when results would exceed max integer. Closes #2636. + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. @@ -163,7 +165,6 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 10. As warned in v1.9.8 release notes below in this file (on CRAN 25 Nov 2016) it has been 1 year since then and so use of `options(datatable.old.unique.by.key=TRUE)` to restore the old default is now deprecated with warning. The new warning states that this option still works and repeats the request to pass `by=key(DT)` explicitly to `unique()`, `duplicated()`, `uniqueN()` and `anyDuplicated()` and to stop using this option. In another year, this warning will become error. Another year after that the option will be removed. - ### Changes in v1.10.4-3 (on CRAN 20 Oct 2017) 1. Fixed crash/hang on MacOS when `parallel::mclapply` is used and data.table is merely loaded, [#2418](https://github.com/Rdatatable/data.table/issues/2418). Oddly, all tests including test 1705 (which tests `mclapply` with data.table) passed fine on CRAN. It appears to be some versions of MacOS or some versions of libraries on MacOS, perhaps. Many thanks to Martin Morgan for reporting and confirming this fix works. Thanks also to @asenabouth, Joe Thorley and Danton Noriega for testing, debugging and confirming that automatic parallelism inside data.table (such as `fwrite`) works well even on these MacOS installations. See also news items below for 1.10.4-1 and 1.10.4-2. diff --git a/R/setkey.R b/R/setkey.R index e26af1d95..01dd36717 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -356,6 +356,9 @@ CJ <- function(..., sorted = TRUE, unique = FALSE) attribs = lapply(l, attributes) # remember attributes for resetting after rep.int n = vapply(l, length, 0L) nrow = prod(n) + if (nrow > .Machine$integer.max) { + stop("Cross product of elements provided to CJ exceeds integer max.") + } x = c(rev(take(cumprod(rev(n)))), 1L) for (i in seq_along(x)) { y = l[[i]] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 31bc4ec14..182f49f3d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11717,6 +11717,8 @@ test(1879.6, fread(f, verbose=TRUE), DT, sep = '.*')) unlink(f) +# CJ will should fail with proper error message #2636 +test(1880, CJ(1:1000, 1:1000, 1:1000, 1:3), error="Cross product of elements provided to CJ exceeds integer max") ################################### # Add new tests above this line # From d6326342a8af60dbc5c92d44a9fadf502c8c9b69 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 23 Feb 2018 18:20:02 +0800 Subject: [PATCH 2/5] add note for easier update once dependency meets R 3.2.0 --- R/setkey.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/setkey.R b/R/setkey.R index 01dd36717..2f54eb37c 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -354,7 +354,7 @@ CJ <- function(..., sorted = TRUE, unique = FALSE) else if (length(l) > 1L && !emptyList) { # using rep.int instead of rep speeds things up considerably (but attributes are dropped). attribs = lapply(l, attributes) # remember attributes for resetting after rep.int - n = vapply(l, length, 0L) + n = vapply(l, length, 0L) #lengths(l) will work from R 3.2.0 nrow = prod(n) if (nrow > .Machine$integer.max) { stop("Cross product of elements provided to CJ exceeds integer max.") From c2629ebbdf6afe82f093686d813f01d1e0ee8736 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 23 Feb 2018 18:34:23 +0800 Subject: [PATCH 3/5] platform-independent failure in test --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 182f49f3d..1d0be401f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11718,7 +11718,8 @@ test(1879.6, fread(f, verbose=TRUE), DT, unlink(f) # CJ will should fail with proper error message #2636 -test(1880, CJ(1:1000, 1:1000, 1:1000, 1:3), error="Cross product of elements provided to CJ exceeds integer max") +maxp = ceiling(log10(.Machine$integer.max) / 3) +test(1880, CJ(1:10^maxp, 1:10^maxp, 1:10^maxp), error="Cross product of elements provided to CJ exceeds integer max") ################################### # Add new tests above this line # From 598b3a5fe52b4f1380ff58dd2ce02ab9f12ac0b9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 23 Feb 2018 18:35:51 +0800 Subject: [PATCH 4/5] tinker --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1d0be401f..be9401d7d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11718,8 +11718,8 @@ test(1879.6, fread(f, verbose=TRUE), DT, unlink(f) # CJ will should fail with proper error message #2636 -maxp = ceiling(log10(.Machine$integer.max) / 3) -test(1880, CJ(1:10^maxp, 1:10^maxp, 1:10^maxp), error="Cross product of elements provided to CJ exceeds integer max") +maxp_vec = seq_len(ceiling(log10(.Machine$integer.max) / 3)) +test(1880, CJ(maxp_vec, maxp_vec, maxp_vec), error="Cross product of elements provided to CJ exceeds integer max") ################################### # Add new tests above this line # From f9f6d3c2cbdc1460c426c54bf359c874f6e1ef69 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 28 Feb 2018 11:33:03 -0800 Subject: [PATCH 5/5] Fixed new test --- R/setkey.R | 2 +- inst/tests/tests.Rraw | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 2f54eb37c..692cc26c4 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -357,7 +357,7 @@ CJ <- function(..., sorted = TRUE, unique = FALSE) n = vapply(l, length, 0L) #lengths(l) will work from R 3.2.0 nrow = prod(n) if (nrow > .Machine$integer.max) { - stop("Cross product of elements provided to CJ exceeds integer max.") + stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max) } x = c(rev(take(cumprod(rev(n)))), 1L) for (i in seq_along(x)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 270ee2be3..481a44492 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11751,8 +11751,11 @@ test(1881.3, fread(f), DT, warning=c('Discarded single-line footer: <<(10000 row unlink(f) # CJ will should fail with proper error message, #2636 -maxp_vec = seq_len(ceiling(log10(.Machine$integer.max) / 3)) -test(1882, CJ(maxp_vec, maxp_vec, maxp_vec), error="Cross product of elements provided to CJ exceeds integer max") +test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good) +test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291) +v = seq_len(1291L) +test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647") + ################################### # Add new tests above this line #