diff --git a/NEWS.md b/NEWS.md index 05e595a76..2d3109012 100644 --- a/NEWS.md +++ b/NEWS.md @@ -169,6 +169,9 @@ unit = "s") 10. Starting from 4.0.0, data.table is using R's `rbind` and `cbind` methods, as described in v1.12.6 news entry. Support for R 3.x.x is resolved when processing `NAMESPACE` file, at install time, or at the time of building package binaries. As a result, users on R 3.x.x, if installing from binaries, must use binaries built by R 3.x.x, and users on R 4.x.x, if installing from binaries, must use binaries built by R 4.x.x. Users will see `package ‘data.table’ was built under R version...` warning when this happen. Thanks to @vinhdizzo for reporting in [#4528](https://github.com/Rdatatable/data.table/issues/4528). +11. Internal function `shallow()` no longer makes a deep copy of secondary indices. This eliminates a relatively small time and memory overhead when indices are present that added up significantly when performing many operations, such as joins, in a loop or when joining in `j` by group, [#4311](https://github.com/Rdatatable/data.table/issues/4311). Many thanks to @renkun-ken for the report, and @tlapak for the investigation and PR. + + # data.table [v1.12.8](https://github.com/Rdatatable/data.table/milestone/15?closed=1) (09 Dec 2019) ## NEW FEATURES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 11168db5c..7b3902a2b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12871,30 +12871,32 @@ ids <- sample(letters[1:3], 10, replace=TRUE) scores <- rnorm(10) dt <- data.table(id=ids, score=scores) dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) -test(1914.1, isS4(dt.s4)) -test(1914.2, inherits(dt.s4, 'data.table')) +test(1914.01, isS4(dt.s4)) +test(1914.02, inherits(dt.s4, 'data.table')) +# Test possible regression. shallow() needs to preserve the S4 bit to support S4 classes that contain data.table +test(1914.03, isS4(shallow(dt.s4))) ## pull out data from S4 as.list, and compare to list from dt dt.s4.list <- dt.s4@.Data names(dt.s4.list) <- names(dt.s4) -test(1914.3, dt.s4.list, as.list(dt)) # Underlying data not identical +test(1914.04, dt.s4.list, as.list(dt)) # Underlying data not identical # simple S4 conversion-isms work df = data.frame(a=sample(letters, 10), b=1:10) dt = as.data.table(df) -test(1914.4, identical(as(df, 'data.table'), dt)) -test(1914.5, identical(as(dt, 'data.frame'), df)) +test(1914.05, identical(as(df, 'data.table'), dt)) +test(1914.06, identical(as(dt, 'data.frame'), df)) # data.table can be used in an S4 slot dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) -test(1914.6, dt.comp@data, dt) +test(1914.07, dt.comp@data, dt) # S4 methods dispatch properly on data.table slots" dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what){x@data}) setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) {x@data[[what]]}) -test(1914.7, dtGet(dt.comp), dt) # actually -test(1914.8, identical(dtGet(dt.comp, 1), dt[[1]])) -test(1914.9, identical(dtGet(dt.comp, 'b'), dt$b)) +test(1914.08, dtGet(dt.comp), dt) # actually +test(1914.09, identical(dtGet(dt.comp, 1), dt[[1]])) +test(1914.10, identical(dtGet(dt.comp, 'b'), dt$b)) removeClass("Data.Table") # so that test 1914.2 passes on the second run of cc() in dev removeClass("S4Composition") # END port of old testthat tests @@ -16999,3 +17001,16 @@ B = data.table(x=1:2) X = A == B A[, y := 3:4] test(2148, colnames(X), c('x')) + +# shallow() shouldn't take a deep copy of indices, #4311 +dt <- data.table(a = c(3, 1)) +setindex(dt, a) +dt2 <- shallow(dt) +test(2149.1, address(attr(attr(dt, 'index'), '__a')), address(attr(attr(dt2, 'index'), '__a'))) +# Testing possible future regression. shallow() needs to copy the names of indices and keys. +setnames(dt2, 'a', 'A') +test(2149.2, indices(dt), 'a') +setkey(dt, a) +dt2 <- shallow(dt) +setnames(dt2, 'a', 'A') +test(2149.3, key(dt), 'a') diff --git a/src/assign.c b/src/assign.c index 1392079e7..88fc26065 100644 --- a/src/assign.c +++ b/src/assign.c @@ -152,13 +152,25 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) R_len_t i,l; int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? - //copyMostAttrib(dt, newdt); // including class - DUPLICATE_ATTRIB(newdt, dt); + SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); + SET_OBJECT(newdt, OBJECT(dt)); + IS_S4_OBJECT(dt) ? SET_S4_OBJECT(newdt) : UNSET_S4_OBJECT(newdt); // To support S4 objects that incude data.table + //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 + // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It // also increases truelength. Perhaps make that distinction, then, and split out, but marked // so that the next change knows to duplicate. - // Does copyMostAttrib duplicate each attrib or does it point? It seems to point, hence DUPLICATE_ATTRIB - // for now otherwise example(merge.data.table) fails (since attr(d4,"sorted") gets written by setnames). + // keepattr() also merely points to the entire attrbutes list and thus doesn't allow replacing + // some of its elements. + + // We copy all attributes that refer to column names so that calling setnames on either + // the original or the shallow copy doesn't break anything. + SEXP index = PROTECT(getAttrib(dt, sym_index)); protecti++; + setAttrib(newdt, sym_index, shallow_duplicate(index)); + + SEXP sorted = PROTECT(getAttrib(dt, sym_sorted)); protecti++; + setAttrib(newdt, sym_sorted, duplicate(sorted)); + SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++; SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++; if (isNull(cols)) {