Skip to content

Commit

Permalink
Fixed .shallow to consistently retain keys and indices. (#2337)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkusBonsch authored and mattdowle committed Sep 11, 2017
1 parent 3c1b6d0 commit 61a25ba
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 12 deletions.
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@

18. Integer values that are too large to fit in `int64` will now be read as strings [#2250](https://github.com/Rdatatable/data.table/issues/2250).


19. Internal-only `.shallow` now retains keys correctly, [#2336](https://github.com/Rdatatable/data.table/issues/2336). Thanks to @MarkusBonsch for reporting, fixing ([PR #2337](https://github.com/Rdatatable/data.table/pull/2337)) and adding 37 tests. This much advances the journey towards exporting `shallow()`, [#2323](https://github.com/Rdatatable/data.table/issues/2323).

#### NOTES

1. `?data.table` makes explicit the option of using a `logical` vector in `j` to select columns, [#1978](https://github.com/Rdatatable/data.table/issues/1978). Thanks @Henrik-P for the note and @MichaelChirico for filing.
Expand Down
39 changes: 33 additions & 6 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -2355,14 +2355,41 @@ point <- function(to, to_idx, from, from_idx) {
isnull = is.null(cols)
if (!isnull) cols = validate(cols, x) # NULL is default = all columns
ans = .Call(Cshallowwrapper, x, cols) # copies VECSXP only
if (retain.key && isnull) return(ans) # handle most frequent case first
# rest of the cases
cols = names(x)[cols]
retain.key = retain.key && identical(cols, head(key(x), length(cols)))
setattr(ans, 'sorted', if (haskey(x) && retain.key) cols else NULL)

if(retain.key){
if(isnull) return(ans) # handle most frequent case first
## get correct key if cols are present
cols = names(x)[cols]
keylength <- which.first(!key(ans) %chin% cols) - 1L
if(is.na(keylength)) keylength <- length(key(ans))
if(!keylength){
setattr(ans, "sorted", NULL) ## no key remaining
} else {
setattr(ans, "sorted", head(key(ans), keylength)) ## keep what can be kept
}
## take care of attributes.
indices <- names(attributes(attr(ans, "index")))
for(index in indices){
indexcols <- strsplit(index, split = "__")[[1]][-1L]
indexlength <- which.first(!indexcols %chin% cols) - 1L
if(is.na(indexlength)) next ## all columns are present, nothing to be done
reducedindex <- paste0(c("", indexcols[seq_len(indexlength)]), collapse = "__") ## the columns until the first missing form the new index
if(reducedindex %chin% indices || !indexlength){
## Either reduced index already present or no columns of the original index remain.
## Drop the original index completely
setattr(attr(ans, "index", exact = TRUE), index, NULL)
} else {
## rename index to reducedindex
names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index"))) == index] <- reducedindex
}
}
} else { # retain.key == FALSE
setattr(ans, "sorted", NULL)
setattr(ans, "index", NULL)
}
if (unlock) setattr(ans, '.data.table.locked', NULL)
ans
# TODO: check/remove attributes for secondary keys?

}

shallow <- function(x, cols=NULL) {
Expand Down
5 changes: 1 addition & 4 deletions R/foverlaps.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,6 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
## hopefully all checks are over. Now onto the actual task at hand.
origx = x; x = shallow(x, by.x)
origy = y; y = shallow(y, by.y)
if (identical(by.x, key(origx)[seq_along(by.x)]))
setattr(x, 'sorted', by.x)
setattr(y, 'sorted', by.y) ## is definitely sorted on by.y
roll = switch(type, start=, end=, equal= 0.0, any=, within= +Inf)
make_call <- function(names, fun=NULL) {
if (is.character(names))
Expand Down Expand Up @@ -113,7 +110,7 @@ foverlaps <- function(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.
if (verbose) {cat(round(proc.time()[3]-last.started.at,3),"secs\n");flush.console}
matches <- function(ii, xx, del, ...) {
cols = setdiff(names(xx), del)
xx = shallow(xx, cols)
xx = .shallow(xx, cols, retain.key = FALSE)
ans = bmerge(xx, ii, seq_along(xx), seq_along(xx), haskey(xx), integer(0), mult=mult, ops=rep(1L, length(xx)), integer(0), 1L, verbose=verbose, ...)
# vecseq part should never run here, but still...
if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
Expand Down
50 changes: 49 additions & 1 deletion inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6843,11 +6843,59 @@ setkey(x1, a1, a2)
test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y)))
test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2"))
# also test shallow here so as to catch future regressions
x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), key="a1,a2")
x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2")
test(1545.1, key(.shallow(x1, cols="a2")), NULL)
test(1545.2, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.2, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL)
test(1545.3, key(.shallow(x1, retain.key=TRUE)), key(x1))
test(1545.4, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1")
# tests for #2336. .shallow drops keys unnecessarily
test(1545.5, key(.shallow(x1, cols=c("a1", "a3"), retain.key=TRUE)), "a1")
test(1545.6, .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE), .shallow(x1, cols=c("a3", "a1"), retain.key=TRUE))
test(1545.7, key(.shallow(x1, cols=c("a1", "a2", "a3"), retain.key=TRUE)), c("a1", "a2"))
test(1545.8, key(.shallow(x1, cols=c("a2", "a3"), retain.key=TRUE)), NULL)
test(1545.9, key(.shallow(x1, cols=c("a2"), retain.key=TRUE)), NULL)
test(1545.10, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)
setkey(x1, NULL)
test(1545.11, key(.shallow(x1, retain.key=TRUE)), NULL)
test(1545.111, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.12, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
test(1545.121, key(.shallow(x1, cols=c("a1", "a2"), retain.key=FALSE)), NULL)
x1 <- x1[0]
test(1545.13, key(.shallow(x1, retain.key=TRUE)), NULL)
test(1545.131, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.132, key(.shallow(x1, cols = c("a1"), retain.key=FALSE)), NULL)
test(1545.133, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), NULL)
setkey(x1, a1)
test(1545.134, key(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.135, key(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL)
test(1545.136, key(.shallow(x1, retain.key=TRUE)), "a1")
test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1")
test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL)

# tests for #2336. .shallow now retains indices as well
x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2")
setindex(x1, a1, a2, a3)
setindex(x1, a1, a3)
test(1545.15, indices(.shallow(x1, retain.key=FALSE)), NULL)
test(1545.16, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL)
test(1545.17, indices(.shallow(x1, retain.key=TRUE)), indices(x1))
test(1545.18, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0))
test(1545.19, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
test(1545.20, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
test(1545.21, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1"))
test(1545.22, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
test(1545.23, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE)))
test(1545.24, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1"))
test(1545.25, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
test(1545.26, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0))
test(1545.27, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3"))
test(1545.28, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0))
test(1545.29, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0))
test(1545.30, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL)
test(1545.31, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL)
test(1545.32, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE))


# test for #1234
df1 = df2 = data.frame(cats = rep(c('', ' ', 'meow'), 5))
Expand Down

0 comments on commit 61a25ba

Please sign in to comment.