Skip to content

Commit

Permalink
DT[i] now calls parallel CsubsetDT (#3210)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle authored Dec 13, 2018
1 parent 4c9a1f0 commit 3937881
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 25 deletions.
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,19 @@

6. `fread()` and `fwrite()` can now handle file names in native and UTF-8 encoding, [#3078](https://github.com/Rdatatable/data.table/issues/3078). Thanks to Daniel Possenriede (@dpprdan) for reporting and fixing.

7. `DT[i]` now calls internal parallel subsetting code, [#2951](https://github.com/Rdatatable/data.table/issues/2951). Further, if `DT` has extra attributes (e.g. user defined or inherited via `as.data.table`) those attributes are now retained. Subsetting is significantly faster (as are many other operations) with factor columns rather than character.
```R
N = 2e8
DT = data.table(ID = sample(LETTERS,N,TRUE),
V1 = sample(5,N,TRUE),
V2 = runif(N))
w = which(DT$V1 > 3) # 40% of rows
# v1.12.0 v1.11.8
system.time(DT[w]) # 0.8s 2.6s
DT[, ID := as.factor(ID)]
system.time(DT[w]) # 0.4s 2.3s
```


#### BUG FIXES

Expand Down
20 changes: 9 additions & 11 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
if (!missing(i) & is.data.table(ans)) setkey(ans,NULL) # See test 304
return(ans)
}
.global$print=""
if (missing(i) && missing(j)) {
if (!is.null(names(sys.call()))) # not using nargs() as it considers DT[,] to have 3 arguments, #3163
stop("When i and j are both missing, no other argument should be used. Empty [] is useful after := to have the result displayed; e.g. DT[,col:=val][]")
return(x)
}
if (!mult %chin% c("first","last","all")) stop("mult argument can only be 'first','last' or 'all'")
missingroll = missing(roll)
if (length(roll)!=1L || is.na(roll)) stop("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
Expand All @@ -261,14 +267,6 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
if (!is.logical(which) || length(which)>1L) stop("which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
if ((isTRUE(which)||is.na(which)) && !missing(j)) stop("which==",which," (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.")
if (!is.na(nomatch) && is.na(which)) stop("which=NA with nomatch=0 would always return an empty vector. Please change or remove either which or nomatch.")
.global$print=""
if (missing(i) && missing(j)) {
# ...[] == oops at console, forgot print(...)
# or some kind of dynamic construction that has edge case of no contents inside [...]
if (!is.null(names(sys.call()))) # not using nargs() as it considers DT[,] to have 3 arguments, #3163
stop("When i and j are both missing, no other argument should be used. Empty [] is useful after := to have the result printed.")
return(x)
}
if (!with && missing(j)) stop("j must be provided when with=FALSE")
if (missing(i) && !missing(on)) stop("i must be provided when on= is provided")
if (!missing(keyby)) {
Expand Down Expand Up @@ -724,9 +722,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
# missing(by)==TRUE was already checked above before dealing with i
if (!length(x)) return(null.data.table())
if (!length(leftcols)) {
ansvars = nx = names(x)
jisvars = character()
xcols = xcolsAns = seq_along(x)
# basic x[i] subset, #2951
return( if (is.null(irows)) x else .Call(CsubsetDT, x, irows, seq_along(x)) )
} else {
jisvars = names(i)[-leftcols]
tt = jisvars %chin% names(x)
Expand Down Expand Up @@ -1292,6 +1289,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
else ans[[target]] = ans[[target]]
}
} else {
# TODO COMMENT HERE
for (s in seq_along(xcols)) {
target = xcolsAns[s]
source = xcols[s]
Expand Down
18 changes: 10 additions & 8 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -2543,7 +2543,7 @@ test(922, setnames(DT,c("A","B","C")), data.table(A=1:2,B=3:4,C=5:6,key="B"))
DT = data.table(x=rep(1L,50000),key="x")
test(923, DT[DT], error="Join results in more than 2^31 rows (internal vecseq reached physical limit). Very likely misspecified join.")
X = data.table(x=1:2,y=1:6,key="x")
test(924.1, X[J(c(1,1,1))], X[rep(1:3,3)])
test(924.1, X[J(c(1,1,1))], setkey(X[rep(1:3,3)],NULL))
test(924.2, X[J(c(1,1,1,1))], error="Join results in 12 rows; more than 10 = nrow(x)+nrow(i). Check for duplicate key values in i each of")


Expand Down Expand Up @@ -3825,7 +3825,9 @@ test(1163, last(x), character(0))
# Bug fix for #5117 - segfault when rbindlist on empty data.tables
x <- as.data.table(BOD)
y <- copy(x)
test(1165, x[Time>100], rbindlist(list(x[Time > 100], y[Time > 200])))
test(1165.1, attr(ans<-x[Time>100], "reference"), "A1.4, p. 270") # DT[] retains attribute as from v1.12.0
attr(ans, "reference") <- NULL
test(1165.2, ans, rbindlist(list(x[Time > 100], y[Time > 200]))) # rbindlist drops attribute (no change)

# Bug fix for the #5300 - rbind(DT, NULL) should not result in error, but BOD has an attribute as well, which won't be preserved (due to C-impl). Changing test.
setattr(x <- as.data.table(BOD), 'reference', NULL)
Expand Down Expand Up @@ -12941,12 +12943,12 @@ test(1967.35, data.table(1:5, integer(0L)), error = 'Item 2 has no length')
test(1967.36, data.table(1:5, key = 5L), error = 'must be character')

x = data.table(a = 1:5)
test(1967.37, x[mult = 'none'], error = 'mult argument can only be')
test(1967.38, x[roll = c(3, 4)], error = 'roll must be a single')
test(1967.39, x[roll = NA], error = 'roll must be a single')
test(1967.40, x[roll = 'furthest'], error = 'Only valid character value')
test(1967.41, x[rollends = 1 + 3i], error = 'rollends must be a logical')
test(1967.42, x[rollends = rep(TRUE, 10L)], error = 'rollends must be length 1 or 2')
test(1967.37, x[3, mult = 'none'], error = 'mult argument can only be') # i==3 to get past 'i and j both missing' error
test(1967.38, x[3, roll = c(3, 4)], error = 'roll must be a single')
test(1967.39, x[3, roll = NA], error = 'roll must be a single')
test(1967.40, x[3, roll = 'furthest'], error = 'Only valid character value')
test(1967.41, x[3, rollends = 1 + 3i], error = 'rollends must be a logical')
test(1967.42, x[3, rollends = rep(TRUE, 10L)], error = 'rollends must be length 1 or 2')
test(1967.43, x[ , ..], error = 'symbol .. is invalid')
test(1967.44, x[NULL], data.table(NULL))
test(1967.45, x[ , NULL], NULL)
Expand Down
11 changes: 5 additions & 6 deletions src/subset.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,13 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
SEXP in = PROTECT(chmatch(key,getAttrib(ans,R_NamesSymbol), 0, TRUE)); nprotect++; // (nomatch ignored when in=TRUE)
int i = 0; while(i<LENGTH(key) && LOGICAL(in)[i]) i++;
// i is now the keylen that can be kept. 2 lines above much easier in C than R
if (i==0) {
setAttrib(ans, sym_sorted, R_NilValue);
if (i==0 || !orderedSubset) {
// clear key that was copied over by copyMostAttrib() above
setAttrib(ans, sym_sorted, R_NilValue);
} else {
if (orderedSubset) {
setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
}
// make a new key attribute; shorter if i<LENGTH(key) or same length copied so this key is safe to change by ref (setnames)
setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
}
}
setAttrib(ans, install(".data.table.locked"), R_NilValue);
Expand Down

0 comments on commit 3937881

Please sign in to comment.