Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

benchmark #4517

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
87 changes: 86 additions & 1 deletion R/test.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
if (identical(script,"*.Rraw")) {
# nocov start
scripts = dir(fulldir, "*.Rraw.*")
scripts = scripts[!grepl("bench|other", scripts)]
scripts = scripts[!grepl("bench|other|manual", scripts)]
scripts = gsub("[.]bz2$","",scripts)
return(sapply(scripts, function(fn) {
err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress))
Expand Down Expand Up @@ -459,3 +459,88 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
invisible(!fail)
}

# nocov start
is.AsIs = function(x) inherits(x, "AsIs")
benchmark = function(num, expr, limit, tolerance=0.025, verbose=FALSE) {

.test.data.table = exists("nfail", parent.frame()) # test() can be used inside functions defined in tests.Rraw, so inherits=TRUE (default) here
numStr = sprintf("%.8g", num)
if (.test.data.table) {
prevtest = get("prevtest", parent.frame())
nfail = get("nfail", parent.frame())
whichfail = get("whichfail", parent.frame())
assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE) # bump number of tests run
lasttime = get("lasttime", parent.frame())
timings = get("timings", parent.frame())
memtest = get("memtest", parent.frame())
inittime = get("inittime", parent.frame())
filename = get("filename", parent.frame())
foreign = get("foreign", parent.frame())
showProgress = get("showProgress", parent.frame())
time = nTest = NULL # to avoid 'no visible binding' note
on.exit( {
now = proc.time()[3L]
took = now-lasttime # so that prep time between tests is attributed to the following test
assign("lasttime", now, parent.frame(), inherits=TRUE)
timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ]
} )
if (showProgress)
cat("\rRunning benchmark id", numStr, " ")
} else {
# not `test.data.table` but developer running tests manually; i.e. `cc(F); test(...)`
memtest = FALSE
filename = NA_character_
foreign = FALSE
showProgress = FALSE
}

sub.expr = substitute(expr)
stopifnot(is.call(sub.expr))
l = if (limit.call<-is.call(sub.limit<-substitute(limit))) system.time(limit)[["elapsed"]]
else if (is.numeric(limit)) limit
else stop("limit must be constant numeric or a call to time it")

t = system.time(expr)[["elapsed"]]

fail = FALSE
if (.test.data.table) {
if (num<prevtest+0.0000005) {
cat("Test id", numStr, "is not in increasing order\n")
fail = TRUE
}
assign("prevtest", num, parent.frame(), inherits=TRUE)
}
if (!fail) {
if (length(tolerance)==1L) {
fail = if (is.AsIs(tolerance)) {
t > l+tolerance
} else {
t > l*(1+tolerance)
}
} else if (length(tolerance)==2L) { ## absolute difference, test 655
if (tolerance[1] >= tolerance[2])
stop("invalid use of tolerance argument, first element must be smaller than second one, first usually being negative and second positive")
fail = if (is.AsIs(tolerance)) {
t < l+tolerance[1] || t > l+tolerance[2]
} else {
t < l*(1+tolerance[1L]) || t > l*(1+tolerance[2])
}
} else stop("tolerance must be length 1 or 2")
if (fail || verbose) {
cat(sprintf("Benchmark %s %scheck that expression:\n> ",
numStr, if (fail) "failed to " else ""))
print(sub.expr)
cat("# elapsed: ", t, "\n", sep="")
cat(sprintf("computes within given limit and tolerance (%s):\n> ",
paste(sprintf("%.3f%s", tolerance, if (is.AsIs(tolerance)) "s" else ""), collapse=",")))
if (limit.call) print(sub.limit) else cat(limit, "\n", sep="")
cat("# elapsed: ", l, "\n", sep="")
}
}
if (fail && .test.data.table) {
assign("nfail", nfail+1L, parent.frame(), inherits=TRUE)
assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE)
}
invisible(!fail)
}
# nocov end
170 changes: 0 additions & 170 deletions inst/tests/benchmark.Rraw

This file was deleted.

127 changes: 127 additions & 0 deletions inst/tests/benchmarks.Rraw
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
if ((tt<-compiler::enableJIT(-1))>0)
cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
} else {
require(data.table)
benchmark = data.table:::benchmark
}
# warm-up #2912
d = data.table(a=1)
d[1L] -> nul
rm(d, nul)

n=1e4
grp1=sample(1:50,n,replace=TRUE)
grp2=sample(1:50,n,replace=TRUE)
dt=data.table(x=rnorm(n),y=rnorm(n),grp1=grp1,grp2=grp2)
benchmark(120, tolerance=I(0.05), #verbose=TRUE,
ans <- dt[,list(.Internal(mean(x)),.Internal(mean(y))),by=c("grp1","grp2")],
limit = 0.05)
i = sample(nrow(ans),1)
stopifnot(all.equal(ans[i,c(V1,V2)], dt[grp1==ans[i,grp1] & grp2==ans[i,grp2], c(mean(x),mean(y))]))
# To DO: add a data.frame aggregate method here and check data.table is faster
rm(grp1, grp2, dt, i)

# Test character and list columns in tables with many small groups
N = 1000L # the version in tests.Rraw has 100L
DT = data.table(grp=1:(2*N),char=sample(as.hexmode(1:N),4*N,replace=TRUE),int=sample(1:N,4*N,replace=TRUE))
benchmark(476, ans <- DT[,list(p=paste(unique(char),collapse=","), i=list(unique(int))), by=grp], limit=0.1)
stopifnot(nrow(as.matrix(ans))==2L*N)
rm(DT, ans)

# Speed test of chmatch vs match.
# sortedmatch was 40 times slower and the wrong approach, removed in v1.8.0.
# Example from Tom in Jan 2011 who first found and raised the issue with sortedmatch.
n = 1e6
a = as.character(as.hexmode(sample(n,replace=TRUE)))
b = as.character(as.hexmode(sample(n,replace=TRUE)))
benchmark(529, ans2<-chmatch(a,b), limit = ans1<-match(a,b))
stopifnot(all.equal(ans1, ans2))
# sorting a and b no longer makes a difference since both match and chmatch work via hash in some way or another
rm(a, b)

# Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) etc, #2000
N = 1e7
DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 150MB
l = as.list(DT)
benchmark(603, sapply(DT,class), limit=sapply(l,class), tolerance=I(0.025))
rm(l)

# Tests on loopability, i.e. that overhead of [.data.table isn't huge, as in speed example in example(":=")
# These are just to catch slow down regressions where instead of 1s it takes 40s
benchmark(604, for (i in 1:1000) nrow(DT), limit=0.5)
benchmark(605, for (i in 1:1000) ncol(DT), limit=0.5)
benchmark(606, for (i in 1:1000) length(DT[[1L]]), limit=0.5) # much faster than nrow, TO DO: replace internally
rm(DT)

DT = as.data.table(matrix(1L,nrow=100000,ncol=100))
benchmark(607, for (i in 1:1000) DT[i,V1:=i], limit=1)
stopifnot(all.equal(DT[1:1000,V1], 1:1000))
rm(DT)

# Test faster mean. Example from (now not needed as much) data.table wiki point 3.
# Example is a lot of very small groups.
set.seed(100)
n=1e5
DT=data.table(grp1=sample(1:750, n, replace=TRUE),
grp2=sample(1:750, n, replace=TRUE),
x=rnorm(n),
y=rnorm(n))
DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect.
DT[c(3,4),y:=NA]
basemean = base::mean # to isolate time of `::` itself
benchmark(650,
expr = ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2)],
limit = ans3<-DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)])
ans2<-DT[,list(.Internal(mean(x)),.Internal(mean(y))),by=list(grp1,grp2)]
stopifnot(all.equal(ans1, ans2), all.equal(ans1, ans3),
any(is.na(ans1$V1)) && !any(is.nan(ans1$V1)))
benchmark(653,
expr = ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)],
limit = ans2<-DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)])
stopifnot(all.equal(ans1, ans2), any(is.nan(ans1$V1)))
# See FR#2067. Here we're just testing the optimization of mean and lapply, should be comparable to above
benchmark(655, tolerance = c(-0.2, 0.2),
expr = ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)],
limit = ans2<-DT[,lapply(.SD,mean,na.rm=TRUE),by=list(grp1,grp2)])
setnames(ans2,c("x","y"),c("V1","V2"))
stopifnot(all.equal(ans1, ans2))
rm(DT, ans1, ans2, ans3)

# test the speed of simple comparison
DT = data.table(a = 1:1e7)
options(datatable.auto.index=FALSE)
benchmark(1110, tolerance=I(0.1),
#verbose=TRUE, ## it is not always within 30% difference, warm up seems to be an imporant factor here, there was a comment before "should pass most of the time" which is still valid
expr = DT[a == 100],
limit = DT[which(a == 100)])
options(datatable.auto.index=TRUE)
rm(DT)

# Fix for bug #76 - DT[, .N, by=y] was slow when "y" is not a column in DT
DT = data.table(x=sample.int(10, 1e6, replace=TRUE))
y = DT$x
benchmark(1143.2, tolerance=I(c(-0.1,0.1)),
#verbose=TRUE, ## it was abs(te1-te2)<1 before but 1s is so much here so changing to +/- 0.1s
expr = ans1 <- DT[, .N, by=x],
limit = ans2 <- DT[, .N, by=y])
setnames(ans2,"y","x")
stopifnot(all.equal(ans1, ans2))
rm(DT, ans1, ans2)

# Test for optimisation of 'order' to 'forder'.
set.seed(45L)
DT = data.table(x=sample(1e2, 1e6,TRUE), y=sample(1e2, 1e6,TRUE))
base_order = base::order ## base R is DT's order since R 3.3, but recent DT is still faster
benchmark(1241.2, tolerance=I(0.1),
expr = ans1<-DT[order(x,-y)],
limit = ans2<-DT[base_order(x,-y)])
stopifnot(all.equal(ans1, ans2))
rm(DT, ans1, ans2, base_order)

# add timing test for many .SD cols #3797
# [[ on a list column by group #4646
# joining in a loop, order of different types #3928
# simple access DT[10L], DT[, 3L] #3735
# calling setDT in a loop #4476
# multithreaded function calls by group DT[, uniqueN(a), by=b], should stress new throttle feature #4484
Loading