Rdatatable · jangorecki · May 28, 2020 · May 28, 2020 · May 28, 2020 · May 29, 2020
@@ -21,7 +21,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
   if (identical(script,"*.Rraw")) {
     # nocov start
     scripts = dir(fulldir, "*.Rraw.*")
-    scripts = scripts[!grepl("bench|other", scripts)]
+    scripts = scripts[!grepl("bench|other|manual", scripts)]
     scripts = gsub("[.]bz2$","",scripts)
     return(sapply(scripts, function(fn) {
       err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress))
@@ -459,3 +459,88 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
   invisible(!fail)
 }
 
+# nocov start
+is.AsIs = function(x) inherits(x, "AsIs")
+benchmark = function(num, expr, limit, tolerance=0.025, verbose=FALSE) {
+
+  .test.data.table = exists("nfail", parent.frame()) # test() can be used inside functions defined in tests.Rraw, so inherits=TRUE (default) here
+  numStr = sprintf("%.8g", num)
+  if (.test.data.table) {
+    prevtest = get("prevtest", parent.frame())
+    nfail = get("nfail", parent.frame())
+    whichfail = get("whichfail", parent.frame())
+    assign("ntest", get("ntest", parent.frame()) + 1L, parent.frame(), inherits=TRUE)   # bump number of tests run
+    lasttime = get("lasttime", parent.frame())
+    timings = get("timings", parent.frame())
+    memtest = get("memtest", parent.frame())
+    inittime = get("inittime", parent.frame())
+    filename = get("filename", parent.frame())
+    foreign = get("foreign", parent.frame())
+    showProgress = get("showProgress", parent.frame())
+    time = nTest = NULL  # to avoid 'no visible binding' note
+    on.exit( {
+      now = proc.time()[3L]
+      took = now-lasttime  # so that prep time between tests is attributed to the following test
+      assign("lasttime", now, parent.frame(), inherits=TRUE)
+      timings[ as.integer(num), `:=`(time=time+took, nTest=nTest+1L), verbose=FALSE ]
+    } )
+    if (showProgress)
+      cat("\rRunning benchmark id", numStr, "     ")
+  } else {
+    # not `test.data.table` but developer running tests manually; i.e. `cc(F); test(...)`
+    memtest = FALSE
+    filename = NA_character_
+    foreign = FALSE
+    showProgress = FALSE
+  }
+
+  sub.expr = substitute(expr)
+  stopifnot(is.call(sub.expr))
+  l = if (limit.call<-is.call(sub.limit<-substitute(limit))) system.time(limit)[["elapsed"]]
+  else if (is.numeric(limit)) limit
+  else stop("limit must be constant numeric or a call to time it")
+
+  t = system.time(expr)[["elapsed"]]
+
+  fail = FALSE
+  if (.test.data.table) {
+    if (num<prevtest+0.0000005) {
+      cat("Test id", numStr, "is not in increasing order\n")
+      fail = TRUE
+    }
+    assign("prevtest", num, parent.frame(), inherits=TRUE)
+  }
+  if (!fail) {
+    if (length(tolerance)==1L) {
+      fail = if (is.AsIs(tolerance)) {
+        t > l+tolerance
+      } else {
+        t > l*(1+tolerance)
+      }
+    } else if (length(tolerance)==2L) { ## absolute difference, test 655
+      if (tolerance[1] >= tolerance[2])
+        stop("invalid use of tolerance argument, first element must be smaller than second one, first usually being negative and second positive")
+      fail = if (is.AsIs(tolerance)) {
+        t < l+tolerance[1] || t > l+tolerance[2]
+      } else {
+        t < l*(1+tolerance[1L]) || t > l*(1+tolerance[2])
+      }
+    } else stop("tolerance must be length 1 or 2")
+    if (fail || verbose) {
+      cat(sprintf("Benchmark %s %scheck that expression:\n> ",
+                  numStr, if (fail) "failed to " else ""))
+      print(sub.expr)
+      cat("# elapsed: ", t, "\n", sep="")
+      cat(sprintf("computes within given limit and tolerance (%s):\n> ",
+                  paste(sprintf("%.3f%s", tolerance, if (is.AsIs(tolerance)) "s" else ""), collapse=",")))
+      if (limit.call) print(sub.limit) else cat(limit, "\n", sep="")
+      cat("# elapsed: ", l, "\n", sep="")
+    }
+  }
+  if (fail && .test.data.table) {
+    assign("nfail", nfail+1L, parent.frame(), inherits=TRUE)
+    assign("whichfail", c(whichfail, numStr), parent.frame(), inherits=TRUE)
+  }
+  invisible(!fail)
+}
+# nocov end
@@ -0,0 +1,127 @@
+if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
+  if ((tt<-compiler::enableJIT(-1))>0)
+    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
+} else {
+  require(data.table)
+  benchmark = data.table:::benchmark
+}
+# warm-up #2912
+d = data.table(a=1)
+d[1L] -> nul
+rm(d, nul)
+
+n=1e4
+grp1=sample(1:50,n,replace=TRUE)
+grp2=sample(1:50,n,replace=TRUE)
+dt=data.table(x=rnorm(n),y=rnorm(n),grp1=grp1,grp2=grp2)
+benchmark(120, tolerance=I(0.05), #verbose=TRUE,
+  ans <- dt[,list(.Internal(mean(x)),.Internal(mean(y))),by=c("grp1","grp2")],
+  limit = 0.05)
+i = sample(nrow(ans),1)
+stopifnot(all.equal(ans[i,c(V1,V2)], dt[grp1==ans[i,grp1] & grp2==ans[i,grp2], c(mean(x),mean(y))]))
+# To DO: add a data.frame aggregate method here and check data.table is faster
+rm(grp1, grp2, dt, i)
+
+# Test character and list columns in tables with many small groups
+N = 1000L   # the version in tests.Rraw has 100L
+DT = data.table(grp=1:(2*N),char=sample(as.hexmode(1:N),4*N,replace=TRUE),int=sample(1:N,4*N,replace=TRUE))
+benchmark(476, ans <- DT[,list(p=paste(unique(char),collapse=","), i=list(unique(int))), by=grp], limit=0.1)
+stopifnot(nrow(as.matrix(ans))==2L*N)
+rm(DT, ans)
+
+# Speed test of chmatch vs match.
+# sortedmatch was 40 times slower and the wrong approach, removed in v1.8.0.
+# Example from Tom in Jan 2011 who first found and raised the issue with sortedmatch.
+n = 1e6
+a = as.character(as.hexmode(sample(n,replace=TRUE)))
+b = as.character(as.hexmode(sample(n,replace=TRUE)))
+benchmark(529, ans2<-chmatch(a,b), limit = ans1<-match(a,b))
+stopifnot(all.equal(ans1, ans2))
+# sorting a and b no longer makes a difference since both match and chmatch work via hash in some way or another
+rm(a, b)
+
+# Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) etc, #2000
+N = 1e7
+DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N)   # 150MB
+l = as.list(DT)
+benchmark(603, sapply(DT,class), limit=sapply(l,class), tolerance=I(0.025))
+rm(l)
+
+# Tests on loopability, i.e. that overhead of [.data.table isn't huge, as in speed example in example(":=")
+# These are just to catch slow down regressions where instead of 1s it takes 40s
+benchmark(604, for (i in 1:1000) nrow(DT), limit=0.5)
+benchmark(605, for (i in 1:1000) ncol(DT), limit=0.5)
+benchmark(606, for (i in 1:1000) length(DT[[1L]]), limit=0.5) # much faster than nrow, TO DO: replace internally
+rm(DT)
+
+DT = as.data.table(matrix(1L,nrow=100000,ncol=100))
+benchmark(607, for (i in 1:1000) DT[i,V1:=i], limit=1)
+stopifnot(all.equal(DT[1:1000,V1], 1:1000))
+rm(DT)
+
+# Test faster mean.  Example from (now not needed as much) data.table wiki point 3.
+# Example is a lot of very small groups.
+set.seed(100)
+n=1e5
+DT=data.table(grp1=sample(1:750, n, replace=TRUE),
+              grp2=sample(1:750, n, replace=TRUE),
+              x=rnorm(n),
+              y=rnorm(n))
+DT[c(2,5),x:=NA]  # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect.
+DT[c(3,4),y:=NA]
+basemean = base::mean  # to isolate time of `::` itself
+benchmark(650,
+  expr =  ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2)],
+  limit = ans3<-DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)])
+ans2<-DT[,list(.Internal(mean(x)),.Internal(mean(y))),by=list(grp1,grp2)]
+stopifnot(all.equal(ans1, ans2), all.equal(ans1, ans3),
+          any(is.na(ans1$V1)) && !any(is.nan(ans1$V1)))
+benchmark(653,
+  expr =  ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)],
+  limit = ans2<-DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)])
+stopifnot(all.equal(ans1, ans2), any(is.nan(ans1$V1)))
+# See FR#2067. Here we're just testing the optimization of mean and lapply, should be comparable to above
+benchmark(655, tolerance = c(-0.2, 0.2),
+  expr =  ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)],
+  limit = ans2<-DT[,lapply(.SD,mean,na.rm=TRUE),by=list(grp1,grp2)])
+setnames(ans2,c("x","y"),c("V1","V2"))
+stopifnot(all.equal(ans1, ans2))
+rm(DT, ans1, ans2, ans3)
+
+# test the speed of simple comparison
+DT = data.table(a = 1:1e7)
+options(datatable.auto.index=FALSE)
+benchmark(1110, tolerance=I(0.1),
+  #verbose=TRUE, ## it is not always within 30% difference, warm up seems to be an imporant factor here, there was a comment before "should pass most of the time" which is still valid
+  expr = DT[a == 100],
+  limit = DT[which(a == 100)])
+options(datatable.auto.index=TRUE)  
+rm(DT)
+
+# Fix for bug #76 - DT[, .N, by=y] was slow when "y" is not a column in DT
+DT = data.table(x=sample.int(10, 1e6, replace=TRUE))
+y = DT$x
+benchmark(1143.2, tolerance=I(c(-0.1,0.1)),
+  #verbose=TRUE, ## it was abs(te1-te2)<1 before but 1s is so much here so changing to +/- 0.1s
+  expr = ans1 <- DT[, .N, by=x],
+  limit = ans2 <- DT[, .N, by=y])
+setnames(ans2,"y","x")
+stopifnot(all.equal(ans1, ans2))
+rm(DT, ans1, ans2)
+
+# Test for optimisation of 'order' to 'forder'.
+set.seed(45L)
+DT = data.table(x=sample(1e2, 1e6,TRUE), y=sample(1e2, 1e6,TRUE))
+base_order = base::order ## base R is DT's order since R 3.3, but recent DT is still faster
+benchmark(1241.2, tolerance=I(0.1),
+  expr = ans1<-DT[order(x,-y)],
+  limit = ans2<-DT[base_order(x,-y)])
+stopifnot(all.equal(ans1, ans2))
+rm(DT, ans1, ans2, base_order)
+
+# add timing test for many .SD cols #3797
+# [[ on a list column by group #4646
+# joining in a loop, order of different types #3928
+# simple access DT[10L], DT[, 3L] #3735
+# calling setDT in a loop #4476
+# multithreaded function calls by group DT[, uniqueN(a), by=b], should stress new throttle feature #4484