-
Notifications
You must be signed in to change notification settings - Fork 991
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added an atime test for performance improvement in transform() brought by #5493 #6515
Conversation
Generated via commit c423bf6 Download link for the artifact containing the test results: ↓ atime-results.zip Time taken to finish the standard R installation steps: 3 minutes and 47 seconds Time taken to run |
I'm seeing the following result from current CI wdir <- "data.table-pr6515"
dir.create(wdir)
file.copy("~/Downloads/atime-results.zip",wdir)
unzip(file.path(wdir,"atime-results.zip"),exdir=wdir)
(objs <- load(file.path(wdir,"tests.RData")))
library(data.table)
pkg.refs <- atime::references_best(pkg.results[["transform improved in #5493"]])
pkg.pred <- predict(pkg.refs, seconds=0.01, kilobytes=1000)
dcast(pkg.pred$pre[expr.name%in%c("Fast","Slow")], unit ~ expr.name, value.var="N")[, speedup := Fast/Slow][]
dcast(pkg.refs$mea[N==1000 & expr.name%in%c("Fast","Slow")], unit ~ expr.name, value.var="empirical")[, speedup := Slow/Fast][] result I got from CI is 1.2-5x speedup at N=1000, 1.2-1.5x ratios for N the code can handle at time=0.01 seconds and memory=1000kb. > dcast(pkg.pred$pre[expr.name%in%c("Fast","Slow")], unit ~ expr.name, value.var="N")[, speedup := Fast/Slow][]
Key: <unit>
unit Fast Slow speedup
<char> <num> <num> <num>
1: kilobytes 62199.52 41407.29 1.502139
2: seconds 950249.49 759291.99 1.251494
> dcast(pkg.refs$mea[N==1000 & expr.name%in%c("Fast","Slow")], unit ~ expr.name, value.var="empirical")[, speedup := Slow/Fast][]
Key: <unit>
unit Fast Slow speedup
<char> <num> <num> <num>
1: kilobytes 3.182812e+01 3.970313e+01 1.247423
2: seconds 4.173355e-05 2.110621e-04 5.057371 |
Right, we are not observing that big of a speedup as in Ofek's example. The code for the test case though is exactly the same as OP's post so I'm wondering where the difference is or if its even reproducible. As per the plot, N doesn't reach 107 within the default |
on my computer I get similar results with same test code as CI. I also tried adding assignment pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace <- function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
new.Package_ <- paste0(Package_, "_", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\\("%s"\\)', old.Package),
sprintf('packageVersion\\("%s"\\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "_", new.Package_)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
}
expr.list <- atime::atime_versions_exprs(
pkg.path="~/R/data.table",
pkg.edit.fun=pkg.edit.fun,
expr = dt <- data.table:::transform.data.table(dt, y = round(x)),
ParentFirst="0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
Slow = "bf499090c0e6fd5cb492bf8b1603d93c1ee21dfb", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7) of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7") # Merge commit
library(data.table)
vres <- atime::atime(
N=2^seq(1,20),
setup = {
set.seed(1)
df <- data.frame(x = runif(N))
dt <- as.data.table(df)
},
expr.list=expr.list,
df=df <- transform(df, y = round(x)))
vref <- atime::references_best(vres)
vpre <- predict(vref)
plot(vpre) is anybody else able to reproduce the speedups that @OfekShilon observed in #5493 (comment) ? |
I am able to reproduce without atime > df <- data.frame(x = runif(n = 1e6))
> dt <- as.data.table(df)
system.time(df <- transform(df, y = round(x)))
user system elapsed
0.05 0.00 0.03
> system.time(dt <- data.table.0895fa247afcf6b38044bd5f56c0d209691ddb31:::transform.data.table(dt, y = round(x)))
user system elapsed
2.32 0.02 2.39
> system.time(dt <- data.table.0895fa247afcf6b38044bd5f56c0d209691ddb31:::transform.data.table(dt, y = round(x)))
user system elapsed
0.05 0.00 0.05 above is consistent with #5493 (comment) and we see that it is only slow the first transform, not the second. so maybe creating the new column is slow? below we first transform df and then create dt, and we see it is not slow. > df <- data.frame(x = runif(n = 1e6))
> system.time(df <- transform(df, y = round(x)))
user system elapsed
0.04 0.00 0.03
> dt <- as.data.table(df)
> system.time(dt <- data.table.0895fa247afcf6b38044bd5f56c0d209691ddb31:::transform.data.table(dt, y = round(x)))
user system elapsed
0.03 0.02 0.03
> system.time(dt <- data.table.0895fa247afcf6b38044bd5f56c0d209691ddb31:::transform.data.table(dt, y = round(x)))
user system elapsed
0.03 0.00 0.05 |
I am able to reproduce on my machine, with
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace <- function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
new.Package_ <- paste0(Package_, "_", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\\("%s"\\)', old.Package),
sprintf('packageVersion\\("%s"\\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "_", new.Package_)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
}
expr.list <- atime::atime_versions_exprs(
pkg.path="~/R/data.table",
pkg.edit.fun=pkg.edit.fun,
expr = data.table:::transform.data.table(dt, y = round(x)),
Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7", # Merge commit
Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31") # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
library(data.table)
vres <- atime::atime(
N=2^seq(1,20),
setup = {
set.seed(1)
df <- data.frame(x = runif(N))
dt <- as.data.table(df)
},
expr.list=expr.list,
df=transform(df, y = round(x)))
vref <- atime::references_best(vres)
vpre <- predict(vref)
plot(vpre)
|
Great find! I think the changes partially might have been brought in sometime before the commit we are using if that's the case (since the change in commit SHA is showing difference)
100%! |
great thanks |
Tested here