-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
{dplyr} functions may be slower than {base} variants #27
Comments
Performance results for library(magrittr)
select_base_nse <- function(data, ...){
el <- rlang::exprs(...)
if(length(el)>0){
sels <- as.character(el)
if(any(stringr::str_detect(sels,"-"))){
rems <- stringr::str_remove(sels,"-") %>% stringr::str_trim()
data <- data[setdiff(colnames(data),rems)]
}else{
data <- data[as.character(el)]
nms <- names(el)
if(!is.null(nms)){
nms <- nms[nchar(nms)>0]
eln <- el[nms]
if(length(eln)>0){
data <- rename_base(data, new_names = nms, old_names = as.character(eln))
}
}
}
}
data
}
rename_base <- function(data, old_names, new_names){
cn <- colnames(data)
cnt <- seq_along(cn)
names(cnt) <- cn
cn[cnt[old_names]] <- new_names
colnames(data) <- cn
data
}
rename_base_nse <- function(data, ...){
el <- rlang::exprs(...)
if(length(el)>0){
rns <- names(el)
ons <- as.character(el)
data <- rename_base(data, new_names = rns, old_names = ons)
}
data
}
microbenchmark::microbenchmark(iris[c("Sepal.Length", "Sepal.Width")],
select_base_nse(iris, Sepal.Length, Sepal.Width),
dplyr::select(iris, Sepal.Length, Sepal.Width))
#> Unit: microseconds
#> expr min lq
#> iris[c("Sepal.Length", "Sepal.Width")] 19.245 29.0815
#> select_base_nse(iris, Sepal.Length, Sepal.Width) 107.342 151.1770
#> dplyr::select(iris, Sepal.Length, Sepal.Width) 3164.217 3782.3915
#> mean median uq max neval
#> 46.03761 34.8545 45.3320 563.221 100
#> 536.62575 185.3885 250.8195 21389.570 100
#> 11904.59006 4771.9840 5913.3940 687420.029 100
microbenchmark::microbenchmark(iris[c("Sepal.Length", "Sepal.Width")] %>%
rename_base(old_names = "Sepal.Width",new_names = "tst"),
select_base_nse(iris, Sepal.Length, tst = Sepal.Width),
dplyr::select(iris, Sepal.Length, tst = Sepal.Width))
#> Unit: microseconds
#> expr
#> iris[c("Sepal.Length", "Sepal.Width")] %>% rename_base(old_names = "Sepal.Width", new_names = "tst")
#> select_base_nse(iris, Sepal.Length, tst = Sepal.Width)
#> dplyr::select(iris, Sepal.Length, tst = Sepal.Width)
#> min lq mean median uq max neval
#> 146.686 203.992 282.3938 234.5685 306.6285 1394.582 100
#> 113.329 165.503 256.7217 197.5770 233.0720 4912.468 100
#> 2988.879 3273.697 4284.6715 3810.1890 4915.2480 11775.896 100
microbenchmark::microbenchmark(rename_base(iris, old_names = "Sepal.Width",new_names = "tst"),
rename_base_nse(iris, tst = Sepal.Width),
dplyr::rename(iris, tst = Sepal.Width))
#> Unit: microseconds
#> expr min
#> rename_base(iris, old_names = "Sepal.Width", new_names = "tst") 8.126
#> rename_base_nse(iris, tst = Sepal.Width) 26.515
#> dplyr::rename(iris, tst = Sepal.Width) 1886.813
#> lq mean median uq max neval
#> 11.1195 19.46322 18.176 23.950 75.696 100
#> 44.6905 139.76679 56.879 73.771 7528.003 100
#> 2099.1435 2680.21856 2405.771 2993.796 7425.366 100 Created on 2020-04-01 by the reprex package (v0.3.0) |
The suppressPackageStartupMessages(library(dplyr))
microbenchmark::microbenchmark(
iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)),
aggregate(iris["Petal.Width"], by = iris["Species"], mean)
)
#> Unit: microseconds
#> expr min
#> iris %>% group_by(Species) %>% summarise(m = mean(Petal.Width)) 5753.665
#> aggregate(iris["Petal.Width"], by = iris["Species"], mean) 711.617
#> lq mean median uq max neval
#> 6590.798 8429.525 7926.363 8722.229 45395.958 100
#> 878.403 1171.984 1016.535 1310.548 5534.706 100 Created on 2020-04-01 by the reprex package (v0.3.0) |
test for suppressPackageStartupMessages(library(dplyr))
inner_join_base <- function(x, y, by = NULL, suffix = c(".x",".y")){
if(is.null(by)){
by <- intersect(colnames(x), colnames(y))
}
nmd <- !is.null(names(by))
if(nmd){
merge(x, y, by.x = names(by), by.y = as.character(by), all = F)
}else{
merge(x, y, by = by, all = F)
}
}
N <- 1e3
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = T)])
d2 <- tibble(x = rbinom(N, 20, runif(N)/2), LTbig = LETTERS[sample(26, size = N, replace = T)])
microbenchmark::microbenchmark(inner_join(d1, d2, by = "x"),
inner_join_base(d1, d2, by = "x"))
#> Unit: milliseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 5.779751 6.715674 9.149584 7.771339
#> inner_join_base(d1, d2, by = "x") 107.614133 121.283896 137.676605 132.975117
#> uq max neval
#> 9.656868 39.93566 100
#> 145.775458 218.41258 100
N <- 1e1
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = T)])
d2 <- tibble(x = rbinom(N, 20, runif(N)/2), LTbig = LETTERS[sample(26, size = N, replace = T)])
microbenchmark::microbenchmark(inner_join(d1, d2, by = "x"),
inner_join_base(d1, d2, by = "x"))
#> Unit: microseconds
#> expr min lq mean median
#> inner_join(d1, d2, by = "x") 4712.326 5212.467 6553.3317 5834.278
#> inner_join_base(d1, d2, by = "x") 599.572 711.618 901.5904 808.482
#> uq max neval
#> 7069.130 22728.984 100
#> 1042.195 3578.614 100 Created on 2020-04-01 by the reprex package (v0.3.0) on large data (n>300), it is faster. |
Test for filter library(dplyr, quietly = T)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
N <- 1e4
d1 <- tibble(x = rbinom(N, 20, runif(N)), lt = letters[sample(26, size = N, replace = T)])
microbenchmark::microbenchmark(
d1 %>% filter(x>5, lt<"t"),
d1[d1$x>5 & d1$lt< "t",]
)
#> Unit: milliseconds
#> expr min lq mean median uq
#> d1 %>% filter(x > 5, lt < "t") 8.783170 9.319876 11.468130 9.897210 13.040256
#> d1[d1$x > 5 & d1$lt < "t", ] 4.828221 5.028363 6.140927 5.166494 6.557656
#> max neval
#> 24.62050 100
#> 13.52971 100 Created on 2020-04-03 by the reprex package (v0.3.0) |
See this SO. |
I think few of them can be transferred to base without changing much of the codebase
Like
The text was updated successfully, but these errors were encountered: