diff --git a/DESCRIPTION b/DESCRIPTION index b079b96..c362640 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: atime Type: Package Title: Asymptotic Timing -Version: 2024.10.5 +Version: 2024.11.5 Authors@R: c( person("Toby", "Hocking", email="toby.hocking@r-project.org", @@ -37,5 +37,7 @@ Suggests: dplyr, tidyr, nc, - RColorBrewer + RColorBrewer, + tibble, + Matrix VignetteBuilder: knitr diff --git a/NEWS b/NEWS index 9696449..0fee56d 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,11 @@ +Changes in version 2024.11.5 + +- new feature: atime_grid() stores a data table of parameters that were specified as input, with corresponding new column names in +- atime() measurements table, +- references_best() measurements and plot.references tables, +- predict() tables: measurements and predictions tables. +- new sparse vignette which shows how to make a custom plot with facets defined by variables that were used in atime_grid(). + Changes in version 2024.10.5 - bugfix: atime_test again outputs historical versions which were specified in ... diff --git a/R/atime.R b/R/atime.R index b966dcc..197fda9 100644 --- a/R/atime.R +++ b/R/atime.R @@ -67,18 +67,25 @@ atime_grid <- function nrow(value.mat), ncol(value.mat)) name.value.vec <- apply(name.value.mat, 1, paste, collapse=collapse) out.list <- list() + out.param.list <- list() for(expr.name in names(elist)){ for(row.i in 1:nrow(param.dt)){ param.name.value <- name.value.vec[[row.i]] out.name <- paste0(expr.name, expr.param.sep, param.name.value) - param.row.list <- as.list(param.dt[row.i]) + param.row <- param.dt[row.i] + param.row.list <- as.list(param.row) param.row.list[symbol.params] <- lapply( param.row.list[symbol.params], as.symbol) out.list[[out.name]] <- eval(substitute( substitute(EXPR, param.row.list), list(EXPR=elist[[expr.name]]))) + out.param.list[[paste(expr.name, row.i)]] <- data.table( + expr.name=out.name, + expr.grid=expr.name, + param.row) } } + attr(out.list, "parameters") <- rbindlist(out.param.list) out.list } @@ -190,6 +197,12 @@ atime <- function(N=default_N(), setup, expr.list=NULL, times=10, seconds.limit= seconds="median", more.units) measurements <- rbindlist(metric.dt.list) + expr.list.params <- attr(expr.list,"parameters") + by.vec <- "expr.name" + if(is.data.table(expr.list.params)){ + measurements <- expr.list.params[measurements, on="expr.name"] + by.vec <- names(expr.list.params) + } only.one <- measurements[, .(sizes=.N), by=expr.name][sizes==1] if(nrow(only.one)){ warning("please increase max N or seconds.limit, because only one N was evaluated for expr.name: ", paste(only.one[["expr.name"]], collapse=", ")) @@ -198,7 +211,8 @@ atime <- function(N=default_N(), setup, expr.list=NULL, times=10, seconds.limit= list( unit.col.vec=unit.col.vec, seconds.limit=seconds.limit, - measurements=measurements), + measurements=measurements, + by.vec=by.vec), class="atime") } diff --git a/R/predict.R b/R/predict.R index 238926f..c533e0e 100644 --- a/R/predict.R +++ b/R/predict.R @@ -43,7 +43,7 @@ predict.references_best <- function(object, ...){ unit.value, N=10^approx(log10.empirical, log10.N, log10(unit.value))$y)] } - }, by=expr.name] + }, by=c(object$by.vec)] not.NA <- pred.dt[!is.na(N)] if(nrow(not.NA)==0){ stop(unit, "=", unit.value, " is outside range of data, please change to a value that intersects at least one of the empirical curves") diff --git a/R/references.R b/R/references.R index b3f6157..6900ae0 100644 --- a/R/references.R +++ b/R/references.R @@ -78,21 +78,21 @@ references_best <- function(L, fun.list=NULL){ all.refs <- DT[ , references(N, .SD[[col.name]], lower.limit, fun.list), - by=expr.name] - all.refs[, rank := rank(-N), by=.(expr.name, fun.name)] + by=c(L$by.vec)] + all.refs[, rank := rank(-N), by=c(L$by.vec, "fun.name")] second <- all.refs[rank==2] second[, dist := log10(empirical/reference) ] second[, sign := sign(dist)] - l.cols <- list(overall="expr.name", each.sign=c("expr.name","sign")) + l.cols <- list(overall=L$by.vec, each.sign=c(L$by.vec,"sign")) for(best.type in names(l.cols)){ by <- l.cols[[best.type]] second[ , paste0(best.type,".rank") := rank(abs(dist)) - , by=by] + , by=c(by)] } ref.dt.list[[unit]] <- data.table(unit, all.refs[ second, - on=.(expr.name, fun.name, fun.latex)]) + on=c(L$by.vec, "fun.name", "fun.latex")]) best <- second[overall.rank==1, .(expr.name, fun.name, fun.latex)] metric.dt.list[[unit]] <- data.table(unit, best[ DT, on=.(expr.name) @@ -108,7 +108,8 @@ references_best <- function(L, fun.list=NULL){ seconds.limit=L[["seconds.limit"]], references=ref.dt, plot.references=ref.dt[each.sign.rank==1], - measurements=do.call(rbind, metric.dt.list)), + measurements=do.call(rbind, metric.dt.list), + by.vec=L[["by.vec"]]), class="references_best") } @@ -133,17 +134,17 @@ plot.references_best <- function(x, ...){ } gg <- gg+ ggplot2::geom_ribbon(ggplot2::aes( - N, ymin=min, ymax=max), + N, ymin=min, ymax=max, group=expr.name), data=meas[unit=="seconds"], fill=emp.color, alpha=0.5)+ ggplot2::geom_line(ggplot2::aes( - N, empirical), + N, empirical, group=expr.name), size=2, color=emp.color, data=meas)+ ggplot2::geom_line(ggplot2::aes( - N, reference, group=fun.name), + N, reference, group=paste(expr.name, fun.name)), color=ref.color, size=1, data=ref.dt)+ diff --git a/tests/testthat/test-CRAN.R b/tests/testthat/test-CRAN.R index 53e75ae..c2a13b3 100644 --- a/tests/testthat/test-CRAN.R +++ b/tests/testthat/test-CRAN.R @@ -167,7 +167,15 @@ test_that("atime_grid symbol.params arg OK", { ), foo=FUN(regex, text, proto, perl = PERL), symbol.params="FUN") - expect_identical(grid.result, list("foo PERL=TRUE,FUN=strcapture"=quote(strcapture(regex, text, proto, perl=TRUE)))) + expected <- list( + "foo PERL=TRUE,FUN=strcapture"=quote( + strcapture(regex, text, proto, perl=TRUE))) + attr(expected,"parameters") <- data.table( + expr.name="foo PERL=TRUE,FUN=strcapture", + expr.grid="foo", + PERL=TRUE, + FUN="strcapture") + expect_identical(grid.result, expected) }) test_that("atime_grid error for list of funs", { @@ -429,3 +437,52 @@ test_that("atime_test outputs historical versions", { Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue expect_identical(names(atest), c("setup", "expr", "Slow", "Fast")) }) + +test_that("atime_grid parameters attribute", { + library(Matrix) + param.list <- list( + non_zeros=c("N","N^2/10"), + fun=c("matrix","Matrix") + ) + (expr.list <- atime::atime_grid( + param.list, + mult_vector={ + L[[fun]][[non_zeros]]%*%w + data.frame(in_size=N) + }, + add_one={ + L[[fun]][[non_zeros]]+1 + data.frame(in_size=N) + }, + collapse="\n")) + expected.names <- c("expr.name","expr.grid","non_zeros", "fun") + expect_identical(names(attr(expr.list,"parameters")), expected.names) + mult.result <- atime::atime( + N=as.integer(10^seq(1,2,by=0.25)), + setup={ + L <- list() + set.seed(1) + w <- rnorm(N) + for(non_zeros in param.list$non_zeros){ + N.not.zero <- as.integer(eval(str2lang(non_zeros))) + m <- matrix(0, N, N) + m[sample(N^2, N.not.zero)] <- rnorm(N.not.zero) + for(fun in param.list$fun){ + L[[fun]][[non_zeros]] <- get(fun)(as.numeric(m), N, N) + } + } + }, + foo={ + w+1 + data.frame(in_size=N) + }, + result=TRUE, + expr.list=expr.list) + expect_in(expected.names, names(mult.result$measurements)) + mult.refs <- atime::references_best(mult.result) + expect_in(expected.names, names(mult.refs$measurements)) + expect_in(expected.names, names(mult.refs$plot.references)) + mult.pred <- predict(mult.refs, in_size=50) + expect_in(expected.names, names(mult.pred$measurements)) + expect_in(expected.names, names(mult.pred$prediction)) +}) diff --git a/vignettes/sparse.Rmd b/vignettes/sparse.Rmd new file mode 100644 index 0000000..c8a6ea4 --- /dev/null +++ b/vignettes/sparse.Rmd @@ -0,0 +1,251 @@ + + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +options(width=120) +``` + +In this vignette, we compare the computation time/memory usage of +dense `matrix` and sparse `Matrix`. We begin with an analysis of the +time/memory it takes to create these objects, along with a `vector` +for comparison: + +```{r} +library(Matrix) +len <- function(x)data.frame(length=length(x)) +vec.mat.result <- atime::atime( + N=10^seq(1,7,by=0.25), + vector=len(numeric(N)), + matrix=len(matrix(0, N, N)), + Matrix=len(Matrix(0, N, N)), + result=TRUE) +plot(vec.mat.result) +``` + +The plot above shows three panels, one for each unit. + +* `kilobytes` is the amount of memory used. We see that `Matrix` and + `vector` use the same amount of memory asymptotically, whereas + `matrix` uses more (larger slope on the log-log plot implies larger + asymptotic complexity class). +* `length` is the value returned by the `length` function. We see that + `matrix` and `Matrix` have the same value, whereas `vector` has + asymptotically smaller length (smaller slope on log-log plot). +* `seconds` is the amount of time taken. We see that `Matrix` is + slower than `vector` and `matrix` by a small constant overhead, + which can be seen for small `N`. We also see that for large `N`, + `Matrix` and `vector` have the same asymptotic time complexity, + which is much faster than `matrix`. + +Below we estimate the best asymptotic complexity classes: + +```{r} +vec.mat.best <- atime::references_best(vec.mat.result) +plot(vec.mat.best) +``` + +The plot above shows that + +* `matrix` has time, memory, and `length` which are all quadratic `O(N^2)`. +* `Matrix` has linear `O(N)` time and memory, but `O(N^2)` values for + `length`. +* `vector` has time, memory, and `length` which are all linear `O(N)`. + +Below we estimate the throughput for some given limits: + +```{r} +vec.mat.pred <- predict(vec.mat.best, seconds=vec.mat.result$seconds.limit, kilobytes=1000, length=1e6) +plot(vec.mat.pred) +``` + +In the plot above we can see the throughput `N` for a given limit of +`kilobytes`, `length` or `seconds`. Below we use `Matrix` as a +reference, and compute the throughput ratio, `Matrix` to other. + +```{r} +library(data.table) +dcast(vec.mat.pred$prediction[ +, ratio := N[expr.name=="Matrix"]/N, by=unit +], unit + unit.value ~ expr.name, value.var="ratio") +``` + +From the table above (`matrix` column), we can see that the throughput +of `Matrix` is 100-1000x larger than `matrix`, for the given limits. + +## Matrix Multiplication, 90% sparsity + +First we show the difference between sparse and dense matrix +multiplication, when the matrix has 90% zeros (10% non-zeros). + +```{r} +library(Matrix) +sparse.prop <- 0.9 +dense.prop <- 1-sparse.prop +mult.result <- atime::atime( + N=as.integer(10^seq(1,4,by=0.25)), + setup={ + m <- matrix(0, N, N) + set.seed(1) + w <- rnorm(N) + N.not.zero <- as.integer(dense.prop*N^2) + m[sample(N^2, N.not.zero)] <- rnorm(N.not.zero) + M <- Matrix(m) + }, + sparse = M %*% w, + dense = m %*% w, + result=TRUE) +plot(mult.result) +``` + +Above we see that `sparse` is faster than `dense`, but by constant +factors. +Below we estimate the best asymptotic complexity classes: + +```{r} +mult.best <- atime::references_best(mult.result) +plot(mult.best) +``` + +Above we see that both `sparse` and `dense` matrix multiplication are +quadratic `O(N^2)` time (for a quadratic number of non-zero entries). + +Finally, we verify below that both methods yield the same result: + +```{r} +library(data.table) +mult.compare <- dcast( + mult.result$measurements, N ~ expr.name, value.var="result" +)[ +, equal := paste(all.equal(as.numeric(dense[[1]]), as.numeric(sparse[[1]]))) +, by=N +][] +tibble::tibble(mult.compare) +``` + +## Matrix multiplication, linear number of non-zeros + +Next we show the difference between sparse and dense matrix +multiplication, when the matrix has a linear number of non-zeros +(asymptotically fewer than in the previous section). + +```{r} +library(Matrix) +mult.result <- atime::atime( + N=as.integer(10^seq(1,4,by=0.25)), + setup={ + m <- matrix(0, N, N) + set.seed(1) + w <- rnorm(N) + N.not.zero <- N + m[sample(N^2, N.not.zero)] <- rnorm(N.not.zero) + M <- Matrix(m) + }, + sparse = M %*% w, + dense = m %*% w, + result=TRUE) +plot(mult.result) +``` + +Above we see that `sparse` is asymptotically faster than `dense` (different asymptotic slopes). +Below we estimate the best asymptotic complexity classes: + +```{r} +mult.best <- atime::references_best(mult.result) +plot(mult.best) +``` + +Above we see that `sparse` is linear `O(N)` time whereas `dense` is +quadratic `O(N^2)` time (for a linear number of non-zero entries). + +Finally, we verify below that both methods yield the same result: + +```{r} +library(data.table) +mult.compare <- dcast( + mult.result$measurements, N ~ expr.name, value.var="result" +)[ +, equal := paste(all.equal(as.numeric(dense[[1]]), as.numeric(sparse[[1]]))) +, by=N +][] +tibble::tibble(mult.compare) +``` + +## Matrix multiplication, linear and quadratic number of non-zeros + +In this section we show how you can code both comparisons at the same +time, without repetition. The trick is to first define a list of parameters to vary: + +```{r} +param.list <- list( + non_zeros=c("N","N^2/10"), + fun=c("matrix","Matrix") +) +``` + +After that we create a grid of expressions to evaluate, by expanding the parameter grid: + +```{r} +(expr.list <- atime::atime_grid( + param.list, + Mw=L[[fun]][[non_zeros]]%*%w, + collapse="\n")) +``` + +Finally we pass the list of expressions to `atime`, along with a +`setup` argument which creates the required list `L` of input data, +based on the parameters: + +```{r} +mult.result <- atime::atime( + N=as.integer(10^seq(1,3.5,by=0.25)), + setup={ + L <- list() + set.seed(1) + w <- rnorm(N) + for(non_zeros in param.list$non_zeros){ + N.not.zero <- as.integer(eval(str2lang(non_zeros))) + m <- matrix(0, N, N) + m[sample(N^2, N.not.zero)] <- rnorm(N.not.zero) + for(fun in param.list$fun){ + L[[fun]][[non_zeros]] <- get(fun)(as.numeric(m), N, N) + } + } + }, + expr.list=expr.list) +plot(mult.result) +``` + +Below we estimate the best asymptotic complexity classes: + +```{r} +mult.best <- atime::references_best(mult.result) +plot(mult.best) +``` + +Below we show an alternative visualization: + +```{r} +only.seconds <- mult.best +only.seconds$measurements <- mult.best$measurements[unit=="seconds"] +only.seconds$plot.references <- mult.best$plot.references[unit=="seconds"] +library(ggplot2) +plot(only.seconds)+ + facet_grid(non_zeros ~ fun, labeller=label_both) +``` + +## Conclusion + +Overall in this vignette we have shown how `atime` can be used to +demonstrate when sparse matrices can be used for efficient +computations. + +* sparse matrices have linear rather than quadratic time/memory for creation. +* sparse matrix-vector multiply is asymptotically faster (linear + rather than quadratic time) if there are a linear number of non-zero + elements.