From 2101eae035c4dc7e27bd4b457ad6e222e1b33e5a Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Fri, 11 Nov 2022 19:44:16 -0700
Subject: [PATCH 1/2] tables(mb=type_size) faster lower bound MB by default

---
 R/tables.R    | 26 ++++++++++++++++++++++----
 man/tables.Rd |  8 ++++----
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/R/tables.R b/R/tables.R
index 5196935ed..b62516b8f 100644
--- a/R/tables.R
+++ b/R/tables.R
@@ -1,7 +1,24 @@
 # globals to pass NOTE from R CMD check, see http://stackoverflow.com/questions/9439256
 MB = NCOL = NROW = NULL
 
-tables = function(mb=TRUE, order.col="NAME", width=80,
+type_size = function(DT) {
+  # Fast lower bound, in bytes, of the memory used by DT: fixed-width cells plus pointers only. It does not descend into
+  # character string lengths or list items; pass object.size (or an alternative) to mb= for a higher, more accurate estimate.
+  # A for() is used rather than sapply() so that a very large number of columns (e.g. 1e6) needs no intermediate allocation.
+  ptr = as.numeric(.Machine$sizeof.pointer)  # all arithmetic in double: integer bytes*nrow overflows to NA beyond 2^31-1
+  lookup = c("raw"=1, "logical"=4, "integer"=4, "double"=8, "complex"=16)  # bytes per cell; any other type is counted as one pointer
+  ans = ncol(DT)*ptr  # column name pointers
+  for (i in seq_along(DT)) {
+    col = DT[[i]]
+    tt = lookup[storage.mode(col)]
+    if (is.na(tt)) tt = ptr  # character, list, etc: one pointer per row
+    ans = ans + tt*nrow(DT)
+    if (is.factor(col)) ans = ans + length(levels(col))*ptr  # plus one pointer per factor level
+  }
+  unname(ans)  # drop the name inherited from lookup so that a plain length-1 double is returned
+}
+
+tables = function(mb=type_size, order.col="NAME", width=80,
                   env=parent.frame(), silent=FALSE, index=FALSE)
 {
   # Prints name, size and colnames of all data.tables in the calling environment by default
@@ -13,6 +30,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80,
     if (!silent) catf("No objects of class data.table exist in %s\n", if (identical(env, .GlobalEnv)) ".GlobalEnv" else format(env))
     return(invisible(data.table(NULL)))
   }
+  if (isTRUE(mb)) mb=type_size  # can still use TRUE, although TRUE will now be the lower faster type_size method
   DT_names = all_obj[is_DT]
   info = rbindlist(lapply(DT_names, function(dt_n){
     DT = get(dt_n, envir=env)   # doesn't copy
@@ -20,7 +38,7 @@ tables = function(mb=TRUE, order.col="NAME", width=80,
       NAME = dt_n,
       NROW = nrow(DT),
       NCOL = ncol(DT),
-      MB = if (mb) round(as.numeric(object.size(DT))/1024^2), # object.size() is slow hence optional; TODO revisit
+      MB = if (is.function(mb)) round(as.numeric(mb(DT))/1024^2),
       COLS = list(names(DT)),
       KEY = list(key(DT)),
       INDICES = if (index) list(indices(DT)))
@@ -38,9 +56,9 @@ tables = function(mb=TRUE, order.col="NAME", width=80,
     tt = copy(info)
     tt[ , NROW := pretty_format(NROW, width=4L)]
     tt[ , NCOL := pretty_format(NCOL, width=4L)]
-    if (mb) tt[ , MB := pretty_format(MB, width=2L)]
+    if (is.function(mb)) tt[ , MB := pretty_format(MB, width=2L)]
     print(tt, class=FALSE, nrows=Inf)
-    if (mb) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=","))
+    if (is.function(mb)) catf("Total: %sMB\n", prettyNum(sum(info$MB), big.mark=","))
   }
   invisible(info)
 }
diff --git a/man/tables.Rd b/man/tables.Rd
index 5b95edffa..a8a74b0a7 100644
--- a/man/tables.Rd
+++ b/man/tables.Rd
@@ -5,11 +5,11 @@
   Convenience function for concisely summarizing some metadata of all \code{data.table}s in memory (or an optionally specified environment).
 }
 \usage{
-tables(mb=TRUE, order.col="NAME", width=80,
+tables(mb=type_size, order.col="NAME", width=80,
        env=parent.frame(), silent=FALSE, index=FALSE)
 }
 \arguments{
-  \item{mb}{ \code{logical}; \code{TRUE} adds the rough size of each \code{data.table} in megabytes to the output under column \code{MB}.  }
+  \item{mb}{ a function which accepts a \code{data.table} and returns its size in bytes. By default, \code{type_size} (same as \code{TRUE}) provides a fast lower bound by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). A column \code{"MB"} is included in the output unless \code{mb} is \code{FALSE} or \code{NULL}. }
   \item{order.col}{ Column name (\code{character}) by which to sort the output. }
   \item{width}{ \code{integer}; number of characters beyond which the output for each of the columns \code{COLS}, \code{KEY}, and \code{INDICES} are truncated. }
   \item{env}{ An \code{environment}, typically the \code{.GlobalEnv} by default, see Details. }
@@ -19,9 +19,9 @@ tables(mb=TRUE, order.col="NAME", width=80,
 \details{
 Usually \code{tables()} is executed at the prompt, where \code{parent.frame()} returns \code{.GlobalEnv}. \code{tables()} may also be useful inside functions where \code{parent.frame()} is the local scope of the function; in such a scenario, simply set it to \code{.GlobalEnv} to get the same behaviour as at prompt.
 
-Note that on older versions of \R, \code{object.size} may be slow, so setting \code{mb=FALSE} may speed up execution of \code{tables} significantly.
+\code{mb = utils::object.size} provides a higher and more accurate estimate of size, but may take longer. Its default \code{units="b"} is appropriate.
 
-Setting \code{silent=TRUE} prints nothing; the metadata are returned as a \code{data.table}, invisibly, whether silent is \code{TRUE} or \code{FALSE}.
+Setting \code{silent=TRUE} prints nothing; the metadata is returned as a \code{data.table} invisibly whether \code{silent} is \code{TRUE} or \code{FALSE}.
 }
 \value{
     A \code{data.table} containing the information printed.

From 3b124bd73dc75022303d5acb6a1d0c7ff6ab80e0 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Fri, 11 Nov 2022 19:58:20 -0700
Subject: [PATCH 2/2] news item

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 15bf7e8ea..a79bcf32b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -294,6 +294,8 @@
 
 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error.
 
+42. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns its size in bytes; e.g. `mb = utils::object.size` provides a higher and more accurate estimate, albeit more slowly.
+
 ## BUG FIXES
 
 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.