r-lib · hadley · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # evaluate (development version)
 
+* `parse_all()` consistently strips the trailing `\n` from the end of each `src` vector.
 * The package now depends on R 4.0.0 in order to decrease our maintenance burden.
 * `evaluate()` automatically strips calls from conditions emitted by top-level code (these incorrectly get calls because they're wrapped inside `eval()`) (#150).
 * `evalute(include_timing)` has been deprecated. I can't find any use of it on GitHub, and it adds substantial code complexity for little gain.

diff --git a/R/parse.R b/R/parse.R
@@ -11,12 +11,15 @@
 #' A data frame with columns `src`, a character vector of source code, and 
 #' `expr`, a list-column of parsed expressions. There will be one row for each 
 #' top-level expression in `x`. A top-level expression is a complete expression 
-#' which would trigger execution if typed at the console. The `expression`
-#' object in `expr` can be of any length: it will be 0 if the top-level 
-#' expression contains only whitespace and/or comments; 1 if the top-level 
-#' expression is a single scalar (like `TRUE`, `1`, or `"x"`), name, or call; 
-#' or 2 if the top-level expression uses `;` to put multiple expressions on 
-#' one line.
+#' which would trigger execution if typed at the console. 
+#' 
+#' The trailing `\n` at the end of each `src` is implicit.
+#' 
+#' The `expression` object in `expr` can be of any length: it will be 0 if 
+#' the top-level expression contains only whitespace and/or comments; 1 if 
+#' the top-level expression is a single scalar (like `TRUE`, `1`, or `"x"`), 
+#' name, or call; or 2 if the top-level expression uses `;` to put multiple 
+#' expressions on one line.
 #' 
 #' If there are syntax errors in `x` and `allow_error = TRUE`, the data 
 #' frame will have an attribute `PARSE_ERROR` that stores the error object.
@@ -35,99 +38,64 @@ parse_all <- function(x, filename = NULL, allow_error = FALSE) UseMethod("parse_
 
 #' @export
 parse_all.character <- function(x, filename = NULL, allow_error = FALSE) {
-  if (length(grep("\n", x))) {
-    # strsplit('a\n', '\n') needs to return c('a', '') instead of c('a')
-    x <- gsub("\n$", "\n\n", x)
-    x[x == ""] <- "\n"
+  if (any(grepl("\n", x))) {
+    # Standardise to character vector with one line per element:
+    # this is the input that parse() is documented to accept
     x <- unlist(strsplit(x, "\n"), recursive = FALSE, use.names = FALSE)
   }
   n <- length(x)
 
-  if (is.null(filename))
-    filename <- "<text>"
+  filename <- filename %||% "<text>"
   src <- srcfilecopy(filename, x)
   if (allow_error) {
     exprs <- tryCatch(parse(text = x, srcfile = src), error = identity)
-    if (inherits(exprs, 'error')) return(structure(
-      data.frame(src = paste(x, collapse = '\n'), expr = I(list(expression()))),
-      PARSE_ERROR = exprs
-    ))
+    if (inherits(exprs, 'error')) {
+      return(structure(
+        data.frame(src = paste(x, collapse = '\n'), expr = empty_expr()),
+        PARSE_ERROR = exprs
+      ))
+    }
   } else {
     exprs <- parse(text = x, srcfile = src)
   }
 
-  # No code, only comments and/or empty lines
-  ne <- length(exprs)
-  if (ne == 0) {
-    return(data.frame(src = append_break(x), expr = I(rep(list(expression()), n))))
-  }
-
   srcref <- attr(exprs, "srcref", exact = TRUE)
-
-  # Stard/End line numbers of expressions
-  pos <- do.call(rbind, lapply(srcref, unclass))[, c(7, 8), drop = FALSE]
-  l1 <- pos[, 1]
-  l2 <- pos[, 2]
-  # Add a third column i to store the indices of expressions
-  pos <- cbind(pos, i = seq_len(nrow(pos)))
-  pos <- as.data.frame(pos)  # split() does not work on matrices
-
-  # Split line number pairs into groups: if the next start line is the same as
-  # the last end line, the two expressions must belong to the same group
-  spl <- cumsum(c(TRUE, l1[-1] != l2[-ne]))
-  # Extract src lines and expressions for each group; also record the start line
-  # number of this group so we can re-order src/expr later
-  res <- lapply(split(pos, spl), function(p) {
+  pos <- data.frame(
+    start = vapply(srcref, `[[`, 7, FUN.VALUE = integer(1)),
+    end = vapply(srcref, `[[`, 8, FUN.VALUE = integer(1))
+  )
+  pos$exprs <- exprs
+
+  # parse() splits TLEs that use ; into multiple expressions so we 
+  # join together expressions that overlaps on the same line(s)
+  line_group <- cumsum(is_new_line(pos$start, pos$end))
+  tles <- lapply(split(pos, line_group), function(p) {
     n <- nrow(p)
     data.frame(
-      src = paste(x[p[1, 1]:p[n, 2]], collapse = "\n"),
-      expr = I(list(exprs[p[, 3]])),
-      line = p[1, 1]
+      src = paste(x[p$start[1]:p$end[n]], collapse = "\n"),
+      expr = I(list(p$exprs)),
+      line = p$start[1]
     )
   })
-
-  # Now process empty expressions (comments/blank lines); see if there is a
-  # "gap" between the last end number + 1 and the next start number - 1
-  pos <- cbind(c(1, l2 + 1), c(l1 - 1, n))
-  pos <- pos[pos[, 1] <= pos[, 2], , drop = FALSE]
-
-  # Extract src lines from the gaps, and assign empty expressions to them
-  res <- c(res, lapply(seq_len(nrow(pos)), function(i) {
-    p <- pos[i, ]
-    r <- p[1]:p[2]
-    data.frame(
-      src = x[r],
-      expr = I(rep(list(expression()), p[2] - p[1] + 1)),
-      line = r - 1
-    )
-  }))
-
-  # Bind everything into a data frame, order it by line numbers, append \n to
-  # all src lines except the last one, and remove the line numbers
-  res <- do.call(rbind, res)
-  res <- res[order(res$line), ]
-  res$src <- append_break(res$src)
-  res$line <- NULL
-
-  # For compatibility with evaluate (<= 0.5.7): remove the last empty line (YX:
-  # I think this is a bug)
-  n <- nrow(res)
-  if (res$src[n] == "") res <- res[-n, ]
-
+  tles <- do.call(rbind, tles)
+
+  # parse() drops comments and whitespace so we add them back in
+  gaps <- data.frame(start = c(1, pos$end + 1), end = c(pos$start - 1, n))
+  gaps <- gaps[gaps$start <= gaps$end, ,]
+  # in sequence(), nvec is equivalent to length.out
+  lines <- sequence(from = gaps$start, nvec = gaps$end - gaps$start + 1)
+  comments <- data.frame(
+    src = x[lines],
+    expr = empty_expr(length(lines)),
+    line = lines
+  )
+
+  res <- rbind(tles, comments)
+  res <- res[order(res$line), c("src", "expr")]
   rownames(res) <- NULL
   res
 }
 
-# YX: It seems evaluate (<= 0.5.7) had difficulties with preserving line breaks,
-# so it ended up with adding \n to the first n-1 lines, which does not seem to
-# be necessary to me, and is actually buggy. I'm not sure if it is worth shaking
-# the earth and work with authors of reverse dependencies to sort this out. Also
-# see #42.
-append_break <- function(x) {
-  n <- length(x)
-  if (n <= 1) x else paste(x, rep(c("\n", ""), c(n - 1, 1)), sep = "")
-}
-
 #' @export
 parse_all.connection <- function(x, filename = NULL, ...) {
   if (!isOpen(x, "r")) {
@@ -185,3 +153,19 @@ parse_all.call <- function(x, filename = NULL, ...) {
   out$expr <- list(as.expression(x))
   out
 }
+
+# Helpers ---------------------------------------------------------------------
+
+empty_expr <- function(n = 1) {
+  I(rep(list(expression()), n))
+}
+
+is_new_line <- function(start, end) {
+  if (length(start) == 0) {
+    logical()
+  } else if (length(start) == 1) {
+    TRUE
+  } else {
+    c(TRUE, start[-1] != end[-length(end)])
+  }
+}
diff --git a/man/parse_all.Rd b/man/parse_all.Rd
diff --git a/tests/testthat/test-eval.R b/tests/testthat/test-eval.R
@@ -79,12 +79,6 @@ test_that("multiple expressions on one line can get printed as expected", {
   expect_output_types(ev, c("source", "text", "text"))
 })
 
-test_that("multiple lines of comments do not lose the terminating \\n", {
-  ev <- evaluate("# foo\n#bar")
-  expect_output_types(ev, c("source", "source"))
-  expect_equal(ev[[1]]$src, "# foo\n")
-})
-
 test_that("check_stop_on_error converts integer to enum", {
   expect_equal(check_stop_on_error(0), "continue")
   expect_equal(check_stop_on_error(1), "stop")

diff --git a/tests/testthat/test-parse.R b/tests/testthat/test-parse.R
@@ -1,3 +1,50 @@
+test_that("can parse even if no expressions", {
+  expect_equal(parse_all("")$src, "")
+  expect_equal(parse_all("#")$src, "#")
+  expect_equal(parse_all("#\n\n")$src, c("#", ""))
+})
+
+test_that("every TLE has an implicit nl", {
+  expect_equal(parse_all("x")$src, "x")
+  expect_equal(parse_all("x\n")$src, "x")
+  expect_equal(parse_all("")$src, "")
+  expect_equal(parse_all("\n")$src, "")
+
+  expect_equal(parse_all("{\n1\n}")$src, "{\n1\n}")
+  expect_equal(parse_all("{\n1\n}\n")$src, "{\n1\n}")
+
+  # even empty lines
+  expect_equal(parse_all("a\n\nb")$src, c("a", "", "b"))
+  expect_equal(parse_all("\n\n")$src, c("", ""))
+})
+
+test_that("a character vector is equivalent to a multi-line string", {
+  expect_equal(parse_all(c("a", "b")), parse_all(c("a\nb")))
+})
+
+test_that("recombines multi-expression TLEs", {
+  expect_equal(
+    parse_all("1;2;3")$expr[[1]],
+    expression(1, 2, 3),
+    ignore_attr = "srcref"
+  )
+  expect_equal(
+    parse_all("1+\n2;3")$expr[[1]],
+    expression(1 + 2, 3),
+    ignore_attr = "srcref"
+  )
+  expect_equal(
+    parse_all("1+\n2;3+\n4; 5")$expr[[1]],
+    expression(1 + 2, 3 + 4, 5),
+    ignore_attr = "srcref"
+  )
+})
+
+test_that("re-integrates lines without expressions", {
+  expect_equal(parse_all("1\n\n2")$src, c("1", "", "2"))
+  expect_equal(parse_all("1\n#\n2")$src, c("1", "#", "2"))
+})
+
 test_that("expr is always an expression", {
   expect_equal(parse_all("#")$expr[[1]], expression())
   expect_equal(parse_all("1")$expr[[1]], expression(1), ignore_attr = "srcref")
@@ -34,7 +81,7 @@ if (isTRUE(l10n_info()[['UTF-8']])) {
 
   test_that("multibyte characters are parsed correct", {
     code <- c("ϱ <- 1# g / ml", "äöüßÄÖÜπ <- 7 + 3# nonsense")
-    expect_identical(parse_all(code)$src, append_break(code))
+    expect_identical(parse_all(code)$src, code)
   })
 }