Better support for compressed files. Closes #98

tidyverse · Mar 31, 2015 · 0d5e27d · 0d5e27d
1 parent d6bb0ba
commit 0d5e27d
Show file tree

Hide file tree

Showing 13 changed files with 141 additions and 48 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,7 +15,8 @@ LinkingTo:
     Rcpp,
     BH
 Imports:
-    Rcpp (>= 0.11.5)
+    Rcpp (>= 0.11.5),
+    curl
 Suggests:
     testthat,
     knitr,

diff --git a/R/read_delim.R b/R/read_delim.R
@@ -26,9 +26,14 @@ NULL
 #' # Input sources -------------------------------------------------------------
 #' # Read from a path
 #' read_csv(system.file("extdata/mtcars.csv", package = "readr"))
+#' read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
+#' read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+#' read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
+#'
 #' # Or directly from a string (must contain a newline)
 #' read_csv("x,y\n1,2\n3,4")
 #'
+#' # Column types --------------------------------------------------------------
 #' # By default, readr guesses the column types, looking at the first 100 rows.
 #' # You can override with a compact specification:
 #' read_csv("x,y\n1,2\n3,4", col_types = "dc")

diff --git a/R/source.R b/R/source.R
@@ -1,8 +1,12 @@
 #' Create a source object.
 #'
-#' @param file Either a path to a file, a url, a connection, or literal data
-#'    (either a single string or a raw vector). Connections and urls are saved
-#'    to a temporary file before being read.
+#' @param file Either a path to a file, a connection, or literal data
+#'    (either a single string or a raw vector).
+#'
+#'    Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+#'    be automatically uncompressed. Files starting with \code{http://},
+#'    \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+#'    downloaded.
 #'
 #'    Literal data is most useful for examples and tests. It must contain at
 #'    least one new line to be recognised as data (instead of a path).
@@ -14,34 +18,39 @@
 #' datasource("a,b,c\n1,2,3")
 #' datasource(charToRaw("a,b,c\n1,2,3"))
 #'
-#' # Local path
+#' # Strings
 #' datasource(system.file("extdata/mtcars.csv", package = "readr"))
+#' datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+#' datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
+#' datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
 #'
 #' # Connection
 #' datasource(rawConnection(charToRaw("abc\n123")))
 datasource <- function(file, skip = 0) {
   if (inherits(file, "source")) {
     file
-  } else if (inherits(file, "connection")) {
-    path <- cache_con(file)
-    datasource_file(path, skip)
+  } else if (is.connection(file)) {
+    datasource_connection(file, skip)
   } else if (is.raw(file)) {
     datasource_raw(file, skip)
   } else if (is.character(file)) {
     if (grepl("\n", file)) {
       datasource_string(file, skip)
-    } else if (grepl("^(http|ftp|https)://", file)) {
-      tmp <- tempfile()
-      download.file(file, tmp, quiet = TRUE, mode = "wb")
-      datasource_file(tmp, skip)
     } else {
-      datasource_file(file, skip)
+      file <- standardise_path(file)
+      if (is.connection(file)) {
+        datasource_connection(file, skip)
+      } else {
+        datasource_file(file, skip)
+      }
     }
   } else {
     stop("`file` must be a string, raw vector or a connection.", call. = FALSE)
   }
 }
 
+# Constructors -----------------------------------------------------------------
+
 new_datasource <- function(type, x, skip, ...) {
   structure(list(x, skip = skip, ...),
     class = c(paste0("source_", type), "source"))
@@ -52,14 +61,21 @@ datasource_string <- function(text, skip) {
 }
 
 datasource_file <- function(path, skip) {
-  path <- check_file(path)
+  path <- check_path(path)
   new_datasource("file", path, skip = skip)
 }
 
+# Build a file source from a connection: cache_con() copies the connection's
+# contents to a temporary file and returns its path, which is then wrapped
+# as an ordinary file source.
+datasource_connection <- function(path, skip) {
+  datasource_file(cache_con(path), skip)
+}
+
 # Wrap literal data as a "text" source, passing `skip` through unchanged.
 datasource_raw <- function(text, skip) {
   new_datasource(type = "text", x = text, skip = skip)
 }
 
+# Helpers ----------------------------------------------------------------------
+
 cache_con <- function(con) {
   tmp <- tempfile()
   tmpcon <- file(tmp, "w+b")
@@ -76,19 +92,50 @@ cache_con <- function(con) {
   tmp
 }
 
-check_file <- function(path) {
-  if (!file.exists(path)) {
-    stop("'", path, "' does not exist",
-      if (!is_absolute_path(path))
-        paste0(" in current working directory ('", getwd(), "')"),
-      ".",
-      call. = FALSE)
-  }
+# Turn a user-supplied `path` into something datasource() can consume.
+#
+# * Non-character input is returned untouched.
+# * URLs (see is_url()) become a curl connection.
+# * Existing local files ending in gz/bz2/xz/zip become the matching
+#   decompressing connection; any other extension returns the normalised
+#   path produced by check_path().
+#
+# The extension is lower-cased before dispatch so that "data.CSV.GZ" is
+# handled the same way as "data.csv.gz". check_path() stops with an error
+# when the file does not exist.
+standardise_path <- function(path) {
+  if (!is.character(path)) {
+    return(path)
+  }
+
+  if (is_url(path)) {
+    return(curl::curl(path))
+  }
+
+  path <- check_path(path)
+  switch(tolower(tools::file_ext(path)),
+    gz = gzfile(path, ""),
+    bz2 = bzfile(path, ""),
+    xz = xzfile(path, ""),
+    zip = zipfile(path, ""),
+    path
+  )
+}
+
+# Does `path` start with a supported URL scheme (http://, https://, ftp://
+# or ftps://)? URI scheme names are case-insensitive (RFC 3986, section 3.1),
+# so "HTTP://..." is recognised as well. Vectorised over `path`.
+is_url <- function(path) {
+  grepl("^(http|ftp)s?://", path, ignore.case = TRUE)
+}
+
+check_path <- function(path) {
+  if (file.exists(path))
+    return(normalizePath(path, "/", mustWork = FALSE))
 
-  normalizePath(path, "/", mustWork = FALSE)
+  stop("'", path, "' does not exist",
+    if (!is_absolute_path(path))
+      paste0(" in current working directory ('", getwd(), "')"),
+    ".",
+    call. = FALSE
+  )
 }
 
 # Does `path` look absolute? TRUE when it begins with "/", a drive letter
 # followed by ":", a backslash, or "~". Vectorised over `path`.
 is_absolute_path <- function(path) {
   absolute_prefix <- "^(/|[A-Za-z]:|\\\\|~)"
   grepl(absolute_prefix, path)
 }
 
+# Open a connection to the first entry of a zip archive.
+#
+# @param path Path to a .zip file.
+# @param open Mode passed on to unz(); defaults to "r".
+# @return An unz() connection for the first file listed in the archive.
+#
+# Emits a message when the archive lists more than one file, since only the
+# first is read. An archive whose listing is empty stops with an explicit
+# error instead of failing on the `files$Name[[1]]` subscript.
+zipfile <- function(path, open = "r") {
+  files <- utils::unzip(path, list = TRUE)
+
+  if (nrow(files) == 0) {
+    stop("'", path, "' contains no files.", call. = FALSE)
+  }
+
+  file <- files$Name[[1]]
+  if (nrow(files) > 1) {
+    message("Multiple files in zip: reading '", file, "'")
+  }
+
+  unz(path, file, open = open)
+}
diff --git a/inst/extdata/mtcars.csv.bz2 b/inst/extdata/mtcars.csv.bz2
diff --git a/inst/extdata/mtcars.csv.zip b/inst/extdata/mtcars.csv.zip
diff --git a/man/count_fields.Rd b/man/count_fields.Rd
@@ -7,9 +7,13 @@
 count_fields(file, tokenizer, skip = 0)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}

diff --git a/man/datasource.Rd b/man/datasource.Rd
@@ -7,9 +7,13 @@
 datasource(file, skip = 0)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
@@ -24,8 +28,11 @@ Create a source object.
 datasource("a,b,c\\n1,2,3")
 datasource(charToRaw("a,b,c\\n1,2,3"))
 
-# Local path
+# Strings
 datasource(system.file("extdata/mtcars.csv", package = "readr"))
+datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
+datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
 
 # Connection
 datasource(rawConnection(charToRaw("abc\\n123")))

diff --git a/man/read_delim.Rd b/man/read_delim.Rd
@@ -21,9 +21,13 @@ read_tsv(file, col_names = TRUE, col_types = NULL, na = "NA", skip = 0,
   n_max = -1, progress = interactive())
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
@@ -88,9 +92,14 @@ decimal separator.
 # Input sources -------------------------------------------------------------
 # Read from a path
 read_csv(system.file("extdata/mtcars.csv", package = "readr"))
+read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
+read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
+
 # Or directly from a string (must contain a newline)
 read_csv("x,y\\n1,2\\n3,4")
 
+# Column types --------------------------------------------------------------
 # By default, readr guesses the column types, looking at the first 100 rows.
 # You can override with a compact specification:
 read_csv("x,y\\n1,2\\n3,4", col_types = "dc")

diff --git a/man/read_file.Rd b/man/read_file.Rd
@@ -7,9 +7,13 @@
 read_file(file)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}

diff --git a/man/read_fwf.Rd b/man/read_fwf.Rd
@@ -17,9 +17,13 @@ fwf_widths(widths, col_names = NULL)
 fwf_positions(start, end, col_names = NULL)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}

diff --git a/man/read_lines.Rd b/man/read_lines.Rd
@@ -7,9 +7,13 @@
 read_lines(file, n_max = -1L)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}

diff --git a/man/read_table.Rd b/man/read_table.Rd
@@ -8,9 +8,13 @@ read_table(file, col_names = TRUE, col_types = NULL, na = "NA",
   skip = 0, n_max = -1)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}

diff --git a/man/tokenize.Rd b/man/tokenize.Rd
@@ -7,9 +7,13 @@
 tokenize(file, tokenizer = tokenizer_csv(), n = NA_integer_)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}