From 0d5e27d562b16985f03be50c04fbcf60e6f476f3 Mon Sep 17 00:00:00 2001
From: hadley <h.wickham@gmail.com>
Date: Tue, 31 Mar 2015 09:36:10 -0500
Subject: [PATCH] Better support for compressed files. Closes #98

---
 DESCRIPTION                 |   3 +-
 R/read_delim.R              |   5 ++
 R/source.R                  |  91 +++++++++++++++++++++++++++---------
 inst/extdata/mtcars.csv.bz2 | Bin 0 -> 553 bytes
 inst/extdata/mtcars.csv.zip | Bin 0 -> 711 bytes
 man/count_fields.Rd         |  10 ++--
 man/datasource.Rd           |  15 ++++--
 man/read_delim.Rd           |  15 ++++--
 man/read_file.Rd            |  10 ++--
 man/read_fwf.Rd             |  10 ++--
 man/read_lines.Rd           |  10 ++--
 man/read_table.Rd           |  10 ++--
 man/tokenize.Rd             |  10 ++--
 13 files changed, 141 insertions(+), 48 deletions(-)
 create mode 100644 inst/extdata/mtcars.csv.bz2
 create mode 100644 inst/extdata/mtcars.csv.zip

diff --git a/DESCRIPTION b/DESCRIPTION
index 2f561370..602fb056 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,7 +15,8 @@ LinkingTo:
     Rcpp,
     BH
 Imports:
-    Rcpp (>= 0.11.5)
+    Rcpp (>= 0.11.5),
+    curl
 Suggests:
     testthat,
     knitr,
diff --git a/R/read_delim.R b/R/read_delim.R
index 8b87db40..bea88b55 100644
--- a/R/read_delim.R
+++ b/R/read_delim.R
@@ -26,9 +26,14 @@ NULL
 #' # Input sources -------------------------------------------------------------
 #' # Read from a path
 #' read_csv(system.file("extdata/mtcars.csv", package = "readr"))
+#' read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
+#' read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+#' read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
+#'
 #' # Or directly from a string (must contain a newline)
 #' read_csv("x,y\n1,2\n3,4")
 #'
+#' # Column types --------------------------------------------------------------
 #' # By default, readr guess the columns types, looking at the first 100 rows.
 #' # You can override with a compact specification:
 #' read_csv("x,y\n1,2\n3,4", col_types = "dc")
diff --git a/R/source.R b/R/source.R
index 1c3c3e2f..4bfd5333 100644
--- a/R/source.R
+++ b/R/source.R
@@ -1,8 +1,12 @@
 #' Create a source object.
 #'
-#' @param file Either a path to a file, a url, a connection, or literal data
-#'    (either a single string or a raw vector). Connections and urls are saved
-#'    to a temporary file before being read.
+#' @param file Either a path to a file, a connection, or literal data
+#'    (either a single string or a raw vector).
+#'
+#'    Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+#'    be automatically uncompressed. Files starting with \code{http://},
+#'    \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+#'    downloaded.
 #'
 #'    Literal data is most useful for examples and tests. It must contain at
 #'    least one new line to be recognised as data (instead of a path).
@@ -14,34 +18,39 @@
 #' datasource("a,b,c\n1,2,3")
 #' datasource(charToRaw("a,b,c\n1,2,3"))
 #'
-#' # Local path
+#' # Strings
 #' datasource(system.file("extdata/mtcars.csv", package = "readr"))
+#' datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+#' datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
+#' datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
 #'
 #' # Connection
 #' datasource(rawConnection(charToRaw("abc\n123")))
 datasource <- function(file, skip = 0) {
   if (inherits(file, "source")) {
     file
-  } else if (inherits(file, "connection")) {
-    path <- cache_con(file)
-    datasource_file(path, skip)
+  } else if (is.connection(file)) {
+    datasource_connection(file, skip)
   } else if (is.raw(file)) {
     datasource_raw(file, skip)
   } else if (is.character(file)) {
     if (grepl("\n", file)) {
       datasource_string(file, skip)
-    } else if (grepl("^(http|ftp|https)://", file)) {
-      tmp <- tempfile()
-      download.file(file, tmp, quiet = TRUE, mode = "wb")
-      datasource_file(tmp, skip)
     } else {
-      datasource_file(file, skip)
+      file <- standardise_path(file)
+      if (is.connection(file)) {
+        datasource_connection(file, skip)
+      } else {
+        datasource_file(file, skip)
+      }
     }
   } else {
     stop("`file` must be a string, raw vector or a connection.", call. = FALSE)
   }
 }
 
+# Constructors -----------------------------------------------------------------
+
 new_datasource <- function(type, x, skip, ...) {
   structure(list(x, skip = skip, ...),
     class = c(paste0("source_", type), "source"))
@@ -52,14 +61,21 @@ datasource_string <- function(text, skip) {
 }
 
 datasource_file <- function(path, skip) {
-  path <- check_file(path)
+  path <- check_path(path)
   new_datasource("file", path, skip = skip)
 }
 
+datasource_connection <- function(path, skip) {
+  path <- cache_con(path)
+  datasource_file(path, skip)
+}
+
 datasource_raw <- function(text, skip) {
   new_datasource("text", text, skip = skip)
 }
 
+# Helpers ----------------------------------------------------------------------
+
 cache_con <- function(con) {
   tmp <- tempfile()
   tmpcon <- file(tmp, "w+b")
@@ -76,19 +92,50 @@ cache_con <- function(con) {
   tmp
 }
 
-check_file <- function(path) {
-  if (!file.exists(path)) {
-    stop("'", path, "' does not exist",
-      if (!is_absolute_path(path))
-        paste0(" in current working directory ('", getwd(), "')"),
-      ".",
-      call. = FALSE)
-  }
+standardise_path <- function(path) {
+  if (!is.character(path))
+    return(path)
+
+  if (is_url(path))
+    return(curl::curl(path))
+
+  path <- check_path(path)
+  switch(tools::file_ext(path),
+    gz = gzfile(path, ""),
+    bz2 = bzfile(path, ""),
+    xz = xzfile(path, ""),
+    zip = zipfile(path, ""),
+    path
+  )
+}
+
+is_url <- function(path) {
+  grepl("^(http|ftp)s?://", path)
+}
+
+check_path <- function(path) {
+  if (file.exists(path))
+    return(normalizePath(path, "/", mustWork = FALSE))
 
-  normalizePath(path, "/", mustWork = FALSE)
+  stop("'", path, "' does not exist",
+    if (!is_absolute_path(path))
+      paste0(" in current working directory ('", getwd(), "')"),
+    ".",
+    call. = FALSE
+  )
 }
 
 is_absolute_path <- function(path) {
   grepl("^(/|[A-Za-z]:|\\\\|~)", path)
 }
 
+zipfile <- function(path, open = "r") {
+  # Open a connection to the first file listed in the zip archive at `path`.
+  files <- utils::unzip(path, list = TRUE)
+  if (nrow(files) == 0) stop("'", path, "' contains no files.", call. = FALSE)
+  file <- files$Name[[1]]
+  if (nrow(files) > 1) {
+    message("Multiple files in zip: reading '", file, "'")
+  }
+  unz(path, file, open = open)
+}
diff --git a/inst/extdata/mtcars.csv.bz2 b/inst/extdata/mtcars.csv.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..8dd8afa1593aa9889ba1ce028bdeed6f1629e982
GIT binary patch
literal 553
zcmV+^0@nRPT4*^jL0KkKSs1P&Apigw*?<5L5CwnWKIVO(Kmw%Ym;m9XfYhkTrW%YU
zm^8!@QIkQCXkcnhr1Y8$j3J=(2?&7-Kn6yF(sdsbGG+_Gac6XglI}~M9z1Q=zjOWi
zc(ZXWz%cvU>uy~6JoAT>b|RZ#7=W7q$bZT;E*4}+iv)pJTi)^DPBt_WAqrSds0XOC
zEodBRV{QvBf+!S}yV}g$$vc{DL9C5Mq_Dx}NvVs4nkP3egyegBE)=rD8XdwG7kkA!
z$k@(Nr@a@u&o~hqTg#%UT&zqB%xxExrBHGfZlTl=MqS-yz$*cPR31_yCbH>OQH?me
zH<~SFMA9c#0;q6jC?kq2Di#SQ*o69hMKZ9oVncHp-(abWCr_uREhWXe<SaH6QKRup
zZmt0gC5D$d63%nF&H(MAjIQFK()1$3LA$$lHo2Q@wPPA=2*NbjyO0}2LJg#K=aXGN
zPI9~REUe%<T4KoCUN+tMhfP8hRPN!6l-m?(^F3H;s;kjx%~en|EHoo+UdXWtH?K8;
zEL03JPdR*M1GXARjss0B=0x))pnK)y!7&HOUz^jXs&7X2a|X>rRbB6`Lsf90wqzR9
zcvja^ZG=Q-OWCD{8jnG&+*mKCd%olE9mNHJ;||8U<_@i1Rf;O!`CS=Tl*W`8GCM`F
rM3d@W*s);P)AaV<fn)_anF`y&^y*AJ5faQX3yZlToG3^PR}m0^%LMve

literal 0
HcmV?d00001

diff --git a/inst/extdata/mtcars.csv.zip b/inst/extdata/mtcars.csv.zip
new file mode 100644
index 0000000000000000000000000000000000000000..f8b7190347f220779fb8b65a4d8b22fb48815db3
GIT binary patch
literal 711
zcmWIWW@Zs#-~ht8p7m}FNPvq$fFZXeIkBi%FS)oZG=hiW&T6U9EFk{QD8f)W<#e93
zA&=|#e?0419cx~Gw9FG)!YZY(wOj1x@3^4FfsM<g&qfrNc*X_o+qZXr+~uz?A6s90
zp5I??v+viZn*D!|Jl?KvFQ@Nc@uw<b=ibE1+MRd*UU~cPX;q?^X`pWB3YC+}h3k5H
z*O(?6XI?r#<DO37=~nX=-#q~@GdWh|9bgHv4fhfean#a2a^sgGTa1zL6}K>f3&I5!
zSB2~<16n5e?6W9fDeaN{d}l`CzKR23vWNXno{9F|ER(fHjB#UxcgoJ!3)_T5PAay9
zv9xnfdi~*o#*UV0N>5e=bI&_`l<oYV52<q`PCYDGo4Q7)`?HYMrkQi%0{s8mo13-w
zEnvwzArXIQ*R(6*9&4nIe0>t{S0w!W)ia)-wa0$0_nq3YsyDImclWhN318~IT;9>T
zbN@XJgHsn*pOe0nu~JLnW5t^}867X5wJhd7tG*yiUocZCQhq`i-=ry?k{ZbYCwdGr
zpKX(rmx({}TvX=g0tpNG?mMb%d4FuDKhvKf?WNq~5O9?9jmWLZDGv-hKXj~F)3|wu
z;oi49=e+Z}c;MO(uPa*8lP#xjd$#H5*^kbVnbzh`zNdHb6lVS^;meIIdUI<<>70Fn
zIU7wq+?Re`5q#+Y-y|o|C2<bjr*q!jDk+z^qwr~o)SB~c;RV5G(&pN7EjSXoe7A<!
znl$SxO8=i3%W+o!+Yv3(RTd-~^-v;jnMHC{OyJqC3BUgQ{_tg@gJZn$MD4FXrdwA3
zV-N6V=V)FTu}77OfkB*=fg!+~kx7&pk=&5wxEOd~iO*q4BZ!Hd_&DH+FTk6X4J6A5
Mgf1XeK(iSb0PGnvC;$Ke

literal 0
HcmV?d00001

diff --git a/man/count_fields.Rd b/man/count_fields.Rd
index 4d82a276..8c4d197e 100644
--- a/man/count_fields.Rd
+++ b/man/count_fields.Rd
@@ -7,9 +7,13 @@
 count_fields(file, tokenizer, skip = 0)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
diff --git a/man/datasource.Rd b/man/datasource.Rd
index f7d48096..54ce5191 100644
--- a/man/datasource.Rd
+++ b/man/datasource.Rd
@@ -7,9 +7,13 @@
 datasource(file, skip = 0)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
@@ -24,8 +28,11 @@ Create a source object.
 datasource("a,b,c\\n1,2,3")
 datasource(charToRaw("a,b,c\\n1,2,3"))
 
-# Local path
+# Strings
 datasource(system.file("extdata/mtcars.csv", package = "readr"))
+datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
+datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
 
 # Connection
 datasource(rawConnection(charToRaw("abc\\n123")))
diff --git a/man/read_delim.Rd b/man/read_delim.Rd
index 9e8b43d9..db1b9b83 100644
--- a/man/read_delim.Rd
+++ b/man/read_delim.Rd
@@ -21,9 +21,13 @@ read_tsv(file, col_names = TRUE, col_types = NULL, na = "NA", skip = 0,
   n_max = -1, progress = interactive())
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
@@ -88,9 +92,14 @@ decimal separator.
 # Input sources -------------------------------------------------------------
 # Read from a path
 read_csv(system.file("extdata/mtcars.csv", package = "readr"))
+read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
+read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
+read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
+
 # Or directly from a string (must contain a newline)
 read_csv("x,y\\n1,2\\n3,4")
 
+# Column types --------------------------------------------------------------
 # By default, readr guess the columns types, looking at the first 100 rows.
 # You can override with a compact specification:
 read_csv("x,y\\n1,2\\n3,4", col_types = "dc")
diff --git a/man/read_file.Rd b/man/read_file.Rd
index 82bd0c55..e99d8928 100644
--- a/man/read_file.Rd
+++ b/man/read_file.Rd
@@ -7,9 +7,13 @@
 read_file(file)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
diff --git a/man/read_fwf.Rd b/man/read_fwf.Rd
index afd39e3d..b2a16818 100644
--- a/man/read_fwf.Rd
+++ b/man/read_fwf.Rd
@@ -17,9 +17,13 @@ fwf_widths(widths, col_names = NULL)
 fwf_positions(start, end, col_names = NULL)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
diff --git a/man/read_lines.Rd b/man/read_lines.Rd
index 5ffb76cc..9931b2ee 100644
--- a/man/read_lines.Rd
+++ b/man/read_lines.Rd
@@ -7,9 +7,13 @@
 read_lines(file, n_max = -1L)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
diff --git a/man/read_table.Rd b/man/read_table.Rd
index 3b6be1d9..3b0031f7 100644
--- a/man/read_table.Rd
+++ b/man/read_table.Rd
@@ -8,9 +8,13 @@ read_table(file, col_names = TRUE, col_types = NULL, na = "NA",
   skip = 0, n_max = -1)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}
diff --git a/man/tokenize.Rd b/man/tokenize.Rd
index 8f4c88fd..9f282e1e 100644
--- a/man/tokenize.Rd
+++ b/man/tokenize.Rd
@@ -7,9 +7,13 @@
 tokenize(file, tokenizer = tokenizer_csv(), n = NA_integer_)
 }
 \arguments{
-\item{file}{Either a path to a file, a url, a connection, or literal data
-   (either a single string or a raw vector). Connections and urls are saved
-   to a temporary file before being read.
+\item{file}{Either a path to a file, a connection, or literal data
+   (either a single string or a raw vector).
+
+   Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
+   be automatically uncompressed. Files starting with \code{http://},
+   \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
+   downloaded.
 
    Literal data is most useful for examples and tests. It must contain at
    least one new line to be recognised as data (instead of a path).}