From 0d5e27d562b16985f03be50c04fbcf60e6f476f3 Mon Sep 17 00:00:00 2001 From: hadley Date: Tue, 31 Mar 2015 09:36:10 -0500 Subject: [PATCH] Better support for compressed files. Closes #98 --- DESCRIPTION | 3 +- R/read_delim.R | 5 ++ R/source.R | 91 +++++++++++++++++++++++++++--------- inst/extdata/mtcars.csv.bz2 | Bin 0 -> 553 bytes inst/extdata/mtcars.csv.zip | Bin 0 -> 711 bytes man/count_fields.Rd | 10 ++-- man/datasource.Rd | 15 ++++-- man/read_delim.Rd | 15 ++++-- man/read_file.Rd | 10 ++-- man/read_fwf.Rd | 10 ++-- man/read_lines.Rd | 10 ++-- man/read_table.Rd | 10 ++-- man/tokenize.Rd | 10 ++-- 13 files changed, 141 insertions(+), 48 deletions(-) create mode 100644 inst/extdata/mtcars.csv.bz2 create mode 100644 inst/extdata/mtcars.csv.zip diff --git a/DESCRIPTION b/DESCRIPTION index 2f561370..602fb056 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,8 @@ LinkingTo: Rcpp, BH Imports: - Rcpp (>= 0.11.5) + Rcpp (>= 0.11.5), + curl Suggests: testthat, knitr, diff --git a/R/read_delim.R b/R/read_delim.R index 8b87db40..bea88b55 100644 --- a/R/read_delim.R +++ b/R/read_delim.R @@ -26,9 +26,14 @@ NULL #' # Input sources ------------------------------------------------------------- #' # Read from a path #' read_csv(system.file("extdata/mtcars.csv", package = "readr")) +#' read_csv(system.file("extdata/mtcars.csv.zip", package = "readr")) +#' read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr")) +#' read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") +#' #' # Or directly from a string (must contain a newline) #' read_csv("x,y\n1,2\n3,4") #' +#' # Column types -------------------------------------------------------------- #' # By default, readr guess the columns types, looking at the first 100 rows. #' # You can override with a compact specification: #' read_csv("x,y\n1,2\n3,4", col_types = "dc") diff --git a/R/source.R b/R/source.R index 1c3c3e2f..4bfd5333 100644 --- a/R/source.R +++ b/R/source.R @@ -1,8 +1,12 @@ #' Create a source object. #' -#' @param file Either a path to a file, a url, a connection, or literal data -#' (either a single string or a raw vector). Connections and urls are saved -#' to a temporary file before being read. +#' @param file Either a path to a file, a connection, or literal data +#' (either a single string or a raw vector). +#' +#' Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will +#' be automatically uncompressed. Files starting with \code{http://}, +#' \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically +#' downloaded. #' #' Literal data is most useful for examples and tests. It must contain at #' least one new line to be recognised as data (instead of a path). @@ -14,34 +18,39 @@ #' datasource("a,b,c\n1,2,3") #' datasource(charToRaw("a,b,c\n1,2,3")) #' -#' # Local path +#' # Strings #' datasource(system.file("extdata/mtcars.csv", package = "readr")) +#' datasource(system.file("extdata/mtcars.csv.bz2", package = "readr")) +#' datasource(system.file("extdata/mtcars.csv.zip", package = "readr")) +#' datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") #' #' # Connection #' datasource(rawConnection(charToRaw("abc\n123"))) datasource <- function(file, skip = 0) { if (inherits(file, "source")) { file - } else if (inherits(file, "connection")) { - path <- cache_con(file) - datasource_file(path, skip) + } else if (is.connection(file)) { + datasource_connection(file, skip) } else if (is.raw(file)) { datasource_raw(file, skip) } else if (is.character(file)) { if (grepl("\n", file)) { datasource_string(file, skip) - } else if (grepl("^(http|ftp|https)://", file)) { - tmp <- tempfile() - download.file(file, tmp, quiet = TRUE, mode = "wb") - datasource_file(tmp, skip) } else { - datasource_file(file, skip) + file <- standardise_path(file) + if (is.connection(file)) { + datasource_connection(file, skip) + } else { + datasource_file(file, skip) + } } } else { stop("`file` must be a string, raw vector or a connection.", call. = FALSE) } } +# Constructors ----------------------------------------------------------------- + new_datasource <- function(type, x, skip, ...) { structure(list(x, skip = skip, ...), class = c(paste0("source_", type), "source")) @@ -52,14 +61,21 @@ datasource_string <- function(text, skip) { } datasource_file <- function(path, skip) { - path <- check_file(path) + path <- check_path(path) new_datasource("file", path, skip = skip) } +datasource_connection <- function(path, skip) { + path <- cache_con(path) + datasource_file(path, skip) +} + datasource_raw <- function(text, skip) { new_datasource("text", text, skip = skip) } +# Helpers ---------------------------------------------------------------------- + cache_con <- function(con) { tmp <- tempfile() tmpcon <- file(tmp, "w+b") @@ -76,19 +92,50 @@ cache_con <- function(con) { tmp } -check_file <- function(path) { - if (!file.exists(path)) { - stop("'", path, "' does not exist", - if (!is_absolute_path(path)) - paste0(" in current working directory ('", getwd(), "')"), - ".", - call. = FALSE) - } +standardise_path <- function(path) { + if (!is.character(path)) + return(path) + + if (is_url(path)) + return(curl::curl(path)) + + path <- check_path(path) + switch(tools::file_ext(path), + gz = gzfile(path, ""), + bz2 = bzfile(path, ""), + xz = xzfile(path, ""), + zip = zipfile(path, ""), + path + ) +} + +is_url <- function(path) { + grepl("^(http|ftp)s?://", path) +} + +check_path <- function(path) { + if (file.exists(path)) + return(normalizePath(path, "/", mustWork = FALSE)) - normalizePath(path, "/", mustWork = FALSE) + stop("'", path, "' does not exist", + if (!is_absolute_path(path)) + paste0(" in current working directory ('", getwd(), "')"), + ".", + call. = FALSE + ) } is_absolute_path <- function(path) { grepl("^(/|[A-Za-z]:|\\\\|~)", path) } +zipfile <- function(path, open = "r") { + files <- utils::unzip(path, list = TRUE) + file <- files$Name[[1]] + + if (nrow(files) > 1) { + message("Multiple files in zip: reading '", file, "'") + } + + unz(path, file, open = open) +} diff --git a/inst/extdata/mtcars.csv.bz2 b/inst/extdata/mtcars.csv.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..8dd8afa1593aa9889ba1ce028bdeed6f1629e982 GIT binary patch literal 553 zcmV+^0@nRPT4*^jL0KkKSs1P&Apigw*?<5L5CwnWKIVO(Kmw%Ym;m9XfYhkTrW%YU zm^8!@QIkQCXkcnhr1Y8$j3J=(2?&7-Kn6yF(sdsbGG+_Gac6XglI}~M9z1Q=zjOWi zc(ZXWz%cvU>uy~6JoAT>b|RZ#7=W7q$bZT;E*4}+iv)pJTi)^DPBt_WAqrSds0XOC zEodBRV{QvBf+!S}yV}g$$vc{DL9C5Mq_Dx}NvVs4nkP3egyegBE)=rD8XdwG7kkA! z$k@(Nr@a@u&o~hqTg#%UT&zqB%xxExrBHGfZlTl=MqS-yz$*cPR31_yCbH>OQH?me zH<~SFMA9c#0;q6jC?kq2Di#SQ*o69hMKZ9oVncHp-(abWCr_uREhWXerRbB6`Lsf90wqzR9 zcvja^ZG=Q-OWCD{8jnG&+*mKCd%olE9mNHJ;||8U<_@i1Rf;O!`CS=Tl*W`8GCM`F rM3d@W*s);P)AaVf3&I5! zSB2~<16n5e?6W9fDeaN{d}l`CzKR23vWNXno{9F|ER(fHjB#UxcgoJ!3)_T5PAay9 zv9xnfdi~*o#*UV0N>5e=bI&_`lt{S0w!W)ia)-wa0$0_nq3YsyDImclWhN318~IT;9>T zbN@XJgHsn*pOe0nu~JLnW5t^}867X5wJhd7tG*yiUocZCQhq`i-=ry?k{ZbYCwdGr zpKX(rmx({}TvX=g0tpNG?mMb%d4FuDKhvKf?WNq~5O9?9jmWLZDGv-hKXj~F)3|wu z;oi49=e+Z}c;MO(uPa*8lP#xjd$#H5*^kbVnbzh`zNdHb6lVS^;meIIdUI<<>70Fn zIU7wq+?Re`5q#+Y-y|o|C2