diff --git a/DESCRIPTION b/DESCRIPTION index 2f561370..602fb056 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,8 @@ LinkingTo: Rcpp, BH Imports: - Rcpp (>= 0.11.5) + Rcpp (>= 0.11.5), + curl Suggests: testthat, knitr, diff --git a/R/read_delim.R b/R/read_delim.R index 8b87db40..bea88b55 100644 --- a/R/read_delim.R +++ b/R/read_delim.R @@ -26,9 +26,14 @@ NULL #' # Input sources ------------------------------------------------------------- #' # Read from a path #' read_csv(system.file("extdata/mtcars.csv", package = "readr")) +#' read_csv(system.file("extdata/mtcars.csv.zip", package = "readr")) +#' read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr")) +#' read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") +#' #' # Or directly from a string (must contain a newline) #' read_csv("x,y\n1,2\n3,4") #' +#' # Column types -------------------------------------------------------------- #' # By default, readr guess the columns types, looking at the first 100 rows. #' # You can override with a compact specification: #' read_csv("x,y\n1,2\n3,4", col_types = "dc") diff --git a/R/source.R b/R/source.R index 1c3c3e2f..4bfd5333 100644 --- a/R/source.R +++ b/R/source.R @@ -1,8 +1,12 @@ #' Create a source object. #' -#' @param file Either a path to a file, a url, a connection, or literal data -#' (either a single string or a raw vector). Connections and urls are saved -#' to a temporary file before being read. +#' @param file Either a path to a file, a connection, or literal data +#' (either a single string or a raw vector). +#' +#' Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will +#' be automatically uncompressed. Files starting with \code{http://}, +#' \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically +#' downloaded. #' #' Literal data is most useful for examples and tests. It must contain at #' least one new line to be recognised as data (instead of a path). 
@@ -14,34 +18,39 @@ #' datasource("a,b,c\n1,2,3") #' datasource(charToRaw("a,b,c\n1,2,3")) #' -#' # Local path +#' # Strings #' datasource(system.file("extdata/mtcars.csv", package = "readr")) +#' datasource(system.file("extdata/mtcars.csv.bz2", package = "readr")) +#' datasource(system.file("extdata/mtcars.csv.zip", package = "readr")) +#' datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") #' #' # Connection #' datasource(rawConnection(charToRaw("abc\n123"))) datasource <- function(file, skip = 0) { if (inherits(file, "source")) { file - } else if (inherits(file, "connection")) { - path <- cache_con(file) - datasource_file(path, skip) + } else if (is.connection(file)) { + datasource_connection(file, skip) } else if (is.raw(file)) { datasource_raw(file, skip) } else if (is.character(file)) { if (grepl("\n", file)) { datasource_string(file, skip) - } else if (grepl("^(http|ftp|https)://", file)) { - tmp <- tempfile() - download.file(file, tmp, quiet = TRUE, mode = "wb") - datasource_file(tmp, skip) } else { - datasource_file(file, skip) + file <- standardise_path(file) + if (is.connection(file)) { + datasource_connection(file, skip) + } else { + datasource_file(file, skip) + } } } else { stop("`file` must be a string, raw vector or a connection.", call. = FALSE) } } +# Constructors ----------------------------------------------------------------- + new_datasource <- function(type, x, skip, ...) 
{ structure(list(x, skip = skip, ...), class = c(paste0("source_", type), "source")) @@ -52,14 +61,21 @@ datasource_string <- function(text, skip) { } datasource_file <- function(path, skip) { - path <- check_file(path) + path <- check_path(path) new_datasource("file", path, skip = skip) } +datasource_connection <- function(path, skip) { + path <- cache_con(path) + datasource_file(path, skip) +} + datasource_raw <- function(text, skip) { new_datasource("text", text, skip = skip) } +# Helpers ---------------------------------------------------------------------- + cache_con <- function(con) { tmp <- tempfile() tmpcon <- file(tmp, "w+b") @@ -76,19 +92,58 @@ cache_con <- function(con) { tmp } -check_file <- function(path) { - if (!file.exists(path)) { - stop("'", path, "' does not exist", - if (!is_absolute_path(path)) - paste0(" in current working directory ('", getwd(), "')"), - ".", - call. = FALSE) - } +# Turn a path into the most specific input: URLs and compressed files become +# connections, anything else is returned as a checked, normalised path. +# The extension match is case-insensitive, so `data.CSV.GZ` is uncompressed too. +standardise_path <- function(path) { + if (!is.character(path)) + return(path) + + if (is_url(path)) + return(curl::curl(path)) + + path <- check_path(path) + switch(tolower(tools::file_ext(path)), + gz = gzfile(path, ""), + bz2 = bzfile(path, ""), + xz = xzfile(path, ""), + zip = zipfile(path, ""), + path + ) +} + +is_url <- function(path) { + grepl("^(http|ftp)s?://", path) +} + +check_path <- function(path) { + if (file.exists(path)) + return(normalizePath(path, "/", mustWork = FALSE)) - normalizePath(path, "/", mustWork = FALSE) + stop("'", path, "' does not exist", + if (!is_absolute_path(path)) + paste0(" in current working directory ('", getwd(), "')"), + ".", + call. 
= FALSE + ) } is_absolute_path <- function(path) { grepl("^(/|[A-Za-z]:|\\\\|~)", path) } +# Open a connection to the first file listed in a zip archive. An archive with +# no entries is an error; one with several entries triggers a message. +zipfile <- function(path, open = "r") { + files <- utils::unzip(path, list = TRUE) + if (nrow(files) == 0) { + stop("'", path, "' is an empty zip archive.", call. = FALSE) + } + + file <- files$Name[[1]] + if (nrow(files) > 1) { + message("Multiple files in zip: reading '", file, "'") + } + + unz(path, file, open = open) +} diff --git a/inst/extdata/mtcars.csv.bz2 b/inst/extdata/mtcars.csv.bz2 new file mode 100644 index 00000000..8dd8afa1 Binary files /dev/null and b/inst/extdata/mtcars.csv.bz2 differ diff --git a/inst/extdata/mtcars.csv.zip b/inst/extdata/mtcars.csv.zip new file mode 100644 index 00000000..f8b71903 Binary files /dev/null and b/inst/extdata/mtcars.csv.zip differ diff --git a/man/count_fields.Rd b/man/count_fields.Rd index 4d82a276..8c4d197e 100644 --- a/man/count_fields.Rd +++ b/man/count_fields.Rd @@ -7,9 +7,13 @@ count_fields(file, tokenizer, skip = 0) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} diff --git a/man/datasource.Rd b/man/datasource.Rd index f7d48096..54ce5191 100644 --- a/man/datasource.Rd +++ b/man/datasource.Rd @@ -7,9 +7,13 @@ datasource(file, skip = 0) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. 
+\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} @@ -24,8 +28,11 @@ Create a source object. datasource("a,b,c\\n1,2,3") datasource(charToRaw("a,b,c\\n1,2,3")) -# Local path +# Strings datasource(system.file("extdata/mtcars.csv", package = "readr")) +datasource(system.file("extdata/mtcars.csv.bz2", package = "readr")) +datasource(system.file("extdata/mtcars.csv.zip", package = "readr")) +datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") # Connection datasource(rawConnection(charToRaw("abc\\n123"))) diff --git a/man/read_delim.Rd b/man/read_delim.Rd index 9e8b43d9..db1b9b83 100644 --- a/man/read_delim.Rd +++ b/man/read_delim.Rd @@ -21,9 +21,13 @@ read_tsv(file, col_names = TRUE, col_types = NULL, na = "NA", skip = 0, n_max = -1, progress = interactive()) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} @@ -88,9 +92,14 @@ decimal separator. 
# Input sources ------------------------------------------------------------- # Read from a path read_csv(system.file("extdata/mtcars.csv", package = "readr")) +read_csv(system.file("extdata/mtcars.csv.zip", package = "readr")) +read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr")) +read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv") + # Or directly from a string (must contain a newline) read_csv("x,y\\n1,2\\n3,4") +# Column types -------------------------------------------------------------- # By default, readr guess the columns types, looking at the first 100 rows. # You can override with a compact specification: read_csv("x,y\\n1,2\\n3,4", col_types = "dc") diff --git a/man/read_file.Rd b/man/read_file.Rd index 82bd0c55..e99d8928 100644 --- a/man/read_file.Rd +++ b/man/read_file.Rd @@ -7,9 +7,13 @@ read_file(file) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} diff --git a/man/read_fwf.Rd b/man/read_fwf.Rd index afd39e3d..b2a16818 100644 --- a/man/read_fwf.Rd +++ b/man/read_fwf.Rd @@ -17,9 +17,13 @@ fwf_widths(widths, col_names = NULL) fwf_positions(start, end, col_names = NULL) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. 
+\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} diff --git a/man/read_lines.Rd b/man/read_lines.Rd index 5ffb76cc..9931b2ee 100644 --- a/man/read_lines.Rd +++ b/man/read_lines.Rd @@ -7,9 +7,13 @@ read_lines(file, n_max = -1L) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} diff --git a/man/read_table.Rd b/man/read_table.Rd index 3b6be1d9..3b0031f7 100644 --- a/man/read_table.Rd +++ b/man/read_table.Rd @@ -8,9 +8,13 @@ read_table(file, col_names = TRUE, col_types = NULL, na = "NA", skip = 0, n_max = -1) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). 
+ + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).} diff --git a/man/tokenize.Rd b/man/tokenize.Rd index 8f4c88fd..9f282e1e 100644 --- a/man/tokenize.Rd +++ b/man/tokenize.Rd @@ -7,9 +7,13 @@ tokenize(file, tokenizer = tokenizer_csv(), n = NA_integer_) } \arguments{ -\item{file}{Either a path to a file, a url, a connection, or literal data - (either a single string or a raw vector). Connections and urls are saved - to a temporary file before being read. +\item{file}{Either a path to a file, a connection, or literal data + (either a single string or a raw vector). + + Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will + be automatically uncompressed. Files starting with \code{http://}, + \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically + downloaded. Literal data is most useful for examples and tests. It must contain at least one new line to be recognised as data (instead of a path).}