Skip to content

Commit

Permalink
Better support for compressed files. Closes #98
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Mar 31, 2015
1 parent d6bb0ba commit 0d5e27d
Show file tree
Hide file tree
Showing 13 changed files with 141 additions and 48 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ LinkingTo:
Rcpp,
BH
Imports:
Rcpp (>= 0.11.5)
Rcpp (>= 0.11.5),
curl
Suggests:
testthat,
knitr,
Expand Down
5 changes: 5 additions & 0 deletions R/read_delim.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,14 @@ NULL
#' # Input sources -------------------------------------------------------------
#' # Read from a path
#' read_csv(system.file("extdata/mtcars.csv", package = "readr"))
#' read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
#' read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
#' read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
#'
#' # Or directly from a string (must contain a newline)
#' read_csv("x,y\n1,2\n3,4")
#'
#' # Column types --------------------------------------------------------------
#' # By default, readr guesses the column types, looking at the first 100 rows.
#' # You can override with a compact specification:
#' read_csv("x,y\n1,2\n3,4", col_types = "dc")
Expand Down
91 changes: 69 additions & 22 deletions R/source.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
#' Create a source object.
#'
#' @param file Either a path to a file, a url, a connection, or literal data
#' (either a single string or a raw vector). Connections and urls are saved
#' to a temporary file before being read.
#' @param file Either a path to a file, a connection, or literal data
#' (either a single string or a raw vector).
#'
#' Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
#' be automatically uncompressed. Files starting with \code{http://},
#' \code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
#' downloaded.
#'
#' Literal data is most useful for examples and tests. It must contain at
#' least one new line to be recognised as data (instead of a path).
Expand All @@ -14,34 +18,39 @@
#' datasource("a,b,c\n1,2,3")
#' datasource(charToRaw("a,b,c\n1,2,3"))
#'
#' # Local path
#' # Strings
#' datasource(system.file("extdata/mtcars.csv", package = "readr"))
#' datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
#' datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
#' datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
#'
#' # Connection
#' datasource(rawConnection(charToRaw("abc\n123")))
datasource <- function(file, skip = 0) {
  # Dispatch on the kind of input; every branch delegates to a constructor.
  if (inherits(file, "source")) {
    # Already a source object: hand it back unchanged.
    file
  } else if (inherits(file, "connection")) {
    datasource_connection(file, skip)
  } else if (is.raw(file)) {
    datasource_raw(file, skip)
  } else if (is.character(file)) {
    if (grepl("\n", file)) {
      # A string containing a newline is literal data rather than a path.
      datasource_string(file, skip)
    } else {
      # standardise_path() yields a connection for urls and compressed
      # files, and a checked path for everything else.
      file <- standardise_path(file)
      if (inherits(file, "connection")) {
        datasource_connection(file, skip)
      } else {
        datasource_file(file, skip)
      }
    }
  } else {
    stop("`file` must be a string, raw vector or a connection.", call. = FALSE)
  }
}

# Constructors -----------------------------------------------------------------

new_datasource <- function(type, x, skip, ...) {
structure(list(x, skip = skip, ...),
class = c(paste0("source_", type), "source"))
Expand All @@ -52,14 +61,21 @@ datasource_string <- function(text, skip) {
}

# Create a file-backed source from a path on disk.
#
# `path` is validated and normalised by check_path(), which raises an error
# when the file does not exist. `skip` is stored on the source object as
# given.
datasource_file <- function(path, skip) {
  path <- check_path(path)
  new_datasource("file", path, skip = skip)
}

# Create a source from a connection.
#
# Despite its name, `path` is the connection itself. cache_con() returns the
# path of a temporary file, which is then wrapped as a file source.
datasource_connection <- function(path, skip) {
  datasource_file(cache_con(path), skip)
}

# Wrap literal data supplied as a raw vector in a "text" source.
datasource_raw <- function(text, skip) {
  new_datasource(type = "text", x = text, skip = skip)
}

# Helpers ----------------------------------------------------------------------

cache_con <- function(con) {
tmp <- tempfile()
tmpcon <- file(tmp, "w+b")
Expand All @@ -76,19 +92,50 @@ cache_con <- function(con) {
tmp
}

check_file <- function(path) {
if (!file.exists(path)) {
stop("'", path, "' does not exist",
if (!is_absolute_path(path))
paste0(" in current working directory ('", getwd(), "')"),
".",
call. = FALSE)
}
# Turn a user-supplied path into something that can be read directly.
#
# Returns:
#   * `path` unchanged when it is not a character vector;
#   * a curl connection when `path` is a url;
#   * an unopened decompressing connection when the file extension is
#     gz, bz2, xz or zip;
#   * otherwise the path as returned by check_path().
standardise_path <- function(path) {
  if (!is.character(path)) {
    return(path)
  }

  if (is_url(path)) {
    return(curl::curl(path))
  }

  path <- check_path(path)

  # Match the extension case-insensitively so that files such as
  # "data.CSV.GZ" are decompressed as well.
  ext <- tolower(tools::file_ext(path))
  switch(ext,
    gz = gzfile(path, ""),
    bz2 = bzfile(path, ""),
    xz = xzfile(path, ""),
    zip = zipfile(path, ""),
    path
  )
}

# Does `path` start with an http, https, ftp or ftps scheme?
#
# URI schemes are case-insensitive, so "HTTP://..." counts as a url too.
is_url <- function(path) {
  grepl("^(http|ftp)s?://", path, ignore.case = TRUE)
}

# Check that `path` exists and return it normalised with "/" separators.
#
# If the path does not exist, an error naming the path is raised; for
# relative paths the message also reports the current working directory.
check_path <- function(path) {
  if (file.exists(path)) {
    return(normalizePath(path, "/", mustWork = FALSE))
  }

  stop("'", path, "' does not exist",
    # Contributes nothing to the message when the path is absolute.
    if (!is_absolute_path(path))
      paste0(" in current working directory ('", getwd(), "')"),
    ".",
    call. = FALSE
  )
}

# TRUE for paths that begin with "/", a drive letter followed by ":",
# a backslash, or "~".
is_absolute_path <- function(path) {
  absolute_prefix <- "^(/|[A-Za-z]:|\\\\|~)"
  grepl(absolute_prefix, path)
}

# Open a connection to the first file listed in a zip archive.
#
# `path` is the archive and `open` is passed on to unz(). When the archive
# lists more than one file, a message reports which one is being read.
# An archive that lists no files is an error.
zipfile <- function(path, open = "r") {
  files <- utils::unzip(path, list = TRUE)

  # Guard against an empty listing; indexing the first name would otherwise
  # fail with an uninformative subscript error.
  if (nrow(files) == 0) {
    stop("'", path, "' contains no files.", call. = FALSE)
  }

  file <- files$Name[[1]]
  if (nrow(files) > 1) {
    message("Multiple files in zip: reading '", file, "'")
  }

  unz(path, file, open = open)
}
Binary file added inst/extdata/mtcars.csv.bz2
Binary file not shown.
Binary file added inst/extdata/mtcars.csv.zip
Binary file not shown.
10 changes: 7 additions & 3 deletions man/count_fields.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
count_fields(file, tokenizer, skip = 0)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down
15 changes: 11 additions & 4 deletions man/datasource.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
datasource(file, skip = 0)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand All @@ -24,8 +28,11 @@ Create a source object.
datasource("a,b,c\\n1,2,3")
datasource(charToRaw("a,b,c\\n1,2,3"))

# Local path
# Strings
datasource(system.file("extdata/mtcars.csv", package = "readr"))
datasource(system.file("extdata/mtcars.csv.bz2", package = "readr"))
datasource(system.file("extdata/mtcars.csv.zip", package = "readr"))
datasource("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")

# Connection
datasource(rawConnection(charToRaw("abc\\n123")))
Expand Down
15 changes: 12 additions & 3 deletions man/read_delim.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,13 @@ read_tsv(file, col_names = TRUE, col_types = NULL, na = "NA", skip = 0,
n_max = -1, progress = interactive())
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down Expand Up @@ -88,9 +92,14 @@ decimal separator.
# Input sources -------------------------------------------------------------
# Read from a path
read_csv(system.file("extdata/mtcars.csv", package = "readr"))
read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))
read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")
# Or directly from a string (must contain a newline)
read_csv("x,y\\n1,2\\n3,4")
# Column types --------------------------------------------------------------
# By default, readr guesses the column types, looking at the first 100 rows.
# You can override with a compact specification:
read_csv("x,y\\n1,2\\n3,4", col_types = "dc")
Expand Down
10 changes: 7 additions & 3 deletions man/read_file.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
read_file(file)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down
10 changes: 7 additions & 3 deletions man/read_fwf.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ fwf_widths(widths, col_names = NULL)
fwf_positions(start, end, col_names = NULL)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down
10 changes: 7 additions & 3 deletions man/read_lines.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
read_lines(file, n_max = -1L)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down
10 changes: 7 additions & 3 deletions man/read_table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ read_table(file, col_names = TRUE, col_types = NULL, na = "NA",
skip = 0, n_max = -1)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down
10 changes: 7 additions & 3 deletions man/tokenize.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
tokenize(file, tokenizer = tokenizer_csv(), n = NA_integer_)
}
\arguments{
\item{file}{Either a path to a file, a url, a connection, or literal data
(either a single string or a raw vector). Connections and urls are saved
to a temporary file before being read.
\item{file}{Either a path to a file, a connection, or literal data
(either a single string or a raw vector).

Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will
be automatically uncompressed. Files starting with \code{http://},
\code{https://}, \code{ftp://}, or \code{ftps://} will be automatically
downloaded.

Literal data is most useful for examples and tests. It must contain at
least one new line to be recognised as data (instead of a path).}
Expand Down

0 comments on commit 0d5e27d

Please sign in to comment.