From 8bf9a23aa17722d2cd92436e08aea0c706bcedfa Mon Sep 17 00:00:00 2001 From: chainsawriot Date: Mon, 4 Sep 2023 11:41:30 +0200 Subject: [PATCH] Fix #294 (#337) * Refactor And it passes all existing tests * Update doc on zip directory [no ci] * Add tests for #294 * Update NEWS [no ci] --- NEWS.md | 1 + R/import_list.R | 167 +++++++++++++++++------------- man/import_list.Rd | 4 +- tests/testthat/test_import_list.R | 17 +++ 4 files changed, 113 insertions(+), 76 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0d0b4a2..b49422c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,7 @@ - POTENTIALLY BREAKING: YAML are exported using yaml::write_yaml(). But it can't pass the UTF-8 check on older systems. Disclaimer added. #318 - More check for the `file` argument #301 + - `import_list` works with single Excel/HTML/Zip online #294 * Declutter - remove the obsolete data.table option #323 - write all documentation blocks in markdown #311 diff --git a/R/import_list.R b/R/import_list.R index f091d97..e42ed80 100644 --- a/R/import_list.R +++ b/R/import_list.R @@ -1,6 +1,6 @@ #' @title Import list of data frames -#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zip directory, or HTML file) -#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip directory, or HTML file), or a vector of file paths for multiple files to be imported. +#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file) +#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported. #' @param which If `file` is a single file path, this specifies which objects should be extracted (passed to [import()]'s `which` argument). Ignored otherwise. #' @param rbind A logical indicating whether to pass the import list of data frames through [data.table::rbindlist()]. #' @param rbind_label If `rbind = TRUE`, a character string specifying the name of a column to add to the data frame indicating its source file. @@ -38,79 +38,13 @@ function(file, if (missing(setclass)) { setclass <- NULL } - strip_exts <- function(file) { - vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1)) - } - if (length(file) > 1) { - names(file) <- strip_exts(file) - x <- lapply(file, function(thisfile) { - out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE) - if (inherits(out, "try-error")) { - warning(sprintf("Import failed for %s", thisfile)) - out <- NULL - } else if (isTRUE(rbind)) { - out[[rbind_label]] <- thisfile - } - structure(out, filename = thisfile) - }) - names(x) <- names(file) + ## special cases + if (length(file) == 1) { + x <- .read_file_as_list(file = file, which = which, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...) } else { - if (get_ext(file) == "rdata") { - e <- new.env() - load(file, envir = e) - x <- as.list(e) - } else { - if (get_ext(file) == "html") { - .check_pkg_availability("xml2") - tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table") - if (missing(which)) { - which <- seq_along(tables) - } - whichnames <- vapply(xml2::xml_attrs(tables[which]), - function(x) if ("class" %in% names(x)) x["class"] else "", - FUN.VALUE = character(1)) - names(which) <- whichnames - } else if (get_ext(file) %in% c("xls","xlsx")) { - .check_pkg_availability("readxl") - whichnames <- readxl::excel_sheets(path = file) - if (missing(which)) { - which <- seq_along(whichnames) - names(which) <- whichnames - } else if (is.character(which)) { - whichnames <- which - } else { - whichnames <- whichnames[which] - } - } else if (get_ext(file) %in% c("zip")) { - if (missing(which)) { - whichnames <- utils::unzip(file, list = TRUE)[, "Name"] - which <- seq_along(whichnames) - names(which) <- strip_exts(whichnames) - } else if (is.character(which)) { - whichnames <- utils::unzip(file, list = TRUE)[, "Name"] - whichnames <- whichnames[whichnames %in% which] - } else { - whichnames <- utils::unzip(file, list = TRUE)[, "Name"] - names(which) <- strip_exts(whichnames) - } - } else { - which <- 1 - whichnames <- NULL - } - x <- lapply(which, function(thiswhich) { - out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE) - if (inherits(out, "try-error")) { - warning(sprintf("Import failed for %s from %s", thiswhich, file)) - out <- NULL - } else if (isTRUE(rbind) && length(which) > 1) { - out[[rbind_label]] <- thiswhich - } - out - }) - names(x) <- whichnames - } + ## note the plural + x <- .read_multiple_files_as_list(files = file, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...) } - # optionally rbind if (isTRUE(rbind)) { if (length(x) == 1) { @@ -124,7 +58,7 @@ function(file, x <- x2 } } - # set class + ## set class a <- list(...) if (is.null(setclass)) { if ("data.table" %in% names(a) && isTRUE(a[["data.table"]])) { @@ -148,3 +82,88 @@ function(file, return(x) } + +.strip_exts <- function(file) { + vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1)) +} + +.read_multiple_files_as_list <- function(files, setclass, rbind, rbind_label,...) { + names(files) <- .strip_exts(files) + x <- lapply(files, function(thisfile) { + out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE) + if (inherits(out, "try-error")) { + warning(sprintf("Import failed for %s", thisfile)) + out <- NULL + } else if (isTRUE(rbind)) { + out[[rbind_label]] <- thisfile + } + structure(out, filename = thisfile) + }) + names(x) <- names(files) + return(x) +} + +.read_file_as_list <- function(file, which, setclass, rbind, rbind_label,...) { + if (grepl("^http.*://", file)) { + file <- remote_to_local(file) + } + if (get_ext(file) == "rdata") { + e <- new.env() + load(file, envir = e) + return(as.list(e)) + } + if (!get_ext(file) %in% c("html", "xlsx", "xls", "zip")) { + which <- 1 + whichnames <- NULL + } + ## getting list of `whichnames` + if (get_ext(file) == "html") { + .check_pkg_availability("xml2") + tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table") + if (missing(which)) { + which <- seq_along(tables) + } + whichnames <- vapply(xml2::xml_attrs(tables[which]), + function(x) if ("class" %in% names(x)) x["class"] else "", + FUN.VALUE = character(1)) + names(which) <- whichnames + } + if (get_ext(file) %in% c("xls","xlsx")) { + ##.check_pkg_availability("readxl") + whichnames <- readxl::excel_sheets(path = file) + if (missing(which)) { + which <- seq_along(whichnames) + names(which) <- whichnames + } else if (is.character(which)) { + whichnames <- which + } else { + whichnames <- whichnames[which] + } + } + if (get_ext(file) %in% c("zip")) { + if (missing(which)) { + whichnames <- utils::unzip(file, list = TRUE)[, "Name"] + which <- seq_along(whichnames) + names(which) <- .strip_exts(whichnames) + } else if (is.character(which)) { + whichnames <- utils::unzip(file, list = TRUE)[, "Name"] + whichnames <- whichnames[whichnames %in% which] + } else { + whichnames <- utils::unzip(file, list = TRUE)[, "Name"] + names(which) <- .strip_exts(whichnames) + } + } + ## reading all `whichnames` + x <- lapply(which, function(thiswhich) { + out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE) + if (inherits(out, "try-error")) { + warning(sprintf("Import failed for %s from %s", thiswhich, file)) + out <- NULL + } else if (isTRUE(rbind) && length(which) > 1) { + out[[rbind_label]] <- thiswhich + } + out + }) + names(x) <- whichnames + return(x) +} diff --git a/man/import_list.Rd b/man/import_list.Rd index 3310cf0..7158325 100644 --- a/man/import_list.Rd +++ b/man/import_list.Rd @@ -15,7 +15,7 @@ import_list( ) } \arguments{ -\item{file}{A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip directory, or HTML file), or a vector of file paths for multiple files to be imported.} +\item{file}{A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported.} \item{setclass}{An optional character vector specifying one or more classes to set on the import. By default, the return object is always a \dQuote{data.frame}. Allowed values include \dQuote{tbl_df}, \dQuote{tbl}, or \dQuote{tibble} (if using dplyr) or \dQuote{data.table} (if using data.table). Other values are ignored, such that a data.frame is returned.} @@ -33,7 +33,7 @@ import_list( If \code{rbind=FALSE} (the default), a list of a data frames. Otherwise, that list is passed to \code{\link[data.table:rbindlist]{data.table::rbindlist()}} with \code{fill = TRUE} and returns a data frame object of class set by the \code{setclass} argument; if this operation fails, the list is returned. } \description{ -Use \code{\link[=import]{import()}} to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zip directory, or HTML file) +Use \code{\link[=import]{import()}} to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file) } \examples{ ## For demo, a temp. file path is created with the file extension .xlsx diff --git a/tests/testthat/test_import_list.R b/tests/testthat/test_import_list.R index 8395b98..a520a64 100644 --- a/tests/testthat/test_import_list.R +++ b/tests/testthat/test_import_list.R @@ -89,6 +89,23 @@ test_that("File names are added as attributes by import_list()", { unlink(c("mtcars.csv", "mtcars.tsv")) }) +test_that("URL #294", { + skip_on_cran() + ## url <- "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.xls" That's 10MB! + url <- "https://github.com/tidyverse/readxl/raw/main/tests/testthat/sheets/sheet-xml-lookup.xlsx" + expect_error(x <- import_list(url), NA) + expect_true(inherits(x, "list")) + expect_true("Asia" %in% names(x)) + expect_true("Africa" %in% x[[1]]$continent) + expect_false("Africa" %in% x[[2]]$continent) + ## double URLs; it reads twice the first sheet by default + urls <- c(url, url) + expect_error(x2 <- import_list(urls), NA) + expect_true("sheet-xml-lookup" %in% names(x2)) + expect_true("Africa" %in% x2[[1]]$continent) + expect_true("Africa" %in% x2[[2]]$continent) +}) + unlink("data.rdata") unlink("mtcars.rds") unlink("mtcars.csv.zip")