From 8bf9a23aa17722d2cd92436e08aea0c706bcedfa Mon Sep 17 00:00:00 2001
From: chainsawriot <chainsawtiney@gmail.com>
Date: Mon, 4 Sep 2023 11:41:30 +0200
Subject: [PATCH] Fix #294 (#337)

* Refactor

And it passes all existing tests

* Update doc on zip directory [no ci]

* Add tests for #294

* Update NEWS [no ci]
---
 NEWS.md                           |   1 +
 R/import_list.R                   | 167 +++++++++++++++++-------------
 man/import_list.Rd                |   4 +-
 tests/testthat/test_import_list.R |  17 +++
 4 files changed, 113 insertions(+), 76 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 0d0b4a2..b49422c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,7 @@
    - POTENTIALLY BREAKING: YAML are exported using yaml::write_yaml(). But it can't pass the UTF-8 check on older systems.
      Disclaimer added. #318
    - More check for the `file` argument #301
+   - `import_list` works with single Excel/HTML/Zip online #294
 * Declutter
    - remove the obsolete data.table option #323
    - write all documentation blocks in markdown #311
diff --git a/R/import_list.R b/R/import_list.R
index f091d97..e42ed80 100644
--- a/R/import_list.R
+++ b/R/import_list.R
@@ -1,6 +1,6 @@
 #' @title Import list of data frames
-#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zip directory, or HTML file)
-#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip directory, or HTML file), or a vector of file paths for multiple files to be imported.
+#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file)
+#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported.
 #' @param which If `file` is a single file path, this specifies which objects should be extracted (passed to [import()]'s `which` argument). Ignored otherwise.
 #' @param rbind A logical indicating whether to pass the import list of data frames through [data.table::rbindlist()].
 #' @param rbind_label If `rbind = TRUE`, a character string specifying the name of a column to add to the data frame indicating its source file.
@@ -38,79 +38,13 @@ function(file,
     if (missing(setclass)) {
         setclass <- NULL
     }
-    strip_exts <- function(file) {
-      vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1))
-    }
-    if (length(file) > 1) {
-        names(file) <- strip_exts(file)
-        x <- lapply(file, function(thisfile) {
-            out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE)
-            if (inherits(out, "try-error")) {
-                warning(sprintf("Import failed for %s", thisfile))
-                out <- NULL
-            } else if (isTRUE(rbind)) {
-                out[[rbind_label]] <- thisfile
-            }
-            structure(out, filename = thisfile)
-        })
-        names(x) <- names(file)
+    ## special cases
+    if (length(file) == 1) {
+        x <- .read_file_as_list(file = file, which = which, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
     } else {
-        if (get_ext(file) == "rdata") {
-            e <- new.env()
-            load(file, envir = e)
-            x <- as.list(e)
-        } else {
-            if (get_ext(file) == "html") {
-                .check_pkg_availability("xml2")
-                tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table")
-                if (missing(which)) {
-                    which <- seq_along(tables)
-                }
-                whichnames <- vapply(xml2::xml_attrs(tables[which]),
-                                     function(x) if ("class" %in% names(x)) x["class"] else "",
-                                     FUN.VALUE = character(1))
-                names(which) <- whichnames
-            } else if (get_ext(file) %in% c("xls","xlsx")) {
-                .check_pkg_availability("readxl")
-                whichnames <- readxl::excel_sheets(path = file)
-                if (missing(which)) {
-                    which <- seq_along(whichnames)
-                    names(which) <- whichnames
-                } else if (is.character(which)) {
-                    whichnames <- which
-                } else {
-                    whichnames <- whichnames[which]
-                }
-            } else if (get_ext(file) %in% c("zip")) {
-                if (missing(which)) {
-                    whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
-                    which <- seq_along(whichnames)
-                    names(which) <- strip_exts(whichnames)
-                } else if (is.character(which)) {
-                    whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
-                    whichnames <- whichnames[whichnames %in% which]
-                } else {
-                    whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
-                    names(which) <- strip_exts(whichnames)
-                }
-            } else {
-                which <- 1
-                whichnames <- NULL
-            }
-            x <- lapply(which, function(thiswhich) {
-                out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE)
-                if (inherits(out, "try-error")) {
-                    warning(sprintf("Import failed for %s from %s", thiswhich, file))
-                    out <- NULL
-                } else if (isTRUE(rbind) && length(which) > 1) {
-                    out[[rbind_label]] <- thiswhich
-                }
-                out
-            })
-            names(x) <- whichnames
-        }
+        ## note the plural
+        x <- .read_multiple_files_as_list(files = file, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
     }
-
     # optionally rbind
     if (isTRUE(rbind)) {
         if (length(x) == 1) {
@@ -124,7 +58,7 @@ function(file,
                 x <- x2
             }
         }
-        # set class
+        ## set class
         a <- list(...)
         if (is.null(setclass)) {
             if ("data.table" %in% names(a) && isTRUE(a[["data.table"]])) {
@@ -148,3 +82,88 @@ function(file,
 
     return(x)
 }
+
+.strip_exts <- function(file) {
+    vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1))
+}
+
+.read_multiple_files_as_list <- function(files, setclass, rbind, rbind_label,...) {
+    names(files) <- .strip_exts(files)
+    x <- lapply(files, function(thisfile) {
+        out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE)
+        if (inherits(out, "try-error")) {
+            warning(sprintf("Import failed for %s", thisfile))
+            out <- NULL
+        } else if (isTRUE(rbind)) {
+            out[[rbind_label]] <- thisfile
+        }
+        structure(out, filename = thisfile)
+    })
+    names(x) <- names(files)
+    return(x)
+}
+
+.read_file_as_list <- function(file, which, setclass, rbind, rbind_label,...) {
+    if (grepl("^http.*://", file)) {
+        file <- remote_to_local(file)
+    }
+    if (get_ext(file) == "rdata") {
+        e <- new.env()
+        load(file, envir = e)
+        return(as.list(e))
+    }
+    if (!get_ext(file) %in% c("html", "xlsx", "xls", "zip")) {
+        which <- 1
+        whichnames <- NULL
+    }
+    ## getting list of `whichnames`
+    if (get_ext(file) == "html") {
+        .check_pkg_availability("xml2")
+        tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table")
+        if (missing(which)) {
+            which <- seq_along(tables)
+        }
+        whichnames <- vapply(xml2::xml_attrs(tables[which]),
+                             function(x) if ("class" %in% names(x)) x["class"] else "",
+                             FUN.VALUE = character(1))
+        names(which) <- whichnames
+    }
+    if (get_ext(file) %in% c("xls","xlsx")) {
+        ##.check_pkg_availability("readxl")
+        whichnames <- readxl::excel_sheets(path = file)
+        if (missing(which)) {
+            which <- seq_along(whichnames)
+            names(which) <- whichnames
+        } else if (is.character(which)) {
+            whichnames <- which
+        } else {
+            whichnames <- whichnames[which]
+        }
+    }
+    if (get_ext(file) %in% c("zip")) {
+        if (missing(which)) {
+            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
+            which <- seq_along(whichnames)
+            names(which) <- .strip_exts(whichnames)
+        } else if (is.character(which)) {
+            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
+            whichnames <- whichnames[whichnames %in% which]
+        } else {
+            whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
+            names(which) <- .strip_exts(whichnames)
+        }
+    }
+    ## reading all `whichnames`
+    x <- lapply(which, function(thiswhich) {
+        out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE)
+        if (inherits(out, "try-error")) {
+            warning(sprintf("Import failed for %s from %s", thiswhich, file))
+            out <- NULL
+        } else if (isTRUE(rbind) && length(which) > 1) {
+            out[[rbind_label]] <- thiswhich
+        }
+        out
+    })
+    names(x) <- whichnames
+    return(x)
+}
diff --git a/man/import_list.Rd b/man/import_list.Rd
index 3310cf0..7158325 100644
--- a/man/import_list.Rd
+++ b/man/import_list.Rd
@@ -15,7 +15,7 @@ import_list(
 )
 }
 \arguments{
-\item{file}{A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip directory, or HTML file), or a vector of file paths for multiple files to be imported.}
+\item{file}{A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported.}
 
 \item{setclass}{An optional character vector specifying one or more classes to set on the import. By default, the return object is always a \dQuote{data.frame}. Allowed values include \dQuote{tbl_df}, \dQuote{tbl}, or \dQuote{tibble} (if using dplyr) or \dQuote{data.table} (if using data.table). Other values are ignored, such that a data.frame is returned.}
 
@@ -33,7 +33,7 @@ import_list(
 If \code{rbind=FALSE} (the default), a list of a data frames. Otherwise, that list is passed to \code{\link[data.table:rbindlist]{data.table::rbindlist()}} with \code{fill = TRUE} and returns a data frame object of class set by the \code{setclass} argument; if this operation fails, the list is returned.
 }
 \description{
-Use \code{\link[=import]{import()}} to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zip directory, or HTML file)
+Use \code{\link[=import]{import()}} to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file)
 }
 \examples{
 ## For demo, a temp. file path is created with the file extension .xlsx
diff --git a/tests/testthat/test_import_list.R b/tests/testthat/test_import_list.R
index 8395b98..a520a64 100644
--- a/tests/testthat/test_import_list.R
+++ b/tests/testthat/test_import_list.R
@@ -89,6 +89,23 @@ test_that("File names are added as attributes by import_list()", {
     unlink(c("mtcars.csv", "mtcars.tsv"))
 })
 
+test_that("URL #294", {
+    skip_on_cran()
+    ## url <- "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.xls" That's 10MB!
+    url <- "https://github.com/tidyverse/readxl/raw/main/tests/testthat/sheets/sheet-xml-lookup.xlsx"
+    expect_error(x <- import_list(url), NA)
+    expect_true(inherits(x, "list"))
+    expect_true("Asia" %in% names(x))
+    expect_true("Africa" %in% x[[1]]$continent)
+    expect_false("Africa" %in% x[[2]]$continent)
+    ## double URLs; it reads twice the first sheet by default
+    urls <- c(url, url)
+    expect_error(x2 <- import_list(urls), NA)
+    expect_true("sheet-xml-lookup" %in% names(x2))
+    expect_true("Africa" %in% x2[[1]]$continent)
+    expect_true("Africa" %in% x2[[2]]$continent)
+})
+
 unlink("data.rdata")
 unlink("mtcars.rds")
 unlink("mtcars.csv.zip")