download_zenodo(): add unit tests

inbo · Nov 20, 2023 · 8984790 · 8984790
1 parent 07ca0e6
commit 8984790
Show file tree

Hide file tree

Showing 3 changed files with 336 additions and 0 deletions.
diff --git a/20a9ee_download_zenodo.R b/20a9ee_download_zenodo.R
@@ -0,0 +1,239 @@
+#' Get data from a Zenodo archive
+#'
+#' This function will download an entire archive from Zenodo
+#' (\url{https://zenodo.org}).
+#' It only works for DOIs created by Zenodo (not when the DOI is, for
+#' example, derived from Zookeys).
+#'
+#' All files of the record are written to \code{path}.
+#' Afterwards the md5sum of each downloaded file is compared with the
+#' checksum listed in the Zenodo record; a warning is issued for every file
+#' where both differ.
+#'
+#' @author Hans Van Calster, \email{hans.vancalster@@inbo.be}
+#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
+#'
+#' @param path Path where the data must be downloaded.
+#' Must be an existing directory.
+#' Defaults to the working directory.
+#' @param doi a doi pointer to the Zenodo archive starting with
+#' '10.5281/zenodo.'.
+#' See examples.
+#' @param parallel Logical (\code{FALSE} by default).
+#' If \code{TRUE}, will run a number of parallel processes (at most 10),
+#' each downloading another file.
+#' This is useful when multiple large files are present in the Zenodo
+#' record, which otherwise would be downloaded sequentially.
+#' Of course, the operation is limited by bandwidth and traffic limitations.
+#' @param quiet Logical (\code{FALSE} by default).
+#' Do you want to suppress informative messages (not warnings)?
+#'
+#' @return
+#' \code{NULL}, invisibly.
+#' The function is called for its side effect of writing files to
+#' \code{path}.
+#'
+#' @importFrom stringr
+#' fixed
+#' str_remove
+#' str_split
+#' str_match
+#' @importFrom curl curl_fetch_memory curl_download
+#' @importFrom jsonlite fromJSON
+#' @importFrom tools md5sum
+#' @importFrom utils tail
+#' @importFrom assertthat
+#' assert_that
+#' is.string
+#' is.flag
+#' noNA
+#' @importFrom parallel
+#' makeCluster
+#' clusterMap
+#' stopCluster
+#'
+#' @export
+#' @family download_functions
+#'
+#' @examples
+#' \dontrun{
+#' # Example download of an archive containing a single zip
+#' download_zenodo(doi = "10.5281/zenodo.1283345")
+#' download_zenodo(doi = "10.5281/zenodo.1283345", quiet = TRUE)
+#' # Example download of an archive containing multiple files
+#' # using parallel download
+#' # (multiple files will be simultaneously downloaded)
+#' download_zenodo(doi = "10.5281/zenodo.1172801", parallel = TRUE)
+#' # Example download of an archive containing a single pdf file
+#' download_zenodo(doi = "10.5281/zenodo.168478")
+#' }
+download_zenodo <- function(doi,
+                            path = ".",
+                            parallel = FALSE,
+                            quiet = FALSE) {
+  assert_that(is.string(doi), is.string(path))
+  assert_that(is.flag(parallel), noNA(parallel), is.flag(quiet), noNA(quiet))
+
+  # check for existence of the folder
+  stopifnot(dir.exists(path))
+
+  # the record id is what remains of the DOI after the Zenodo prefix
+  record <- str_remove(doi, fixed("10.5281/zenodo."))
+
+  # Retrieve the record (file names, sizes, urls, checksums) by records call
+  base_url <- "https://zenodo.org/api/records/"
+  req <- curl_fetch_memory(paste0(base_url, record))
+  # curl_fetch_memory() leaves checking the HTTP status to the caller, so
+  # fail here instead of continuing with the body of an error response
+  if (req$status_code >= 400) {
+    stop(
+      "Request to ",
+      base_url,
+      record,
+      " failed with HTTP status ",
+      req$status_code,
+      call. = FALSE
+    )
+  }
+  content <- fromJSON(rawToChar(req$content))
+
+  # Calculate total file size
+  totalsize <- human_filesize(sum(content$files$size))
+
+  # extract individual file names and urls
+  file_urls <- content$files$links$self
+  filenames <- basename(content$files$key)
+  destfiles <- file.path(path, filenames)
+
+  # extract check-sum(s)
+  file_md5 <- content$files$checksum
+
+  # announce what will be downloaded
+  if (!quiet) {
+    nrfiles <- length(filenames)
+    # fall back on the version of the first relation when the record's
+    # metadata has no version field of its own
+    version <- content$metadata$version
+    if (is.null(version)) {
+      version <- content$metadata$relations$version[1, 1]
+    }
+    message(
+      "Will download ",
+      nrfiles,
+      " file",
+      if (nrfiles > 1) "s" else "",
+      " (total size: ",
+      totalsize,
+      ") from https://doi.org/",
+      doi,
+      " (",
+      content$metadata$title,
+      "; version: ",
+      version,
+      ")\n"
+    )
+  }
+
+  # download files
+  if (parallel) {
+    nr_nodes <- min(10, length(file_urls))
+
+    if (!quiet) {
+      message(
+        "Initializing parallel download on ",
+        nr_nodes,
+        " R session nodes...\n"
+      )
+    }
+
+    clus <- makeCluster(nr_nodes)
+
+    if (!quiet) {
+      message(
+        "Starting parallel downloads. ",
+        "This may take a while (and I can't show you the overall progress).\n",
+        "Be patient...\n"
+      )
+    }
+
+    # 'finally' stops the cluster also when one of the downloads fails
+    tryCatch(
+      clusterMap(
+        clus,
+        function(src, dest) {
+          curl_download(
+            url = src,
+            destfile = dest,
+            quiet = quiet
+          )
+        },
+        file_urls,
+        destfiles
+      ),
+      finally = stopCluster(clus)
+    )
+
+    if (!quiet) message("Ended parallel downloads.")
+  } else {
+    mapply(curl_download,
+      file_urls,
+      destfiles,
+      MoreArgs = list(quiet = quiet)
+    )
+  }
+
+  # check each of the files
+
+  if (!quiet) message("\nVerifying file integrity...\n")
+
+  for (i in seq_along(file_urls)) {
+    filename <- filenames[i]
+    destfile <- destfiles[i]
+    md5 <- unname(md5sum(destfile))
+    # keep the part after the first colon of the checksum string
+    zenodo_md5 <- str_split(file_md5[i], ":")[[1]][2]
+    if (identical(md5, zenodo_md5)) {
+      if (!quiet) {
+        message(
+          filename,
+          " was downloaded and its integrity verified (md5sum: ",
+          md5,
+          ")"
+        )
+      }
+    } else {
+      warning(
+        "Incorrect download! md5sum ",
+        md5,
+        " for file ",
+        filename,
+        " does not match the Zenodo archived md5sum ",
+        zenodo_md5
+      )
+    }
+  }
+
+  invisible(NULL)
+}
+
+
+
+#' Human-readable binary file size
+#'
+#' Takes a number of bytes and returns an optimally human-readable
+#' \href{https://en.wikipedia.org/wiki/Binary_prefix}{binary-prefixed}
+#' byte size (B, KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB).
+#' The numeric part is rounded to one decimal.
+#' The function is vectorised.
+#'
+#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
+#'
+#' @param x A non-negative whole number, i.e. the number of bytes (B).
+#' Can be a vector of file sizes.
+#'
+#' @return
+#' A character vector with the same length as \code{x}.
+#'
+#' @examples
+#' human_filesize(7845691)
+#' v <- c(12345, 456987745621258)
+#' human_filesize(v)
+#' human_filesize(0)
+#'
+#' @family Helpers
+#'
+#' @export
+#' @importFrom assertthat
+#' assert_that
+#' @importFrom dplyr
+#' %>%
+human_filesize <- function(x) {
+  assert_that(is.numeric(x))
+  assert_that(all(x %% 1 == 0 & x >= 0))
+  unit_labels <- c(
+    "B",
+    "KiB",
+    "MiB",
+    "GiB",
+    "TiB",
+    "PiB",
+    "EiB",
+    "ZiB",
+    "YiB"
+  )
+  # log(0) is -Inf: the lower bound of 0 makes zero bytes come out as "0 B"
+  # instead of "NaN NA"; the upper bound of 8 corresponds to the largest
+  # available unit (YiB)
+  magnitude <-
+    log(x, base = 1024) %>%
+    floor() %>%
+    pmax(0) %>%
+    pmin(8)
+  size <- round(x / 1024^magnitude, 1)
+  # magnitude 0 corresponds to the first label
+  paste(size, unit_labels[magnitude + 1])
+}
diff --git a/tests/testthat/_snaps/zenodo.md b/tests/testthat/_snaps/zenodo.md
@@ -0,0 +1,54 @@
+# download_zenodo() works for a single-file record
+
+    Code
+      download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
+    Message
+      Will download 1 file (total size: 32.5 KiB) from https://doi.org/10.5281/zenodo.3784149 (Distribution of the Natura 2000 habitat type 7220 (Cratoneurion) in Flanders and Brussels Capital Region, Belgium (version 2020); version: habitatsprings_2020v2)
+      
+      
+      Verifying file integrity...
+      
+      habitatsprings.geojson was downloaded and its integrity verified (md5sum: 64c3db07d17274da047b3962aab28e80)
+
+# download_zenodo() works for a GitHub code record
+
+    Code
+      download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
+    Message
+      Will download 1 file (total size: 236.7 KiB) from https://doi.org/10.5281/zenodo.7335805 (R package n2khab: providing preprocessed reference data for Flemish Natura 2000 habitat analyses; version: 0.8.0)
+      
+      
+      Verifying file integrity...
+      
+      n2khab-v0.8.0.zip was downloaded and its integrity verified (md5sum: 25fb33360d257c085bce567da8f6a2cb)
+
+# download_zenodo() works for a multi-file record
+
+    Code
+      download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir)
+    Message
+      Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
+      
+      
+      Verifying file integrity...
+      
+      habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
+      habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
+      habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
+      habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)
+
+# download_zenodo() can work sequentially for a multi-file record
+
+    Code
+      download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir, parallel = FALSE)
+    Message
+      Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
+      
+      
+      Verifying file integrity...
+      
+      habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
+      habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
+      habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
+      habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)
+
diff --git a/tests/testthat/test-zenodo.R b/tests/testthat/test-zenodo.R
@@ -0,0 +1,43 @@
+test_that("download_zenodo() works for a single-file record", {
+  # the test needs live access to zenodo.org: skip it when offline
+  skip_if_offline("zenodo.org")
+  zenodo_dir <- withr::local_tempdir()
+  expect_snapshot(
+    download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
+  )
+})
+
+test_that("download_zenodo() works for a GitHub code record", {
+  # the test needs live access to zenodo.org: skip it when offline
+  skip_if_offline("zenodo.org")
+  zenodo_dir <- withr::local_tempdir()
+  expect_snapshot(
+    download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
+  )
+})
+
+test_that("download_zenodo() works for a multi-file record", {
+  # the test needs live access to zenodo.org: skip it when offline
+  skip_if_offline("zenodo.org")
+  # NOTE(review): this call relies on the default parallel = FALSE, so it
+  # takes the sequential code path; a parallel = TRUE case may have been
+  # intended here - confirm
+  zenodo_dir <- withr::local_tempdir()
+  expect_snapshot(
+    download_zenodo(
+      doi = "10.5281/zenodo.4420858",
+      path = zenodo_dir
+    )
+  )
+})
+
+test_that("download_zenodo() can work sequentially for a multi-file record", {
+  # the test needs live access to zenodo.org: skip it when offline
+  skip_if_offline("zenodo.org")
+  zenodo_dir <- withr::local_tempdir()
+  expect_snapshot(
+    download_zenodo(
+      doi = "10.5281/zenodo.4420858",
+      path = zenodo_dir,
+      parallel = FALSE
+    )
+  )
+})
+