Skip to content

Commit

Permalink
download_zenodo(): add unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
florisvdh committed Nov 20, 2023
1 parent 07ca0e6 commit 8984790
Show file tree
Hide file tree
Showing 3 changed files with 336 additions and 0 deletions.
239 changes: 239 additions & 0 deletions 20a9ee_download_zenodo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#' Get data from a Zenodo archive
#'
#' This function will download an entire archive from Zenodo
#' (\url{https://zenodo.org}).
#' It only works for Zenodo created DOI (not when the DOI is for
#' example derived from Zookeys.)
#' After downloading, the md5sum of each file is compared with the checksum
#' reported by Zenodo; a warning is raised for each file that does not match.
#'
#' @author Hans Van Calster, \email{hans.vancalster@@inbo.be}
#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
#'
#' @param path Path where the data must be downloaded.
#' Defaults to the working directory.
#' The directory must already exist.
#' @param doi a doi pointer to the Zenodo archive starting with
#' '10.5281/zenodo.'.
#' See examples.
#' @param parallel Logical (\code{FALSE} by default).
#' If \code{TRUE}, will run a number of parallel processes, each downloading
#' another file.
#' This is useful when multiple large files are present in the Zenodo
#' record, which otherwise would be downloaded sequentially.
#' Of course, the operation is limited by bandwidth and traffic limitations.
#' @param quiet Logical (\code{FALSE} by default).
#' Do you want to suppress informative messages (not warnings)?
#'
#' @return
#' \code{NULL}, invisibly.
#' The function is called for its side effect of writing files to
#' \code{path}.
#'
#' @importFrom stringr
#' fixed
#' str_remove
#' str_split
#' str_match
#' @importFrom curl curl_fetch_memory curl_download
#' @importFrom jsonlite fromJSON
#' @importFrom tools md5sum
#' @importFrom utils tail
#' @importFrom assertthat
#' assert_that
#' is.string
#' is.flag
#' noNA
#' @importFrom parallel
#' makeCluster
#' clusterMap
#' stopCluster
#'
#' @export
#' @family download_functions
#'
#' @examples
#' \dontrun{
#' # Example download of an archive containing a single zip
#' download_zenodo(doi = "10.5281/zenodo.1283345")
#' download_zenodo(doi = "10.5281/zenodo.1283345", quiet = TRUE)
#' # Example download of an archive containing multiple files
#' # using parallel download
#' # (multiple files will be simultaneously downloaded)
#' download_zenodo(doi = "10.5281/zenodo.1172801", parallel = TRUE)
#' # Example download of an archive containing a single pdf file
#' download_zenodo(doi = "10.5281/zenodo.168478")
#' }
download_zenodo <- function(doi,
                            path = ".",
                            parallel = FALSE,
                            quiet = FALSE) {
  assert_that(is.string(doi), is.string(path))
  assert_that(is.flag(parallel), noNA(parallel), is.flag(quiet), noNA(quiet))

  # check for existence of the folder
  stopifnot(dir.exists(path))

  record <- str_remove(doi, fixed("10.5281/zenodo."))

  # Retrieve file name by records call
  base_url <- "https://zenodo.org/api/records/"
  req <- curl_fetch_memory(paste0(base_url, record))
  # Fail early with a clear message instead of trying to use an error page
  # as if it were record metadata.
  if (req$status_code >= 400) {
    stop(
      "Could not retrieve the Zenodo record for DOI ",
      doi,
      " (HTTP status ",
      req$status_code,
      ").",
      call. = FALSE
    )
  }
  content <- fromJSON(rawToChar(req$content))

  # extract individual file names and urls
  file_urls <- content$files$links$self
  nrfiles <- length(file_urls)
  if (nrfiles == 0) {
    stop(
      "No downloadable files were found in the Zenodo record for DOI ",
      doi,
      ".",
      call. = FALSE
    )
  }
  filenames <- basename(content$files$key)
  destfiles <- file.path(path, filenames)

  # Calculate total file size
  totalsize <- human_filesize(sum(content$files$size))

  # extract check-sum(s)
  file_md5 <- content$files$checksum

  # download files
  if (!quiet) {
    # Prefer the explicit version label; fall back to the first cell of the
    # version relations, and to a placeholder when the record has neither
    # (otherwise building the message would fail).
    version <- content$metadata$version
    if (is.null(version)) {
      version <- content$metadata$relations$version[1, 1]
    }
    if (is.null(version)) {
      version <- "unknown"
    }
    message(
      "Will download ",
      nrfiles,
      " file",
      if (nrfiles > 1) "s" else "",
      " (total size: ",
      totalsize,
      ") from https://doi.org/",
      doi,
      " (",
      content$metadata$title,
      "; version: ",
      version,
      ")\n"
    )
  }

  if (parallel) {
    nr_nodes <- min(10, nrfiles)

    if (!quiet) {
      message(
        "Initializing parallel download on ",
        nr_nodes,
        " R session nodes...\n"
      )
    }

    clus <- makeCluster(nr_nodes)

    if (!quiet) {
      message(
        "Starting parallel downloads. ",
        "This may take a while (and I can't show you the overall progress).\n",
        "Be patient...\n"
      )
    }

    # 'finally' guarantees that the worker sessions are shut down, also when
    # one of the downloads throws an error.
    tryCatch(
      clusterMap(
        clus,
        function(src, dest) {
          curl_download(
            url = src,
            destfile = dest,
            quiet = quiet
          )
        },
        file_urls,
        destfiles
      ),
      finally = stopCluster(clus)
    )

    if (!quiet) message("Ended parallel downloads.")
  } else {
    mapply(curl_download,
      file_urls,
      destfiles,
      MoreArgs = list(quiet = quiet)
    )
  }

  # check each of the files

  if (!quiet) message("\nVerifying file integrity...\n")

  for (i in seq_along(file_urls)) {
    filename <- filenames[i]
    destfile <- destfiles[i]
    md5 <- unname(md5sum(destfile))
    # the checksum is split on ":" and the second part is compared
    zenodo_md5 <- str_split(file_md5[i], ":")[[1]][2]
    if (identical(md5, zenodo_md5)) {
      if (!quiet) {
        message(
          filename,
          " was downloaded and its integrity verified (md5sum: ",
          md5,
          ")"
        )
      }
    } else {
      warning(
        "Incorrect download! md5sum ",
        md5,
        " for file ",
        filename,
        " does not match the Zenodo archived md5sum ",
        zenodo_md5
      )
    }
  }

  invisible(NULL)
}



#' Human-readable binary file size
#'
#' Takes an integer (referring to number of bytes) and returns an optimally
#' human-readable
#' \href{https://en.wikipedia.org/wiki/Binary_prefix}{binary-prefixed}
#' byte size (B, KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB).
#' The function is vectorised.
#'
#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
#'
#' @param x A non-negative integer, i.e. the number of bytes (B).
#' Can be a vector of file sizes.
#'
#' @return
#' A character vector with the same length as \code{x}.
#' Each element is the size rounded to one decimal, followed by a space and
#' the unit.
#'
#' @examples
#' human_filesize(7845691)
#' v <- c(12345, 456987745621258)
#' human_filesize(v)
#' human_filesize(0)
#'
#' @family Helpers
#'
#' @export
#' @importFrom assertthat
#' assert_that
#' @importFrom dplyr
#' %>%
human_filesize <- function(x) {
  assert_that(is.numeric(x))
  assert_that(all(x %% 1 == 0 & x >= 0))
  # Power of 1024 that determines the unit.
  magnitude <- floor(log(x, base = 1024))
  # Cap at 8 (YiB, the largest unit) and floor at 0: log(0) is -Inf, which
  # would otherwise yield an NA unit and a NaN size for a size of 0 bytes.
  magnitude <- pmax(pmin(magnitude, 8), 0)
  unit <- factor(magnitude,
    levels = 0:8,
    labels = c(
      "B",
      "KiB",
      "MiB",
      "GiB",
      "TiB",
      "PiB",
      "EiB",
      "ZiB",
      "YiB"
    )
  )
  size <- round(x / 1024^magnitude, 1)
  paste(size, unit)
}
54 changes: 54 additions & 0 deletions tests/testthat/_snaps/zenodo.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# download_zenodo() works for a single-file record

Code
download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
Message
Will download 1 file (total size: 32.5 KiB) from https://doi.org/10.5281/zenodo.3784149 (Distribution of the Natura 2000 habitat type 7220 (Cratoneurion) in Flanders and Brussels Capital Region, Belgium (version 2020); version: habitatsprings_2020v2)
Verifying file integrity...
habitatsprings.geojson was downloaded and its integrity verified (md5sum: 64c3db07d17274da047b3962aab28e80)

# download_zenodo() works for a GitHub code record

Code
download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
Message
Will download 1 file (total size: 236.7 KiB) from https://doi.org/10.5281/zenodo.7335805 (R package n2khab: providing preprocessed reference data for Flemish Natura 2000 habitat analyses; version: 0.8.0)
Verifying file integrity...
n2khab-v0.8.0.zip was downloaded and its integrity verified (md5sum: 25fb33360d257c085bce567da8f6a2cb)

# download_zenodo() works for a multi-file record

Code
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir)
Message
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
Verifying file integrity...
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)

# download_zenodo() can work sequentially for a multi-file record

Code
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir, parallel = FALSE)
Message
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7)
Verifying file integrity...
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917)
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff)
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c)
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99)

43 changes: 43 additions & 0 deletions tests/testthat/test-zenodo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
test_that("download_zenodo() works for a single-file record", {
  # The test downloads from zenodo.org, so skip it where that is not possible
  # instead of reporting a failure.
  skip_on_cran()
  skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory; withr removes it when the test ends.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
  )
})

test_that("download_zenodo() works for a GitHub code record", {
  # The test downloads from zenodo.org, so skip it where that is not possible
  # instead of reporting a failure.
  skip_on_cran()
  skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory; withr removes it when the test ends.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
  )
})

test_that("download_zenodo() works for a multi-file record", {
  # The test downloads from zenodo.org, so skip it where that is not possible
  # instead of reporting a failure.
  skip_on_cran()
  skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory; withr removes it when the test ends.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  # NOTE(review): 'parallel' is left at its default (FALSE), so this runs the
  # same sequential download as the explicit 'parallel = FALSE' test --
  # confirm whether the parallel code path was meant to be covered here.
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir
    )
  )
})

test_that("download_zenodo() can work sequentially for a multi-file record", {
  # The test downloads from zenodo.org, so skip it where that is not possible
  # instead of reporting a failure.
  skip_on_cran()
  skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory; withr removes it when the test ends.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir,
      parallel = FALSE
    )
  )
})

Check warning on line 43 in tests/testthat/test-zenodo.R

View workflow job for this annotation

GitHub Actions / check package

file=tests/testthat/test-zenodo.R,line=43,col=1,[trailing_blank_lines_linter] Trailing blank lines are superfluous.

0 comments on commit 8984790

Please sign in to comment.