-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
340 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
#' Get data from a Zenodo archive
#'
#' This function will download an entire archive from Zenodo
#' (\url{https://zenodo.org}).
#' It only works for Zenodo created DOI (not when the DOI is for
#' example derived from Zookeys.)
#'
#' After downloading, the md5 checksum of every file is compared with the
#' checksum listed in the Zenodo record; a warning is raised for each file
#' that does not match.
#'
#' @author Hans Van Calster, \email{hans.vancalster@@inbo.be}
#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
#'
#' @param path Path where the data must be downloaded.
#' Defaults to the working directory.
#' The directory must already exist.
#' @param doi a doi pointer to the Zenodo archive starting with
#' '10.5281/zenodo.'.
#' See examples.
#' @param parallel Logical (\code{FALSE} by default).
#' If \code{TRUE}, will run a number of parallel processes, each downloading
#' another file.
#' This is useful when multiple large files are present in the Zenodo
#' record, which otherwise would be downloaded sequentially.
#' Of course, the operation is limited by bandwidth and traffic limitations.
#' @param quiet Logical (\code{FALSE} by default).
#' Do you want to suppress informative messages (not warnings)?
#'
#' @return
#' \code{NULL}, invisibly.
#' The function is called for its side effect of writing the files of the
#' record into \code{path}.
#'
#' @importFrom stringr
#' fixed
#' str_remove
#' str_split
#' str_match
#' @importFrom curl curl_fetch_memory curl_download
#' @importFrom jsonlite fromJSON
#' @importFrom tools md5sum
#' @importFrom utils tail
#' @importFrom assertthat
#' assert_that
#' is.string
#' is.flag
#' noNA
#' @importFrom parallel
#' makeCluster
#' clusterMap
#' stopCluster
#'
#' @export
#' @family download_functions
#'
#' @examples
#' \dontrun{
#' # Example download of an archive containing a single zip
#' download_zenodo(doi = "10.5281/zenodo.1283345")
#' download_zenodo(doi = "10.5281/zenodo.1283345", quiet = TRUE)
#' # Example download of an archive containing multiple files
#' # using parallel download
#' # (multiple files will be simultaneously downloaded)
#' download_zenodo(doi = "10.5281/zenodo.1172801", parallel = TRUE)
#' # Example download of an archive containing a single pdf file
#' download_zenodo(doi = "10.5281/zenodo.168478")
#' }
download_zenodo <- function(doi,
                            path = ".",
                            parallel = FALSE,
                            quiet = FALSE) {
  assert_that(is.string(doi), is.string(path))
  assert_that(is.flag(parallel), noNA(parallel), is.flag(quiet), noNA(quiet))

  # check for existence of the folder
  stopifnot(dir.exists(path))

  # the record id is what remains after dropping the Zenodo DOI prefix
  record <- str_remove(doi, fixed("10.5281/zenodo."))

  # Retrieve file name by records call
  base_url <- "https://zenodo.org/api/records/"
  req <- curl_fetch_memory(paste0(base_url, record))
  # Fail early with a clear message instead of letting an error response
  # (e.g. unknown record) surface later as an obscure failure.
  if (req$status_code >= 400) {
    stop(
      "Zenodo API request for record ", record,
      " failed with HTTP status ", req$status_code, ".",
      call. = FALSE
    )
  }
  content <- fromJSON(rawToChar(req$content))

  # Calculate total file size
  totalsize <- human_filesize(sum(content$files$size))

  # extract individual file names and urls
  file_urls <- content$files$links$self
  filenames <- basename(content$files$key)
  destfiles <- file.path(path, filenames)

  # extract check-sum(s)
  file_md5 <- content$files$checksum

  # download files
  if (!quiet) {
    nrfiles <- length(filenames)
    plural <- if (nrfiles > 1) "s" else ""
    # the version is taken from the metadata when present, otherwise from the
    # first entry of the version relations
    version <- content$metadata$version
    if (is.null(version)) {
      version <- content$metadata$relations$version[1, 1]
    }
    message(
      "Will download ",
      nrfiles,
      " file",
      plural,
      " (total size: ",
      totalsize,
      ") from https://doi.org/",
      doi,
      " (",
      content$metadata$title,
      "; version: ",
      version,
      ")\n"
    )
  }

  if (parallel) {
    # one worker per file, capped at 10 workers
    nr_nodes <- min(10, length(file_urls))

    if (!quiet) {
      message(
        "Initializing parallel download on ",
        nr_nodes,
        " R session nodes...\n"
      )
    }

    clus <- makeCluster(nr_nodes)
    # make sure the worker sessions are shut down, also when a download fails
    on.exit(stopCluster(clus), add = TRUE)

    if (!quiet) {
      message(
        "Starting parallel downloads. ",
        "This may take a while (and I can't show you the overall progress).\n",
        "Be patient...\n"
      )
    }

    clusterMap(
      clus,
      function(src, dest) {
        curl_download(
          url = src,
          destfile = dest,
          quiet = quiet
        )
      },
      file_urls,
      destfiles
    )

    if (!quiet) message("Ended parallel downloads.")
  } else {
    mapply(curl_download,
      file_urls,
      destfiles,
      MoreArgs = list(quiet = quiet)
    )
  }

  # check each of the files

  if (!quiet) message("\nVerifying file integrity...\n")

  for (i in seq_along(file_urls)) {
    filename <- filenames[i]
    destfile <- destfiles[i]
    md5 <- unname(md5sum(destfile))
    # the checksum is split on ":" and the second part is compared
    zenodo_md5 <- str_split(file_md5[i], ":")[[1]][2]
    if (identical(md5, zenodo_md5)) {
      if (!quiet) {
        message(
          filename,
          " was downloaded and its integrity verified (md5sum: ",
          md5,
          ")"
        )
      }
    } else {
      warning(
        "Incorrect download! md5sum ",
        md5,
        " for file ",
        filename,
        " does not match the Zenodo archived md5sum ",
        zenodo_md5
      )
    }
  }

  invisible(NULL)
}
|
||
|
||
|
||
#' Human-readable binary file size
#'
#' Takes a number of bytes and returns an optimally human-readable
#' \href{https://en.wikipedia.org/wiki/Binary_prefix}{binary-prefixed}
#' byte size (B, KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB).
#' The size is rounded to one decimal.
#' The function is vectorised.
#'
#' @author Floris Vanderhaeghe, \email{floris.vanderhaeghe@@inbo.be}
#'
#' @param x A non-negative whole number, i.e. the number of bytes (B).
#' Can be a vector of file sizes.
#' Zero is returned as \code{"0 B"}.
#'
#' @return
#' A character vector with the same length as \code{x}.
#'
#' @examples
#' human_filesize(7845691)
#' v <- c(12345, 456987745621258)
#' human_filesize(v)
#' human_filesize(0)
#'
#' @family Helpers
#'
#' @export
#' @importFrom assertthat
#' assert_that
#' @importFrom dplyr
#' %>%
human_filesize <- function(x) {
  assert_that(is.numeric(x))
  assert_that(all(x %% 1 == 0 & x >= 0))
  unit_labels <- c(
    "B",
    "KiB",
    "MiB",
    "GiB",
    "TiB",
    "PiB",
    "EiB",
    "ZiB",
    "YiB"
  )
  # Power of 1024 to express each size in. log(0) is -Inf, which would give
  # an undefined unit and a NaN size, so the exponent is clamped at 0 (bytes);
  # it is also capped at the largest available unit.
  magnitude <- floor(log(x, base = 1024))
  magnitude <- pmin(pmax(magnitude, 0), length(unit_labels) - 1)
  size <- round(x / 1024^magnitude, 1)
  paste(size, unit_labels[magnitude + 1])
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# download_zenodo() works for a single-file record | ||
|
||
Code | ||
download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir) | ||
Message | ||
Will download 1 file (total size: 32.5 KiB) from https://doi.org/10.5281/zenodo.3784149 (Distribution of the Natura 2000 habitat type 7220 (Cratoneurion) in Flanders and Brussels Capital Region, Belgium (version 2020); version: habitatsprings_2020v2) | ||
Verifying file integrity... | ||
habitatsprings.geojson was downloaded and its integrity verified (md5sum: 64c3db07d17274da047b3962aab28e80) | ||
|
||
# download_zenodo() works for a GitHub code record | ||
|
||
Code | ||
download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir) | ||
Message | ||
Will download 1 file (total size: 236.7 KiB) from https://doi.org/10.5281/zenodo.7335805 (R package n2khab: providing preprocessed reference data for Flemish Natura 2000 habitat analyses; version: 0.8.0) | ||
Verifying file integrity... | ||
n2khab-v0.8.0.zip was downloaded and its integrity verified (md5sum: 25fb33360d257c085bce567da8f6a2cb) | ||
|
||
# download_zenodo() works for a multi-file record | ||
|
||
Code | ||
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir) | ||
Message | ||
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7) | ||
Verifying file integrity... | ||
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917) | ||
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff) | ||
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c) | ||
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99) | ||
|
||
# download_zenodo() can work sequentially for a multi-file record | ||
|
||
Code | ||
download_zenodo(doi = "10.5281/zenodo.4420858", path = zenodo_dir, parallel = FALSE) | ||
Message | ||
Will download 4 files (total size: 534.5 KiB) from https://doi.org/10.5281/zenodo.4420858 (Redistribution of the Natura 2000 habitat map of Flanders, partim habitat type 3260 (version 1.7); version: habitatstreams_v1.7) | ||
Verifying file integrity... | ||
habitatstreams.dbf was downloaded and its integrity verified (md5sum: f66ddddacc9511133cc02d8c1960a917) | ||
habitatstreams.shx was downloaded and its integrity verified (md5sum: e7725c8267ed671f3e5f09c5fcc68bff) | ||
habitatstreams.shp was downloaded and its integrity verified (md5sum: 5c94b58c9dc7809c4eeeaf660aa3323c) | ||
habitatstreams.prj was downloaded and its integrity verified (md5sum: f881f61a6c07741b58cb618d8bbb0b99) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
test_that("download_zenodo() works for a single-file record", {
  testthat::local_edition(3)
  # The record is fetched from zenodo.org: skip instead of failing when the
  # host cannot be reached.
  testthat::skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory that is cleaned up after the test.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.3784149", path = zenodo_dir)
  )
})
|
||
test_that("download_zenodo() works for a GitHub code record", {
  testthat::local_edition(3)
  # The record is fetched from zenodo.org: skip instead of failing when the
  # host cannot be reached.
  testthat::skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory that is cleaned up after the test.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(doi = "10.5281/zenodo.7335805", path = zenodo_dir)
  )
})
|
||
test_that("download_zenodo() works for a multi-file record", {
  testthat::local_edition(3)
  # The record is fetched from zenodo.org: skip instead of failing when the
  # host cannot be reached.
  testthat::skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory that is cleaned up after the test.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  # NOTE(review): `parallel` is left at its default (FALSE), so this runs the
  # same sequential path as the explicit `parallel = FALSE` test on the same
  # record -- confirm whether `parallel = TRUE` was intended here.
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir
    )
  )
})
|
||
test_that("download_zenodo() can work sequentially for a multi-file record", {
  testthat::local_edition(3)
  # The record is fetched from zenodo.org: skip instead of failing when the
  # host cannot be reached.
  testthat::skip_if_offline(host = "zenodo.org")
  # Download into a throw-away directory that is cleaned up after the test.
  zenodo_dir <- tempfile()
  withr::local_file(zenodo_dir)
  dir.create(zenodo_dir)
  expect_snapshot(
    download_zenodo(
      doi = "10.5281/zenodo.4420858",
      path = zenodo_dir,
      parallel = FALSE
    )
  )
})
|
||