data-raw now over at boettiger-lab/taxadb-cache
cboettig committed Feb 20, 2020
1 parent 723bd08 commit bf210e3
Showing 12 changed files with 285 additions and 278 deletions.
7 changes: 7 additions & 0 deletions data-raw/README.md
@@ -0,0 +1,7 @@
# data-raw

The scripts used to update the cache are now being maintained separately at
<https://github.com/boettiger-lab/taxadb-cache>

Please consult that repository for the latest scripts, functions, and metadata regarding the backend cache for `taxadb`.

58 changes: 35 additions & 23 deletions data-raw/col.R
@@ -1,20 +1,25 @@
library(dplyr)
library(readr)
library(forcats)
library(stringi)
source("data-raw/helper-routines.R")

# All snapshots available from: http://www.catalogueoflife.org/DCA_Export/archive.php

#' preprocess_col
#'
#' @param url Source of Catalogue of Life
#' @param output_paths paths where output will be written: must be two paths, one named dwc, one named common
#' @param dir working directory for downloads, will be tempdir() by default
#'
#' @export
#' @import stringi forcats readr dplyr
#' @importFrom methods className
#' @importFrom stats family setNames
#' @importFrom utils download.file untar unzip
#' @details NOTE: A list of all snapshots available from: http://www.catalogueoflife.org/DCA_Export/archive.php
preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Export/zip-fixed/",
2019,
"-annual.zip"),
output_paths = c(dwc = "2019/dwc_col.tsv.bz2",
common = "2019/common_col.tsv.bz2")){
common = "2019/common_col.tsv.bz2"),
dir = file.path(tempdir(), "col")){

dir = file.path(tempdir(), "col")

dir.create(dir, FALSE, FALSE)
download.file(url,
curl::curl_download(url,
file.path(dir, "col-annual.zip"))
unzip(file.path(dir, "col-annual.zip"), exdir=dir)

@@ -32,15 +37,14 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
select(taxonID, scientificName, acceptedNameUsageID, taxonomicStatus, taxonRank,
kingdom, phylum, class, order, family, genus, specificEpithet, infraspecificEpithet,
taxonConceptID, isExtinct, nameAccordingTo, namePublishedIn, scientificNameAuthorship)

taxa <- bind_rows(
taxa_tmp %>%
filter(!is.na(scientificNameAuthorship)) %>%
mutate(scientificName =
stri_trim(stri_replace_first_fixed(scientificName, scientificNameAuthorship, ""))),
taxa_tmp %>%
filter(is.na(scientificNameAuthorship))
)
)

## For accepted names, set acceptedNameUsageID to match taxonID, rather than NA
accepted <-
@@ -53,18 +57,18 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
filter(!is.na(acceptedNameUsageID)) %>%
select(taxonID, scientificName, acceptedNameUsageID, taxonomicStatus) %>%
left_join(accepted_heirarchy, by = c("acceptedNameUsageID" = "taxonID"))


# We drop unmapped synonyms, as they are not helpful


vernacular <- read_tsv(file.path(dir, "vernacular.txt"))
# First we create the separate common names table
comm_table <- vernacular %>%
select(taxonID, vernacularName, language) %>%
inner_join(bind_rows(accepted), by = "taxonID") %>%
mutate(taxonID = stringi::stri_paste("COL:", taxonID),
acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID))

acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID)
)

# Also add a common name to the master dwc table
# first English names,
@@ -79,22 +83,30 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
n_in_group(group_var = "taxonID", n = 1, wt = vernacularName) %>%
bind_rows(comm_eng) %>%
select(taxonID, vernacularName)

## stri_paste() propagates NAs, avoiding "<prefix>:NA" identifiers
## (see the short illustration after this file's diff).
## de_duplicate() drops cases where an accepted name is also listed as a synonym.
dwc_col <-
bind_rows(accepted, rest) %>%
left_join(comm_names, by = "taxonID") %>%
mutate(taxonID = stringi::stri_paste("COL:", taxonID),
acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID))


message("writing COL Output...\n")

write_tsv(dwc_col, output_paths[["dwc"]])
write_tsv(comm_table, output_paths[["common"]])

}

dir.create(dirname(output_paths["dwc"]), FALSE)
write_tsv(dwc_col, output_paths["dwc"])
write_tsv(comm_table, output_paths["common"])
#' @importFrom utils globalVariables

file_hash(output_paths)
}
globalVariables(c("taxonID", "scientificName", "kingdom", "phylum", "class", "order", "family", "genus", "species",
"taxonConceptID", "isExtinct", "specificEpithet", "infraspecificEpithet", "nameAccordingTo",
"namePublishedIn", "scientificNameAuthorship", "namePublishedIn", "vernacularName", "language",
"Species", "SpecCode", "Class", "SuperClass", "Family", "Order", "Genus", "Species", "type", "TaxonLevel",
"synonym", "name", "Language", "commonNames", "accepted_id", "synonym_id", "SynCode", "taxon",
"taxonomicStatus", "acceptedNameUsageID"))


#preprocess_col(year = "2019")
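A short illustration of the two stringi idioms used in this file (made-up name strings, not part of the commit): stri_replace_first_fixed() plus stri_trim() strips the authorship out of a scientificName, and stri_paste() propagates NA rather than pasting the literal string "NA" into an identifier.

library(stringi)

## Strip the authorship string, then trim the leftover whitespace:
stri_trim(stri_replace_first_fixed("Gadus morhua Linnaeus, 1758",
                                   "Linnaeus, 1758", ""))
#> "Gadus morhua"

## paste0() would mint a bogus "COL:NA" identifier for missing IDs;
## stri_paste() keeps the NA:
ids <- c("12345", NA)
paste0("COL:", ids)      #> "COL:12345" "COL:NA"
stri_paste("COL:", ids)  #> "COL:12345" NA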
6 changes: 1 addition & 5 deletions data-raw/fb.R
@@ -1,8 +1,4 @@
library(rfishbase) # 3.0
library(tidyverse)
library(stringi)
source(here::here("data-raw/helper-routines.R"))

#' @export
preprocess_fb <- function(output_paths = c(dwc = "2019/dwc_fb.tsv.bz2",
common = "2019/common_fb.tsv.bz2")
){
5 changes: 0 additions & 5 deletions data-raw/gbif.R
@@ -1,9 +1,4 @@
library(dplyr)
library(stringi)
library(readr)
source("data-raw/helper-routines.R")

## extracted from: https://doi.org/10.15468/39omei

preprocess_gbif <- function(url = "http://rs.gbif.org/datasets/backbone/backbone-current.zip",
output_paths = c(dwc = "2019/dwc_gbif.tsv.bz2",
1 change: 1 addition & 0 deletions data-raw/helper-routines.R
@@ -1,3 +1,4 @@
#' @importFrom rlang quo := !!
de_duplicate <- function(species){

## Note for further testing: commented code below
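The helper bodies are collapsed in this view. For orientation, here are minimal sketches of what de_duplicate(), n_in_group(), and file_hash() appear to do, inferred from their call sites in col.R and itis.R -- the real definitions in helper-routines.R may differ:

library(dplyr)

## Sketch: keep one row per scientific name, e.g. when an accepted name
## is also listed as a synonym.
de_duplicate_sketch <- function(species) {
  distinct(species, scientificName, .keep_all = TRUE)
}

## Sketch: keep at most n rows per group; col.R calls this to retain a
## single vernacularName per taxonID.
n_in_group_sketch <- function(df, group_var, n = 1, wt) {
  wt <- rlang::enquo(wt)
  df %>%
    group_by(.data[[group_var]]) %>%
    arrange(!!wt, .by_group = TRUE) %>%
    filter(row_number() <= n) %>%
    ungroup()
}

## Sketch: record an md5 checksum per output file (openssl is already
## loaded by these scripts).
file_hash_sketch <- function(paths) {
  vapply(paths,
         function(p) paste(as.character(openssl::md5(file(p))), collapse = ""),
         character(1))
}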
38 changes: 16 additions & 22 deletions data-raw/itis.R
@@ -1,10 +1,5 @@
library(tidyverse)
library(stringi)
library(piggyback)
library(RSQLite)
library(openssl)
source("data-raw/helper-routines.R")

#' @import arkdb RSQLite DBI readr dplyr stringr

preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip",
output_paths =
@@ -149,7 +144,7 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
de_duplicate()


dwc <- itis_taxonid %>%
dwc_core <- itis_taxonid %>%
rename(taxonID = id,
scientificName = name,
taxonRank = rank,
Expand All @@ -161,15 +156,12 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
specificEpithet = species
#infraspecificEpithet
),
by = c("acceptedNameUsageID" = "taxonID")) %>%
left_join(com_names %>%
select(vernacularName = vernacular_name, acceptedNameUsageID),
by = "acceptedNameUsageID") %>%
distinct()
by = c("acceptedNameUsageID" = "taxonID"))


species <- stringi::stri_extract_all_words(dwc$specificEpithet, simplify = TRUE)
dwc$specificEpithet <- species[,2]
dwc$infraspecificEpithet <- species[,3]
species <- stringi::stri_extract_all_words(dwc_core$specificEpithet, simplify = TRUE)
dwc_core$specificEpithet <- species[,2]
dwc_core$infraspecificEpithet <- species[,3]



@@ -196,20 +188,22 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
bind_rows(acc_common) %>%
distinct(acceptedNameUsageID, .keep_all = TRUE)


## add vernacular name
dwc <- dwc_core %>%
left_join(com_names %>%
select(vernacularName = vernacular_name, acceptedNameUsageID),
by = "acceptedNameUsageID") %>%
distinct()


## Common name table
common <- vern %>%
select(-approved_ind, -vern_id) %>%
inner_join(dwc %>% select(-vernacularName, -update_date)) %>%
rename(vernacularName = vernacular_name)



dir.create("dwc")
write_tsv(dwc, "dwc/dwc_itis.tsv.bz2")
write_tsv(common, "dwc/common_itis.tsv.bz2")



dir.create(dirname(output_paths["dwc"]), FALSE)
write_tsv(dwc, output_paths["dwc"])
write_tsv(common, output_paths["common"])
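The epithet split above relies on stri_extract_all_words(simplify = TRUE) returning a character matrix, padded with NA for names that lack a third word; a quick illustration with made-up binomial and trinomial names:

library(stringi)

species <- stri_extract_all_words(c("Gadus morhua",
                                    "Canis lupus familiaris"),
                                  simplify = TRUE)
species[, 2]  #> "morhua" "lupus"       -> specificEpithet
species[, 3]  #> NA       "familiaris"  -> infraspecificEpithet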
5 changes: 2 additions & 3 deletions data-raw/iucn.R
@@ -1,12 +1,11 @@
library(tidyverse)
library(fs)
library(vroom)
#' @importFrom vroom vroom

## Go to the Red List website, create a user, log in, and set search options to include taxonomy, synonyms, and common names
## Bird data is rate-limited -- filter for Passerine birds, & then for all other birds to get it in two separate downloads
## Then also download all other vertebrates & invertebrates
## Unzip all downloads to a location specified in path

## This approach is much faster & more thorough than the API approach taken in `redlist.R`

preprocess_iucn <- function(path = "~/Documents/data/redlist-downloads-2019-11-25",
output_paths = c(dwc = "2019/dwc_iucn.tsv.bz2",
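A minimal sketch of the read step those comments describe, assuming each unzipped Red List download contains a taxonomy.csv (the folder layout and file names here are hypothetical; vroom row-binds whatever files it is given):

library(fs)
library(vroom)

path <- "~/Documents/data/redlist-downloads-2019-11-25"
taxonomy_files <- dir_ls(path, recurse = TRUE, glob = "*taxonomy.csv")
taxonomy <- vroom(taxonomy_files)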
88 changes: 57 additions & 31 deletions data-raw/make.R
@@ -12,54 +12,80 @@ conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")
conflict_scout()

source(here("data-raw/helper-routines.R"))

## And here we go, loading individual routines
source(here("data-raw/gbif.R"))
source(here("data-raw/itis.R"))
source(here("data-raw/ncbi.R"))
source(here("data-raw/col.R"))
source(here("data-raw/fb.R"))
source(here("data-raw/slb.R"))
source(here("data-raw/ott.R"))
message(fs::path_wd())

devtools::load_all()
#plan <- drake_plan(

tag <- "2020"
dir.create(tag)
## 2020 annual not released yet
## col_source <- paste0("http://www.catalogueoflife.org/DCA_Export/zip-fixed/", tag, "-annual.zip")
col_source <- "http://www.catalogueoflife.org/DCA_Export/zip-fixed/2020-01-10-archive-complete.zip"
ott_source <- "http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"
gbif_source <- "http://rs.gbif.org/datasets/backbone/backbone-current.zip"
itis_source <- "https://www.itis.gov/downloads/itisSqlite.zip"
ncbi_source <- "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
#
# contenturi::register_remote(col_source)
# contenturi::register_remote(ott_source)
# contenturi::register_remote(gbif_source)
# contenturi::register_remote(itis_source)


message("FishBase...")
fb = preprocess_fb(output_paths = c(dwc = file.path(tag, "dwc_fb.tsv.bz2"),
common = file.path(tag, "common_fb.tsv.bz2")))

message("SeaLifeBase...")

slb = preprocess_slb(output_paths = c(dwc = file.path(tag, "dwc_slb.tsv.bz2"),
common = file.path(tag, "common_slb.tsv.bz2")))

message("OTT...")

ott = preprocess_ott(url = ott_source,
output_paths = c(dwc = file.path(tag, "dwc_ott.tsv.bz2")))

gbif = preprocess_gbif(url = file_in("http://rs.gbif.org/datasets/backbone/backbone-current.zip"),
output_paths = c(dwc = file_out("2020/dwc_gbif.tsv.bz2"),
common = file_out("2020/common_gbif.tsv.bz2")))

itis = preprocess_itis(url = file_in("https://www.itis.gov/downloads/itisSqlite.zip"),
output_paths = c(dwc = file_out("2020/dwc_itis.tsv.bz2"),
common = file_out("2020/common_itis.tsv.bz2")))
message("GBIF...")

ncbi = preprocess_ncbi(url = file_in("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"),
output_paths = c(dwc = file_out("2020/dwc_ncbi.tsv.bz2"),
common = file_out("2020/common_ncbi.tsv.bz2")))
gbif = preprocess_gbif(url = gbif_source,
output_paths = c(dwc = file.path(tag, "dwc_gbif.tsv.bz2"),
common = file.path(tag, "common_gbif.tsv.bz2")))

col = preprocess_col(url = file_in("http://www.catalogueoflife.org/DCA_Export/zip-fixed/2020-annual.zip"),
output_paths = c(dwc = file_out("2020/dwc_col.tsv.bz2"),
common = file_out("2020/common_col.tsv.bz2")))

message("ITIS...")

fb = preprocess_fb(output_paths = c(dwc = file_out("2020/dwc_fb.tsv.bz2"),
common = file_out("2020/common_fb.tsv.bz2")))
itis = preprocess_itis(url = itis_source,
output_paths = c(dwc = file.path(tag, "dwc_itis.tsv.bz2"),
common = file.path(tag, "common_itis.tsv.bz2")))

slb = preprocess_slb(output_paths = c(dwc = file_out("2020/dwc_slb.tsv.bz2"),
common = file_out("2020/common_slb.tsv.bz2")))

ott = preprocess_ott(url = file_in("http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"),
output_paths = c(dwc = file_out("2020/dwc_ott.tsv.bz2"),
common = file_out("2020/common_ott.tsv.bz2")))
message("NCBI...")

ncbi = preprocess_ncbi(url = ncbi_source,
output_paths = c(dwc = file.path(tag, "dwc_ncbi.tsv.bz2"),
common = file.path(tag, "common_ncbi.tsv.bz2")))



message("COL...")

col = preprocess_col(url = col_source,
output_paths = c(dwc = file.path(tag, "dwc_col.tsv.bz2"),
common = file.path(tag, "common_col.tsv.bz2")))


#)

# library(piggyback)
# setwd("2020"); fs::dir_ls("*.bz2") %>% pb_upload(repo = "boettiger-lab/taxadb-cache", tag = tag)

#library(pins)
#board_register_github(repo = "cboettig/pins-test", name = "github_pins_test")
#pin("dwc_gbif.tsv.bz2",
# board = "github_pins_test")
#pin("2020/dwc_gbif.tsv.bz2", board = "github_pins_test")


#config <- drake_config(plan)
#vis_drake_graph(config)
5 changes: 1 addition & 4 deletions data-raw/ncbi.R
@@ -1,8 +1,5 @@
library(tidyverse)
library(openssl)
source(here::here("data-raw/helper-routines.R"))


#' @export
preprocess_ncbi <- function(url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip",
output_paths =
c(dwc = "2019/dwc_ncbi.tsv.bz2",