data-raw now over at boettiger-lab/taxadb-cache
cboettig committed Feb 20, 2020
1 parent 723bd08 commit bf210e3
Showing 12 changed files with 285 additions and 278 deletions.
7 changes: 7 additions & 0 deletions data-raw/README.md
@@ -0,0 +1,7 @@
# data-raw

The scripts used to update the cache are now being maintained separately at
<https://github.com/boettiger-lab/taxadb-cache>

Please consult that repository for the latest scripts, functions, and metadata regarding the backend cache for `taxadb`.

58 changes: 35 additions & 23 deletions data-raw/col.R
@@ -1,20 +1,25 @@
library(dplyr)
library(readr)
library(forcats)
library(stringi)
source("data-raw/helper-routines.R")

# All snapshots available from: http://www.catalogueoflife.org/DCA_Export/archive.php

#' preprocess_col
#'
#' @param url Source of Catalogue of Life
#' @param output_paths paths where output will be written: must be two paths, one named dwc, one named common
#' @param dir working directory for downloads, will be tempdir() by default
#'
#' @export
#' @import stringi forcats readr dplyr
#' @importFrom methods className
#' @importFrom stats family setNames
#' @importFrom utils download.file untar unzip
#' @details NOTE: A list of all snapshots available from: http://www.catalogueoflife.org/DCA_Export/archive.php
preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Export/zip-fixed/",
2019,
"-annual.zip"),
output_paths = c(dwc = "2019/dwc_col.tsv.bz2",
common = "2019/common_col.tsv.bz2")){
common = "2019/common_col.tsv.bz2"),
dir = file.path(tempdir(), "col")){

dir = file.path(tempdir(), "col")

dir.create(dir, FALSE, FALSE)
download.file(url,
curl::curl_download(url,
file.path(dir, "col-annual.zip"))
unzip(file.path(dir, "col-annual.zip"), exdir=dir)

@@ -32,15 +37,14 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
select(taxonID, scientificName, acceptedNameUsageID, taxonomicStatus, taxonRank,
kingdom, phylum, class, order, family, genus, specificEpithet, infraspecificEpithet,
taxonConceptID, isExtinct, nameAccordingTo, namePublishedIn, scientificNameAuthorship)

taxa <- bind_rows(
taxa_tmp %>%
filter(!is.na(scientificNameAuthorship)) %>%
mutate(scientificName =
stri_trim(stri_replace_first_fixed(scientificName, scientificNameAuthorship, ""))),
taxa_tmp %>%
filter(is.na(scientificNameAuthorship))
)
)

## For accepted names, set acceptedNameUsageID to match taxonID, rather than NA
accepted <-
@@ -53,18 +57,18 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
filter(!is.na(acceptedNameUsageID)) %>%
select(taxonID, scientificName, acceptedNameUsageID, taxonomicStatus) %>%
left_join(accepted_heirarchy, by = c("acceptedNameUsageID" = "taxonID"))


# We drop unmapped synonyms, as they are not helpful


vernacular <- read_tsv(file.path(dir, "vernacular.txt"))
# First we create the separate common names table
comm_table <- vernacular %>%
select(taxonID, vernacularName, language) %>%
inner_join(bind_rows(accepted), by = "taxonID") %>%
mutate(taxonID = stringi::stri_paste("COL:", taxonID),
acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID))

acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID)
)

# Also add a common name to the master dwc table
# first English names,
@@ -79,22 +83,30 @@ preprocess_col <- function(url = paste0("http://www.catalogueoflife.org/DCA_Expo
n_in_group(group_var = "taxonID", n = 1, wt = vernacularName) %>%
bind_rows(comm_eng) %>%
select(taxonID, vernacularName)

## stri_paste() propagates NAs, avoiding "<prefix>:NA" identifiers
## (see the short illustration after this file's diff).
## de_duplicate() drops cases where an accepted name is also listed as a synonym.
dwc_col <-
bind_rows(accepted, rest) %>%
left_join(comm_names, by = "taxonID") %>%
mutate(taxonID = stringi::stri_paste("COL:", taxonID),
acceptedNameUsageID = stringi::stri_paste("COL:", acceptedNameUsageID))


message("writing COL Output...\n")

write_tsv(dwc_col, output_paths[["dwc"]])
write_tsv(comm_table, output_paths[["common"]])

}

dir.create(dirname(output_paths["dwc"]), FALSE)
write_tsv(dwc_col, output_paths["dwc"])
write_tsv(comm_table, output_paths["common"])
#' @importFrom utils globalVariables

file_hash(output_paths)
}
globalVariables(c("taxonID", "scientificName", "kingdom", "phylum", "class", "order", "family", "genus", "species",
"taxonConceptID", "isExtinct", "specificEpithet", "infraspecificEpithet", "nameAccordingTo",
"namePublishedIn", "scientificNameAuthorship", "namePublishedIn", "vernacularName", "language",
"Species", "SpecCode", "Class", "SuperClass", "Family", "Order", "Genus", "Species", "type", "TaxonLevel",
"synonym", "name", "Language", "commonNames", "accepted_id", "synonym_id", "SynCode", "taxon",
"taxonomicStatus", "acceptedNameUsageID"))


#preprocess_col(year = "2019")
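A short illustration of the two stringi idioms used in this file (made-up name strings, not part of the commit): stri_replace_first_fixed() plus stri_trim() strips the authorship out of a scientificName, and stri_paste() propagates NA rather than pasting the literal string "NA" into an identifier.

library(stringi)

## Strip the authorship string, then trim the leftover whitespace:
stri_trim(stri_replace_first_fixed("Gadus morhua Linnaeus, 1758",
                                   "Linnaeus, 1758", ""))
#> "Gadus morhua"

## paste0() would mint a bogus "COL:NA" identifier for missing IDs;
## stri_paste() keeps the NA:
ids <- c("12345", NA)
paste0("COL:", ids)      #> "COL:12345" "COL:NA"
stri_paste("COL:", ids)  #> "COL:12345" NA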
6 changes: 1 addition & 5 deletions data-raw/fb.R
@@ -1,8 +1,4 @@
library(rfishbase) # 3.0
library(tidyverse)
library(stringi)
source(here::here("data-raw/helper-routines.R"))

#' @export
preprocess_fb <- function(output_paths = c(dwc = "2019/dwc_fb.tsv.bz2",
common = "2019/common_fb.tsv.bz2")
){
5 changes: 0 additions & 5 deletions data-raw/gbif.R
@@ -1,9 +1,4 @@
library(dplyr)
library(stringi)
library(readr)
source("data-raw/helper-routines.R")

## extracted from: https://doi.org/10.15468/39omei

preprocess_gbif <- function(url = "http://rs.gbif.org/datasets/backbone/backbone-current.zip",
output_paths = c(dwc = "2019/dwc_gbif.tsv.bz2",
1 change: 1 addition & 0 deletions data-raw/helper-routines.R
@@ -1,3 +1,4 @@
#' @importFrom rlang quo := !!
de_duplicate <- function(species){

## Note for further testing: commented code below
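The helper bodies are collapsed in this view. For orientation, here are minimal sketches of what de_duplicate(), n_in_group(), and file_hash() appear to do, inferred from their call sites in col.R and itis.R -- the real definitions in helper-routines.R may differ:

library(dplyr)

## Sketch: keep one row per scientific name, e.g. when an accepted name
## is also listed as a synonym.
de_duplicate_sketch <- function(species) {
  distinct(species, scientificName, .keep_all = TRUE)
}

## Sketch: keep at most n rows per group; col.R calls this to retain a
## single vernacularName per taxonID.
n_in_group_sketch <- function(df, group_var, n = 1, wt) {
  wt <- rlang::enquo(wt)
  df %>%
    group_by(.data[[group_var]]) %>%
    arrange(!!wt, .by_group = TRUE) %>%
    filter(row_number() <= n) %>%
    ungroup()
}

## Sketch: record an md5 checksum per output file (openssl is already
## loaded by these scripts).
file_hash_sketch <- function(paths) {
  vapply(paths,
         function(p) paste(as.character(openssl::md5(file(p))), collapse = ""),
         character(1))
}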
38 changes: 16 additions & 22 deletions data-raw/itis.R
@@ -1,10 +1,5 @@
library(tidyverse)
library(stringi)
library(piggyback)
library(RSQLite)
library(openssl)
source("data-raw/helper-routines.R")

#' @import arkdb RSQLite DBI readr dplyr stringr

preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip",
output_paths =
@@ -149,7 +144,7 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
de_duplicate()


dwc <- itis_taxonid %>%
dwc_core <- itis_taxonid %>%
rename(taxonID = id,
scientificName = name,
taxonRank = rank,
Expand All @@ -161,15 +156,12 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
specificEpithet = species
#infraspecificEpithet
),
by = c("acceptedNameUsageID" = "taxonID")) %>%
left_join(com_names %>%
select(vernacularName = vernacular_name, acceptedNameUsageID),
by = "acceptedNameUsageID") %>%
distinct()
by = c("acceptedNameUsageID" = "taxonID"))


species <- stringi::stri_extract_all_words(dwc$specificEpithet, simplify = TRUE)
dwc$specificEpithet <- species[,2]
dwc$infraspecificEpithet <- species[,3]
species <- stringi::stri_extract_all_words(dwc_core$specificEpithet, simplify = TRUE)
dwc_core$specificEpithet <- species[,2]
dwc_core$infraspecificEpithet <- species[,3]



@@ -196,20 +188,22 @@ preprocess_itis <- function(url = "https://www.itis.gov/downloads/itisSqlite.zip
bind_rows(acc_common) %>%
distinct(acceptedNameUsageID, .keep_all = TRUE)


## add vernacular name
dwc <- dwc_core %>%
left_join(com_names %>%
select(vernacularName = vernacular_name, acceptedNameUsageID),
by = "acceptedNameUsageID") %>%
distinct()


## Common name table
common <- vern %>%
select(-approved_ind, -vern_id) %>%
inner_join(dwc %>% select(-vernacularName, -update_date)) %>%
rename(vernacularName = vernacular_name)



dir.create("dwc")
write_tsv(dwc, "dwc/dwc_itis.tsv.bz2")
write_tsv(common, "dwc/common_itis.tsv.bz2")



dir.create(dirname(output_paths["dwc"]), FALSE)
write_tsv(dwc, output_paths["dwc"])
write_tsv(common, output_paths["common"])
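The epithet split above relies on stri_extract_all_words(simplify = TRUE) returning a character matrix, padded with NA for names that lack a third word; a quick illustration with made-up binomial and trinomial names:

library(stringi)

species <- stri_extract_all_words(c("Gadus morhua",
                                    "Canis lupus familiaris"),
                                  simplify = TRUE)
species[, 2]  #> "morhua" "lupus"       -> specificEpithet
species[, 3]  #> NA       "familiaris"  -> infraspecificEpithet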
5 changes: 2 additions & 3 deletions data-raw/iucn.R
@@ -1,12 +1,11 @@
library(tidyverse)
library(fs)
library(vroom)
#' @importFrom vroom vroom

## Go to the Red List website, create a user, log in, and set search options to include taxonomy, synonyms, and common names
## Bird data is rate-limited -- filter for Passerine birds, & then for all other birds to get it in two separate downloads
## Then also download all other vertebrates & invertebrates
## Unzip all downloads to a location specified in path

## This approach is much faster & more thorough than the API approach taken in `redlist.R`

preprocess_iucn <- function(path = "~/Documents/data/redlist-downloads-2019-11-25",
output_paths = c(dwc = "2019/dwc_iucn.tsv.bz2",
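A minimal sketch of the read step those comments describe, assuming each unzipped Red List download contains a taxonomy.csv (the folder layout and file names here are hypothetical; vroom row-binds whatever files it is given):

library(fs)
library(vroom)

path <- "~/Documents/data/redlist-downloads-2019-11-25"
taxonomy_files <- dir_ls(path, recurse = TRUE, glob = "*taxonomy.csv")
taxonomy <- vroom(taxonomy_files)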
88 changes: 57 additions & 31 deletions data-raw/make.R
@@ -12,54 +12,80 @@ conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")
conflict_scout()

source(here("data-raw/helper-routines.R"))

## And here we go, loading individual routines
source(here("data-raw/gbif.R"))
source(here("data-raw/itis.R"))
source(here("data-raw/ncbi.R"))
source(here("data-raw/col.R"))
source(here("data-raw/fb.R"))
source(here("data-raw/slb.R"))
source(here("data-raw/ott.R"))
message(fs::path_wd())

devtools::load_all()
#plan <- drake_plan(

tag <- "2020"
dir.create(tag)
## 2020 annual not released yet
## col_source <- paste0("http://www.catalogueoflife.org/DCA_Export/zip-fixed/", tag, "-annual.zip")
col_source <- "http://www.catalogueoflife.org/DCA_Export/zip-fixed/2020-01-10-archive-complete.zip"
ott_source <- "http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"
gbif_source <- "http://rs.gbif.org/datasets/backbone/backbone-current.zip"
itis_source <- "https://www.itis.gov/downloads/itisSqlite.zip"
ncbi_source <- "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
#
# contenturi::register_remote(col_source)
# contenturi::register_remote(ott_source)
# contenturi::register_remote(gbif_source)
# contenturi::register_remote(itis_source)


message("FishBase...")
fb = preprocess_fb(output_paths = c(dwc = file.path(tag, "dwc_fb.tsv.bz2"),
common = file.path(tag, "common_fb.tsv.bz2")))

message("SeaLifeBase...")

slb = preprocess_slb(output_paths = c(dwc = file.path(tag, "dwc_slb.tsv.bz2"),
common = file.path(tag, "common_slb.tsv.bz2")))

message("OTT...")

ott = preprocess_ott(url = ott_source,
output_paths = c(dwc = file.path(tag, "dwc_ott.tsv.bz2")))

gbif = preprocess_gbif(url = file_in("http://rs.gbif.org/datasets/backbone/backbone-current.zip"),
output_paths = c(dwc = file_out("2020/dwc_gbif.tsv.bz2"),
common = file_out("2020/common_gbif.tsv.bz2")))

itis = preprocess_itis(url = file_in("https://www.itis.gov/downloads/itisSqlite.zip"),
output_paths = c(dwc = file_out("2020/dwc_itis.tsv.bz2"),
common = file_out("2020/common_itis.tsv.bz2")))
message("GBIF...")

ncbi = preprocess_ncbi(url = file_in("ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"),
output_paths = c(dwc = file_out("2020/dwc_ncbi.tsv.bz2"),
common = file_out("2020/common_ncbi.tsv.bz2")))
gbif = preprocess_gbif(url = gbif_source,
output_paths = c(dwc = file.path(tag, "dwc_gbif.tsv.bz2"),
common = file.path(tag, "common_gbif.tsv.bz2")))

col = preprocess_col(url = file_in("http://www.catalogueoflife.org/DCA_Export/zip-fixed/2020-annual.zip"),
output_paths = c(dwc = file_out("2020/dwc_col.tsv.bz2"),
common = file_out("2020/common_col.tsv.bz2")))

message("ITIS...")

fb = preprocess_fb(output_paths = c(dwc = file_out("2020/dwc_fb.tsv.bz2"),
common = file_out("2020/common_fb.tsv.bz2")))
itis = preprocess_itis(url = itis_source,
output_paths = c(dwc = file.path(tag, "dwc_itis.tsv.bz2"),
common = file.path(tag, "common_itis.tsv.bz2")))

slb = preprocess_slb(output_paths = c(dwc = file_out("2020/dwc_slb.tsv.bz2"),
common = file_out("2020/common_slb.tsv.bz2")))

ott = preprocess_ott(url = file_in("http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"),
output_paths = c(dwc = file_out("2020/dwc_ott.tsv.bz2"),
common = file_out("2020/common_ott.tsv.bz2")))
message("NCBI...")

ncbi = preprocess_ncbi(url = ncbi_source,
output_paths = c(dwc = file.path(tag, "dwc_ncbi.tsv.bz2"),
common = file.path(tag, "common_ncbi.tsv.bz2")))



message("COL...")

col = preprocess_col(url = col_source,
output_paths = c(dwc = file.path(tag, "dwc_col.tsv.bz2"),
common = file.path(tag, "common_col.tsv.bz2")))


#)

# library(piggyback)
# setwd("2020"); fs::dir_ls("*.bz2") %>% pb_upload(repo = "boettiger-lab/taxadb-cache", tag = tag)

#library(pins)
#board_register_github(repo = "cboettig/pins-test", name = "github_pins_test")
#pin("dwc_gbif.tsv.bz2",
# board = "github_pins_test")
#pin("2020/dwc_gbif.tsv.bz2", board = "github_pins_test")


#config <- drake_config(plan)
#vis_drake_graph(config)
5 changes: 1 addition & 4 deletions data-raw/ncbi.R
@@ -1,8 +1,5 @@
library(tidyverse)
library(openssl)
source(here::here("data-raw/helper-routines.R"))


#' @export
preprocess_ncbi <- function(url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip",
output_paths =
c(dwc = "2019/dwc_ncbi.tsv.bz2",