Merge branch 'master' of https://github.com/cboettig/taxald

ropensci · Jun 28, 2018 · fa5114c · fa5114c
2 parents 67181bb + 6f2c270
commit fa5114c
Show file tree

Hide file tree

Showing 9 changed files with 142 additions and 81 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,4 @@
 ^\.manifest\.json$
 ^schema\.md$
 ^README\.Rmd$
+^LICENSE\.md$
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,5 @@ data/*
 *.sqlite
 *.sql
 *.sqlite.bz2
+.Rbuildignore
+taxadb
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,21 @@
+Package: taxald
+Version: 0.0.0.9000
+Title: Taxonomic Linked Data 
+Description: Creates a local database of many commonly used taxonomic authorities
+             and provides functions that can quickly query this data.  
+Authors@R: person("Carl", "Boettiger", , "cboettig@gmail.com", c("aut", "cre"))
+License: MIT + file LICENSE
+Encoding: UTF-8
+LazyData: true
+ByteCompile: true
+Imports: 
+    dplyr,
+    dbplyr,
+    DBI,
+    MonetDBLite,
+    arkdb,
+    piggyback,
+    fs
+Remotes: cboettig/arkdb, cboettig/piggyback
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 6.0.1.9000
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2018
+COPYRIGHT HOLDER: Carl Boettiger
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2018 Carl Boettiger
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/R/create_db.R b/R/create_db.R
@@ -0,0 +1,59 @@
+
+#' Create a local taxonomic database
+#'
+#' Downloads the compressed taxonomic tables and imports them into a local
+#' MonetDBLite database.
+#'
+#' @param path a location on your computer where the database should be
+#'  installed. By default, will install to `.taxald` in your home directory.
+#' @param authorities a character vector of authorities to be included in
+#'  the database. NOTE: currently unused; every downloaded table is imported.
+#' @return path where database has been installed (invisibly)
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#'   create_taxadb(tempdir(), authorities = "itis")
+#' }
+create_taxadb <- function(path = fs::path(fs::path_home(), ".taxald"),
+                          authorities = c("itis", "ncbi", "col", "tpl",
+                                          "gbif", "fb", "slb", "wiki")){
+  ## FIXME offer more fine-grained support over which authorities to install
+  ## FIXME include some messaging about the large downloads etc?
+
+  ## FIXME generate list of data files to download based on requested
+  ## authorities
+
+  ## FIXME eventually will pull from Zenodo, not piggyback
+  tmp <- tempdir()
+  data_dir <- fs::path(tmp, "data")
+  ## Clean up the downloaded files on exit, even if the import fails
+  on.exit(if (fs::dir_exists(data_dir)) fs::dir_delete(data_dir), add = TRUE)
+
+  piggyback::pb_download(dest = tmp, repo = "cboettig/taxald")
+
+  ## Read the archives from where they were downloaded (and are later
+  ## deleted), not from "data/" relative to the working directory
+  files <- fs::dir_ls(data_dir, glob = "*.tsv.bz2")
+
+  dbdir <- fs::dir_create(path)
+  con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
+  ## Always release the connection, even when the import errors
+  on.exit(DBI::dbDisconnect(con), add = TRUE)
+  db <- dbplyr::src_dbi(con)
+  arkdb::unark(files, db, lines = 1e6)
+
+  ## Set id as primary key in each table?
+  # tbls <- DBI::dbListTables(db$con)
+  # lapply(tbls, function(table)
+  # glue::glue("ALTER TABLE {table} ADD PRIMARY KEY ({key});",
+  #            table = table, key = "id"))
+
+  invisible(dbdir)
+}
+## Consider shipping the original database pre-compressed?
+
+#R.utils::bzip2("taxa.sqlite", remove = FALSE)
+## Set up database connection from compressed file
+#R.utils::bunzip2("taxa.sqlite.bz2", remove = FALSE)
+
diff --git a/R/taxa-db.R b/R/taxa-db.R
@@ -1,73 +1,44 @@
 
-## Or this totally nut-so approach will go straight into the database
-## library(sparklyr)
-## sparklyr::spark_install()
-#sc <- sparklyr::spark_connect("local")
-#spark_df <- sparklyr::spark_read_csv(sc, "taxa", "data/taxa.tsv.bz2", delimiter = "\\t")
+library(DBI)
+library(dplyr)
+library(MonetDBLite)
+#dbdir <- fs::dir_create("taxadb")
 
-#library(dplyr)
-#spark_df %>% right_join(data_frame(name = "Gadus morhua"), copy=TRUE)
+dbdir <- Sys.getenv("TAXALD_HOME", fs::path(fs::path_home(), ".taxald"))
+con <- dbConnect(MonetDBLite::MonetDBLite(), dbdir)
 
 
-library(DBI)
-library(RSQLite)
-library(dplyr)
-library(R.utils)
 
-## con <- dbConnect(RSQLite::SQLite(), ":memory:")
-## cars <- tibble::rownames_to_column(mtcars)
-## write_tsv(cars, "cars.tsv.bz2")
-## R.utils::bunzip2("cars.tsv.bz2")
+library(geiger)
+data(primates)
+df <- data_frame(name = gsub("_", " ", primates$phy$tip.label))
+
+out <- right_join(tbl(con, "col_taxonid"), df, copy = TRUE) %>% collect()
 
-## Manual chunking can(?) handle bzip2 streaming?
-## https://github.com/vimc/montagu-r/blob/4fe82fd29992635b30e637d5412312b0c5e3e38f/R/util.R#L48-L60
+out <- right_join(tbl(con, "ncbi_taxonid"), df, copy = TRUE) %>% collect() %>% arrange(name)
 
-## Read tsv chunked
-# read_tsv_chunked("cars.tsv", ...)
+# out <- right_join(tbl(con, "itis_taxonid"), df, copy = TRUE) %>% collect() %>% arrange(name)
 
-## SQLite is fragile about quotes, so need to provide a cleaner, quote-free taxa db
-## dbWriteTable(con, "cars", "cars.tsv", sep="\t")
 
-## Alternately, consider: `readr::read_tsv_chunked()`
+out <- right_join(tbl(con, "ncbi_long"), df, copy = TRUE) %>% collect() %>% arrange(name)
 
 
-## left_join(data.frame(name = "Gadus morhua"))
+## Support queries to preferred authority or multiple/all authorities 
+
+## Look up id given a species name
+## Look up a hierarchy given a species name or species id
+## Return all species names / species ids belonging to a higher level rank
+## 
+## Look up a scientific name at any rank level.
+## 
+## Look up ids for higher ranks (when available)
+## Look up synonyms
+## Crosswalk / compare taxonomy across authorities in common format
+
+
+## Install authorities in opt-in workflow
+## Install layout / formats in opt-in style?
 
-library(geiger)
-data(primates)
- df <- data_frame(name = gsub("_", " ", primates$phy$tip.label))
- ex <- taxa_join(df)
-#' 
-#' ## How many distinct matches did we get? 
-#' ex %>% select(name) %>% distinct()
-#' 
-#' ## How many of those have NCBII matches? 
-#' ex %>% select(id, name, rank) %>% distinct() %>% filter(grepl("^NCBI:", id))
-taxa_join <- function(df, dbname = "data/taxa.sql", collect = TRUE){
-  con <- dbConnect(RSQLite::SQLite(), dbname = dbname)
-  ## y is copied into x, so this is fast for most y
-  out <- inner_join(tbl(con, "taxa"), df, copy = TRUE)
-
-  if(collect)
-    out <- collect(out)
-
-  out 
-}
-
-
-taxa_id <- function(name = NULL, id = NULL, rank = NULL, partial_match){
-
-  con <- dbConnect(RSQLite::SQLite(), dbname="data/taxa.sql")
-  taxa <- tbl(con, "taxa") %>% 
-    select(id, name, rank) %>% 
-    distinct()
-
-  if(partial_match){
-    taxa %>% 
-      filter(name %like% paste0("%", name, "%"))
-  }
-  taxa %>% filter(name == name)
-}
 
 system.time({
   tbl(con, "taxa") %>% select(id, name, rank) %>% distinct()  %>% filter(name %like% "%Gadus%")

diff --git a/data-raw/create_db.R b/data-raw/create_db.R
diff --git a/schema.md b/schema.md
@@ -3,13 +3,20 @@
 For each authority:
 
 - `hierarchy` table, in which any valid rank is a column, and a taxon `id` is a unique key.
+(wide)
 
 - `taxonid` table, in which a taxonomic `id` is a unique key to a scientific name, and every taxonomic identifier defined by the authority is present in exactly one row.  Additional columns include the `name` and `rank` associated with that information.  
 
+  long %>% select(id, name, rank, date, type) %>% distinct() %>% filter(goodnames)
+
 - Optionally, a `hierarchy_long` table, with columns `id`, `path`, `path_id`, and `path_rank`, defining hierarchy connected to any given taxonomic `id`.  Unlike the "wide" format, this approach can associate un-ranked and duplicate rank names (such as multiple `superfamily` names found in NCBI).    
 
+long %>% select(id, path_id, path_name, path_rank) %>% distinct
+
 - Optionally, a `other_names` table, in which a column `other_names` includes any possible name except those names already given in the `taxonid` table (e.g. common names, synonyms, misspellings).   A column `name_type` indicates if name is a common name, A column `id` associates a name with a taxonomic identifer as a foreign key. An optional column `language` specifies the language of any common/vernacular name.
 
+
+
 ## Long format
 
 `<prefix>_wide`